From 99b3b837855b987563bcfb397cf9ddd88262814b Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sun, 4 Sep 2022 23:17:13 +0800 Subject: pstore/zone: Use GFP_ATOMIC to allocate zone buffer There is a case found when triggering a panic_on_oom, pstore fails to dump kmsg. Because psz_kmsg_write_record can't get the new buffer. Handle this by using GFP_ATOMIC to allocate a buffer at lower watermark. Signed-off-by: Qiujun Huang Fixes: 335426c6dcdd ("pstore/zone: Provide way to skip "broken" zone for MTD devices") Cc: WeiXiong Liao Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/CAJRQjofRCF7wjrYmw3D7zd5QZnwHQq+F8U-mJDJ6NZ4bddYdLA@mail.gmail.com --- fs/pstore/zone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 017d0d4ad329..2770746bb7aa 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -761,7 +761,7 @@ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, /* avoid destroying old data, allocate a new one */ len = zone->buffer_size + sizeof(*zone->buffer); zone->oldbuf = zone->buffer; - zone->buffer = kzalloc(len, GFP_KERNEL); + zone->buffer = kzalloc(len, GFP_ATOMIC); if (!zone->buffer) { zone->buffer = zone->oldbuf; return -ENOMEM; -- cgit From d85644dc5cf4023bf1a325f476d58a16bcc0798b Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 6 Oct 2022 19:42:05 -0300 Subject: pstore: Improve error reporting in case of backend overlap The pstore infrastructure supports one single backend at a time; trying to load a another backend causes an error and displays a message, introduced on commit 0d7cd09a3dbb ("pstore: Improve register_pstore() error reporting"). Happens that this message is not really clear about the situation, also the current error returned (-EPERM) isn't accurate, whereas -EBUSY makes more sense. We have another place in the code that relies in the -EBUSY return for a similar check. So, make it consistent here by returning -EBUSY and using a similar message in both scenarios. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221006224212.569555-2-gpiccoli@igalia.com --- fs/pstore/platform.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 0c034ea39954..c32957e4b256 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -562,8 +562,9 @@ out: int pstore_register(struct pstore_info *psi) { if (backend && strcmp(backend, psi->name)) { - pr_warn("ignoring unexpected backend '%s'\n", psi->name); - return -EPERM; + pr_warn("backend '%s' already in use: ignoring '%s'\n", + backend, psi->name); + return -EBUSY; } /* Sanity check flags. */ -- cgit From 8f5de3fd38b7e64112017bb1630f9df6e1b3331d Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 6 Oct 2022 19:42:06 -0300 Subject: pstore: Expose kmsg_bytes as a module parameter Currently this tuning is only exposed as a filesystem option, but most Linux distros automatically mount pstore, hence changing this setting requires remounting it. Also, if that mount option wasn't explicitly set it doesn't show up in mount information, so users cannot check what is the current value of kmsg_bytes. Let's then expose it as a module parameter, allowing both user visibility at all times (even if not manually set) and also the possibility of setting that as a boot/module parameter. Signed-off-by: Guilherme G. 
Piccoli Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221006224212.569555-3-gpiccoli@igalia.com --- fs/pstore/platform.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c32957e4b256..be05090076ce 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -89,6 +89,11 @@ static char *compress = module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); +/* How much of the kernel log to snapshot */ +unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, ulong, 0444); +MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); + /* Compression parameters */ static struct crypto_comp *tfm; @@ -100,9 +105,6 @@ struct pstore_zbackend { static char *big_oops_buf; static size_t big_oops_buf_sz; -/* How much of the console log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; - void pstore_set_kmsg_bytes(int bytes) { kmsg_bytes = bytes; -- cgit From 6a14f1982e6b476c84938c9ca011d258a6bba24d Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 6 Oct 2022 19:42:07 -0300 Subject: pstore: Inform unregistered backend names as well Currently we only show the registered ones in the kernel log; users that change backend for some reason require first to unregister the current one, so let's confirm this operation with a message in the log. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221006224212.569555-4-gpiccoli@igalia.com --- fs/pstore/platform.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index be05090076ce..06c2c66af332 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -665,6 +665,8 @@ void pstore_unregister(struct pstore_info *psi) psinfo = NULL; kfree(backend); backend = NULL; + + pr_info("Unregistered %s as persistent store backend\n", psi->name); mutex_unlock(&psinfo_lock); } EXPORT_SYMBOL_GPL(pstore_unregister); -- cgit From 3219122b8cdd580f0793e803d4e80ecd7384cf17 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:08 -0700 Subject: pstore/ram: Consolidate kfree() paths There's no reason to keep separate kfree() paths: either all allocations succeeded, or not. Everything is torn down in the case of failure, so adjust the callers to reflect this. Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed-and-tested-by: Guilherme G. 
Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-2-keescook@chromium.org --- fs/pstore/ram.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index fefe3d391d3a..348820a3ce88 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -451,6 +451,12 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; + /* Free pmsg PRZ */ + persistent_ram_free(cxt->mprz); + + /* Free console PRZ */ + persistent_ram_free(cxt->cprz); + /* Free dump PRZs */ if (cxt->dprzs) { for (i = 0; i < cxt->max_dump_cnt; i++) @@ -772,12 +778,12 @@ static int ramoops_probe(struct platform_device *pdev) dump_mem_sz, cxt->record_size, &cxt->max_dump_cnt, 0, 0); if (err) - goto fail_out; + goto fail_init; err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr, cxt->console_size, 0); if (err) - goto fail_init_cprz; + goto fail_init; cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? nr_cpu_ids @@ -788,12 +794,12 @@ static int ramoops_probe(struct platform_device *pdev) (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? PRZ_FLAG_NO_LOCK : 0); if (err) - goto fail_init_fprz; + goto fail_init; err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); if (err) - goto fail_init_mprz; + goto fail_init; cxt->pstore.data = cxt; /* @@ -857,11 +863,7 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - persistent_ram_free(cxt->mprz); -fail_init_mprz: -fail_init_fprz: - persistent_ram_free(cxt->cprz); -fail_init_cprz: +fail_init: ramoops_free_przs(cxt); fail_out: return err; @@ -876,8 +878,6 @@ static int ramoops_remove(struct platform_device *pdev) kfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; - persistent_ram_free(cxt->mprz); - persistent_ram_free(cxt->cprz); ramoops_free_przs(cxt); return 0; -- cgit From 6daf4e82bd54ef004ace4dd6deed60a32c282a95 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:09 -0700 Subject: pstore/ram: Move pmsg init earlier Since the ftrace area can vary in size based on CPU count, move pmsg initialization earlier so it will have a stable location. Suggested-by: Paramjit Oberoi Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed: Guilherme G. Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-3-keescook@chromium.org --- fs/pstore/ram.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 348820a3ce88..2f18563c8141 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -785,6 +785,11 @@ static int ramoops_probe(struct platform_device *pdev) if (err) goto fail_init; + err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, + cxt->pmsg_size, 0); + if (err) + goto fail_init; + cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? nr_cpu_ids : 1; @@ -796,11 +801,6 @@ static int ramoops_probe(struct platform_device *pdev) if (err) goto fail_init; - err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, - cxt->pmsg_size, 0); - if (err) - goto fail_init; - cxt->pstore.data = cxt; /* * Prepare frontend flags based on which areas are initialized. -- cgit From 8bd4da0f0626ae9a82099d3d99cd6efd4355879b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:10 -0700 Subject: pstore/ram: Move internal definitions out of kernel-wide include Most of the details of the ram backend are entirely internal to the backend itself. 
Leave only what is needed to instantiate a ram backend in the kernel-wide header. Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed-and-tested-by: Guilherme G. Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-4-keescook@chromium.org --- fs/pstore/ram.c | 3 +- fs/pstore/ram_core.c | 3 +- fs/pstore/ram_internal.h | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 fs/pstore/ram_internal.h (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2f18563c8141..f5bf360cf905 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -18,10 +18,11 @@ #include #include #include -#include #include #include + #include "internal.h" +#include "ram_internal.h" #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index a89e33719fcf..9e1047f4316d 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -13,13 +13,14 @@ #include #include #include -#include #include #include #include #include #include +#include "ram_internal.h" + /** * struct persistent_ram_buffer - persistent circular RAM buffer * diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h new file mode 100644 index 000000000000..440ee7a35e10 --- /dev/null +++ b/fs/pstore/ram_internal.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2010 Marco Stornelli + * Copyright (C) 2011 Kees Cook + * Copyright (C) 2011 Google, Inc. + */ + +#include + +/* + * Choose whether access to the RAM zone requires locking or not. If a zone + * can be written to from different CPUs like with ftrace for example, then + * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required. + */ +#define PRZ_FLAG_NO_LOCK BIT(0) +/* + * If a PRZ should only have a single-boot lifetime, this marks it as + * getting wiped after its contents get copied out after boot. + */ +#define PRZ_FLAG_ZAP_OLD BIT(1) + +/** + * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ) + * used as a pstore backend + * + * @paddr: physical address of the mapped RAM area + * @size: size of mapping + * @label: unique name of this PRZ + * @type: frontend type for this PRZ + * @flags: holds PRZ_FLAGS_* bits + * + * @buffer_lock: + * locks access to @buffer "size" bytes and "start" offset + * @buffer: + * pointer to actual RAM area managed by this PRZ + * @buffer_size: + * bytes in @buffer->data (not including any trailing ECC bytes) + * + * @par_buffer: + * pointer into @buffer->data containing ECC bytes for @buffer->data + * @par_header: + * pointer into @buffer->data containing ECC bytes for @buffer header + * (i.e. 
all fields up to @data) + * @rs_decoder: + * RSLIB instance for doing ECC calculations + * @corrected_bytes: + * ECC corrected bytes accounting since boot + * @bad_blocks: + * ECC uncorrectable bytes accounting since boot + * @ecc_info: + * ECC configuration details + * + * @old_log: + * saved copy of @buffer->data prior to most recent wipe + * @old_log_size: + * bytes contained in @old_log + * + */ +struct persistent_ram_zone { + phys_addr_t paddr; + size_t size; + void *vaddr; + char *label; + enum pstore_type_id type; + u32 flags; + + raw_spinlock_t buffer_lock; + struct persistent_ram_buffer *buffer; + size_t buffer_size; + + char *par_buffer; + char *par_header; + struct rs_control *rs_decoder; + int corrected_bytes; + int bad_blocks; + struct persistent_ram_ecc_info ecc_info; + + char *old_log; + size_t old_log_size; +}; + +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, struct persistent_ram_ecc_info *ecc_info, + unsigned int memtype, u32 flags, char *label); +void persistent_ram_free(struct persistent_ram_zone *prz); +void persistent_ram_zap(struct persistent_ram_zone *prz); + +int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); + +void persistent_ram_save_old(struct persistent_ram_zone *prz); +size_t persistent_ram_old_size(struct persistent_ram_zone *prz); +void *persistent_ram_old(struct persistent_ram_zone *prz); +void persistent_ram_free_old(struct persistent_ram_zone *prz); +ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, + char *str, size_t len); -- cgit From 11c2a8700cdcabf9b639b7204a1e38e2a0b6798e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:34 +0200 Subject: attr: add in_group_or_capable() In setattr_{copy,prepare}() we need to perform the same permission checks to determine whether we need to drop the setgid bit or not. Instead of open-coding it twice add a simple helper the encapsulates the logic. We will reuse this helpers to make dropping the setgid bit during write operations more consistent in a follow up patch. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/attr.c | 10 +++++----- fs/inode.c | 28 ++++++++++++++++++++++++---- fs/internal.h | 2 ++ 3 files changed, 31 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 1552a5f23d6b..b1162fca84a2 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,6 +18,8 @@ #include #include +#include "internal.h" + /** * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from @@ -140,8 +142,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* Also check the setgid bit! 
*/ - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_or_capable(mnt_userns, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } @@ -251,9 +252,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_or_capable(mnt_userns, inode, + i_gid_into_vfsgid(mnt_userns, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } diff --git a/fs/inode.c b/fs/inode.c index b608528efd3a..55299b710c45 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2487,6 +2487,28 @@ struct timespec64 current_time(struct inode *inode) } EXPORT_SYMBOL(current_time); +/** + * in_group_or_capable - check whether caller is CAP_FSETID privileged + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * @vfsgid: the new/current vfsgid of @inode + * + * Check wether @vfsgid is in the caller's group list or if the caller is + * privileged with CAP_FSETID over @inode. This can be used to determine + * whether the setgid bit can be kept or must be dropped. + * + * Return: true if the caller is sufficiently privileged, false if not. + */ +bool in_group_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode, vfsgid_t vfsgid) +{ + if (vfsgid_in_group_p(vfsgid)) + return true; + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + return true; + return false; +} + /** * mode_strip_sgid - handle the sgid bit for non-directories * @mnt_userns: User namespace of the mount the inode was created from @@ -2508,11 +2530,9 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns, return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; - if (in_group_p(i_gid_into_mnt(mnt_userns, dir))) - return mode; - if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) + if (in_group_or_capable(mnt_userns, dir, + i_gid_into_vfsgid(mnt_userns, dir))) return mode; - return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); diff --git a/fs/internal.h b/fs/internal.h index 6f0386b34fae..1de39bbc9ddd 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -151,6 +151,8 @@ extern int vfs_open(const struct path *, struct file *); */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern int dentry_needs_remove_privs(struct dentry *dentry); +bool in_group_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode, vfsgid_t vfsgid); /* * fs-writeback.c -- cgit From e243e3f94c804ecca9a8241b5babe28f35258ef4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:35 +0200 Subject: fs: move should_remove_suid() Move the helper from inode.c to attr.c. This keeps the the core of the set{g,u}id stripping logic in one place when we add follow-up changes. It is the better place anyway, since should_remove_suid() returns ATTR_KILL_S{G,U}ID flags. 
Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/attr.c | 29 +++++++++++++++++++++++++++++ fs/inode.c | 29 ----------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index b1162fca84a2..e508b3caae76 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -20,6 +20,35 @@ #include "internal.h" +/* + * The logic we want is + * + * if suid or (sgid and xgrp) + * remove privs + */ +int should_remove_suid(struct dentry *dentry) +{ + umode_t mode = d_inode(dentry)->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + /** * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from diff --git a/fs/inode.c b/fs/inode.c index 55299b710c45..6df2b7c936c2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1948,35 +1948,6 @@ skip_update: } EXPORT_SYMBOL(touch_atime); -/* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - umode_t mode = d_inode(dentry)->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - /* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. -- cgit From 72ae017c5451860443a16fb2a8c243bff3e396b8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:36 +0200 Subject: attr: add setattr_should_drop_sgid() The current setgid stripping logic during write and ownership change operations is inconsistent and strewn over multiple places. In order to consolidate it and make more consistent we'll add a new helper setattr_should_drop_sgid(). The function retains the old behavior where we remove the S_ISGID bit unconditionally when S_IXGRP is set but also when it isn't set and the caller is neither in the group of the inode nor privileged over the inode. We will use this helper both in write operation permission removal such as file_remove_privs() as well as in ownership change operations. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/attr.c | 28 ++++++++++++++++++++++++++++ fs/internal.h | 6 ++++++ 2 files changed, 34 insertions(+) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index e508b3caae76..085322536127 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -20,6 +20,34 @@ #include "internal.h" +/** + * setattr_should_drop_sgid - determine whether the setgid bit needs to be + * removed + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * + * This function determines whether the setgid bit needs to be removed. 
+ * We retain backwards compatibility and require setgid bit to be removed + * unconditionally if S_IXGRP is set. Otherwise we have the exact same + * requirements as setattr_prepare() and setattr_copy(). + * + * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. + */ +int setattr_should_drop_sgid(struct user_namespace *mnt_userns, + const struct inode *inode) +{ + umode_t mode = inode->i_mode; + + if (!(mode & S_ISGID)) + return 0; + if (mode & S_IXGRP) + return ATTR_KILL_SGID; + if (!in_group_or_capable(mnt_userns, inode, + i_gid_into_vfsgid(mnt_userns, inode))) + return ATTR_KILL_SGID; + return 0; +} + /* * The logic we want is * diff --git a/fs/internal.h b/fs/internal.h index 1de39bbc9ddd..771b0468d70c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -236,3 +236,9 @@ int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); + +/* + * fs/attr.c + */ +int setattr_should_drop_sgid(struct user_namespace *mnt_userns, + const struct inode *inode); -- cgit From ed5a7047d2011cb6b2bf84ceb6680124cc6a7d95 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:37 +0200 Subject: attr: use consistent sgid stripping checks Currently setgid stripping in file_remove_privs()'s should_remove_suid() helper is inconsistent with other parts of the vfs. Specifically, it only raises ATTR_KILL_SGID if the inode is S_ISGID and S_IXGRP but not if the inode isn't in the caller's groups and the caller isn't privileged over the inode although we require this already in setattr_prepare() and setattr_copy() and so all filesystem implement this requirement implicitly because they have to use setattr_{prepare,copy}() anyway. But the inconsistency shows up in setgid stripping bugs for overlayfs in xfstests (e.g., generic/673, generic/683, generic/685, generic/686, generic/687). For example, we test whether suid and setgid stripping works correctly when performing various write-like operations as an unprivileged user (fallocate, reflink, write, etc.): echo "Test 1 - qa_user, non-exec file $verb" setup_testfile chmod a+rws $junk_file commit_and_check "$qa_user" "$verb" 64k 64k The test basically creates a file with 6666 permissions. While the file has the S_ISUID and S_ISGID bits set it does not have the S_IXGRP set. On a regular filesystem like xfs what will happen is: sys_fallocate() -> vfs_fallocate() -> xfs_file_fallocate() -> file_modified() -> __file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = ATTR_FORCE | kill; -> notify_change() -> setattr_copy() In should_remove_suid() we can see that ATTR_KILL_SUID is raised unconditionally because the file in the test has S_ISUID set. But we also see that ATTR_KILL_SGID won't be set because while the file is S_ISGID it is not S_IXGRP (see above) which is a condition for ATTR_KILL_SGID being raised. So by the time we call notify_change() we have attr->ia_valid set to ATTR_KILL_SUID | ATTR_FORCE. Now notify_change() sees that ATTR_KILL_SUID is set and does: ia_valid = attr->ia_valid |= ATTR_MODE attr->ia_mode = (inode->i_mode & ~S_ISUID); which means that when we call setattr_copy() later we will definitely update inode->i_mode. Note that attr->ia_mode still contains S_ISGID. Now we call into the filesystem's ->setattr() inode operation which will end up calling setattr_copy(). 
Since ATTR_MODE is set we will hit: if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } and since the caller in the test is neither capable nor in the group of the inode the S_ISGID bit is stripped. But assume the file isn't suid then ATTR_KILL_SUID won't be raised which has the consequence that neither the setgid nor the suid bits are stripped even though it should be stripped because the inode isn't in the caller's groups and the caller isn't privileged over the inode. If overlayfs is in the mix things become a bit more complicated and the bug shows up more clearly. When e.g., ovl_setattr() is hit from ovl_fallocate()'s call to file_remove_privs() then ATTR_KILL_SUID and ATTR_KILL_SGID might be raised but because the check in notify_change() is questioning the ATTR_KILL_SGID flag again by requiring S_IXGRP for it to be stripped the S_ISGID bit isn't removed even though it should be stripped: sys_fallocate() -> vfs_fallocate() -> ovl_fallocate() -> file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = ATTR_FORCE | kill; -> notify_change() -> ovl_setattr() // TAKE ON MOUNTER'S CREDS -> ovl_do_notify_change() -> notify_change() // GIVE UP MOUNTER'S CREDS // TAKE ON MOUNTER'S CREDS -> vfs_fallocate() -> xfs_file_fallocate() -> file_modified() -> __file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = attr_force | kill; -> notify_change() The fix for all of this is to make file_remove_privs()'s should_remove_suid() helper to perform the same checks as we already require in setattr_prepare() and setattr_copy() and have notify_change() not pointlessly requiring S_IXGRP again. It doesn't make any sense in the first place because the caller must calculate the flags via should_remove_suid() anyway which would raise ATTR_KILL_SGID. While we're at it we move should_remove_suid() from inode.c to attr.c where it belongs with the rest of the iattr helpers. Especially since it returns ATTR_KILL_S{G,U}ID flags. We also rename it to setattr_should_drop_suidgid() to better reflect that it indicates both setuid and setgid bit removal and also that it returns attr flags. Running xfstests with this doesn't report any regressions. We should really try and use consistent checks. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/attr.c | 33 +++++++++++++++++++-------------- fs/fuse/file.c | 2 +- fs/inode.c | 7 ++++--- fs/internal.h | 2 +- fs/ocfs2/file.c | 4 ++-- fs/open.c | 8 ++++---- 6 files changed, 31 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 085322536127..b45f30e516fa 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -48,34 +48,39 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, return 0; } -/* - * The logic we want is +/** + * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to + * be dropped + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check * - * if suid or (sgid and xgrp) - * remove privs + * This function determines whether the set{g,u}id bits need to be removed. + * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the + * setgid bit needs to be removed ATTR_KILL_SGID is returned. 
If both + * set{g,u}id bits need to be removed the corresponding mask of both flags is + * returned. + * + * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits + * to remove, 0 otherwise. */ -int should_remove_suid(struct dentry *dentry) +int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, + struct inode *inode) { - umode_t mode = d_inode(dentry)->i_mode; + umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; + kill |= setattr_should_drop_sgid(mnt_userns, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; } -EXPORT_SYMBOL(should_remove_suid); +EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode @@ -432,7 +437,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, } } if (ia_valid & ATTR_KILL_SGID) { - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1a3afd469e3a..97e2d815075d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1313,7 +1313,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - should_remove_suid(file_dentry(file))) { + setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { goto writethrough; } diff --git a/fs/inode.c b/fs/inode.c index 6df2b7c936c2..8c4078889754 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1953,7 +1953,8 @@ EXPORT_SYMBOL(touch_atime); * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). 
*/ -int dentry_needs_remove_privs(struct dentry *dentry) +int dentry_needs_remove_privs(struct user_namespace *mnt_userns, + struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; @@ -1962,7 +1963,7 @@ int dentry_needs_remove_privs(struct dentry *dentry) if (IS_NOSEC(inode)) return 0; - mask = should_remove_suid(dentry); + mask = setattr_should_drop_suidgid(mnt_userns, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; @@ -1994,7 +1995,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; - kill = dentry_needs_remove_privs(dentry); + kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry); if (kill < 0) return kill; diff --git a/fs/internal.h b/fs/internal.h index 771b0468d70c..5545c26d86ae 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -150,7 +150,7 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -extern int dentry_needs_remove_privs(struct dentry *dentry); +int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry); bool in_group_or_capable(struct user_namespace *mnt_userns, const struct inode *inode, vfsgid_t vfsgid); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9c67edd215d5..4d78e0979517 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && should_remove_suid(file->f_path.dentry)) { + if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ - if (should_remove_suid(dentry)) { + if (setattr_should_drop_suidgid(&init_user_ns, inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, diff --git a/fs/open.c b/fs/open.c index a81319b6177f..9d0197db15e7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -54,7 +54,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Remove suid, sgid, and file capabilities on truncate too */ - ret = dentry_needs_remove_privs(dentry); + ret = dentry_needs_remove_privs(mnt_userns, dentry); if (ret < 0) return ret; if (ret) @@ -723,10 +723,10 @@ retry_deleg: return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; - if (!S_ISDIR(inode->i_mode)) - newattrs.ia_valid |= - ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | + setattr_should_drop_sgid(mnt_userns, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, -- cgit From b306e90ffabdaa7e3b3350dbcd19b7663e71ab17 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 17 Oct 2022 17:06:38 +0200 Subject: ovl: remove privs in ovl_copyfile() Underlying fs doesn't remove privs because copy_range/remap_range are called with privileged mounter credentials. This fixes some failures in fstest generic/673. 
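To make the ordering concrete, here is a condensed sketch of the flow the patch below establishes (simplified; ovl_do_copyfile() is a hypothetical stand-in for the per-op dispatch): privileges must be stripped while still running with the caller's credentials, because once ovl_override_creds() switches to the mounter's credentials the underlying filesystem sees CAP_FSETID and skips the stripping itself.

	inode_lock(inode_out);
	if (op != OVL_DEDUPE) {
		/*
		 * Refresh the mode from the real inode, then drop
		 * suid/sgid/caps with the caller's (unprivileged) creds.
		 */
		ovl_copyattr(inode_out);
		ret = file_remove_privs(file_out);
		if (ret)
			goto out_unlock;
	}
	old_cred = ovl_override_creds(file_inode(file_out)->i_sb); /* privileged */
	ret = ovl_do_copyfile(&real_in, pos_in, &real_out, pos_out, len, op);
	revert_creds(old_cred);
out_unlock:
	inode_unlock(inode_out);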
Fixes: 8ede205541ff ("ovl: add reflink/copyfile/dedup support") Acked-by: Miklos Szeredi Signed-off-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/file.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a1a22f58ba18..755a11c63596 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -567,14 +567,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, const struct cred *old_cred; loff_t ret; + inode_lock(inode_out); + if (op != OVL_DEDUPE) { + /* Update mode */ + ovl_copyattr(inode_out); + ret = file_remove_privs(file_out); + if (ret) + goto out_unlock; + } + ret = ovl_real_fdget(file_out, &real_out); if (ret) - return ret; + goto out_unlock; ret = ovl_real_fdget(file_in, &real_in); if (ret) { fdput(real_out); - return ret; + goto out_unlock; } old_cred = ovl_override_creds(file_inode(file_out)->i_sb); @@ -603,6 +612,9 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, fdput(real_in); fdput(real_out); +out_unlock: + inode_unlock(inode_out); + return ret; } -- cgit From 23a8ce16419a3066829ad4a8b7032a75817af65b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 17 Oct 2022 17:06:39 +0200 Subject: ovl: remove privs in ovl_fallocate() Underlying fs doesn't remove privs because fallocate is called with privileged mounter credentials. This fixes some failure in fstests generic/683..687. Fixes: aab8848cee5e ("ovl: add ovl_fallocate()") Acked-by: Miklos Szeredi Signed-off-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 755a11c63596..d066be3b9226 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -517,9 +517,16 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len const struct cred *old_cred; int ret; + inode_lock(inode); + /* Update mode */ + ovl_copyattr(inode); + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + ret = ovl_real_fdget(file, &real); if (ret) - return ret; + goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fallocate(real.file, mode, offset, len); @@ -530,6 +537,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len fdput(real); +out_unlock: + inode_unlock(inode); + return ret; } -- cgit From 898f706695682b9954f280d95e49fa86ffa55d08 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 18 Oct 2022 08:48:07 -0500 Subject: fs: jfs: fix shift-out-of-bounds in dbAllocAG Syzbot found a crash : UBSAN: shift-out-of-bounds in dbAllocAG. The underlying bug is the missing check of bmp->db_agl2size. The field can be greater than 64 and trigger the shift-out-of-bounds. Fix this bug by adding a check of bmp->db_agl2size in dbMount since this field is used in many following functions. The upper bound for this field is L2MAXL2SIZE - L2MAXAG, thanks for the help of Dave Kleikamp. Note that, for maintenance, I reorganized error handling code of dbMount. 
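For context on the crash class: left-shifting a 64-bit value by 64 or more is undefined behaviour in C, which is exactly what UBSAN reports when the unchecked on-disk field is used. A minimal illustration (not actual jfs code):

	s64 blkno = 1;
	int db_agl2size = 65;	/* corrupted value read straight from disk */
	blkno <<= db_agl2size;	/* UB: shift count >= width of s64; UBSAN trips here */

The check added below rejects such a value at mount time, bounding it by L2MAXL2SIZE - L2MAXAG before any shift can be reached.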
Reported-by: syzbot+15342c1aa6a00fb7a438@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 6b838d3ae7c2..e1cbfbb60303 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -155,7 +155,7 @@ int dbMount(struct inode *ipbmap) struct bmap *bmp; struct dbmap_disk *dbmp_le; struct metapage *mp; - int i; + int i, err; /* * allocate/initialize the in-memory bmap descriptor @@ -170,8 +170,8 @@ int dbMount(struct inode *ipbmap) BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, PSIZE, 0); if (mp == NULL) { - kfree(bmp); - return -EIO; + err = -EIO; + goto err_kfree_bmp; } /* copy the on-disk bmap descriptor to its in-memory version. */ @@ -181,9 +181,8 @@ int dbMount(struct inode *ipbmap) bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); if (!bmp->db_numag) { - release_metapage(mp); - kfree(bmp); - return -EINVAL; + err = -EINVAL; + goto err_release_metapage; } bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); @@ -194,6 +193,11 @@ int dbMount(struct inode *ipbmap) bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) { + err = -EINVAL; + goto err_release_metapage; + } + for (i = 0; i < MAXAG; i++) bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); @@ -214,6 +218,12 @@ int dbMount(struct inode *ipbmap) BMAP_LOCK_INIT(bmp); return (0); + +err_release_metapage: + release_metapage(mp); +err_kfree_bmp: + kfree(bmp); + return err; } -- cgit From 73c6da327ff17a07a9260b0ff1f36e13665ac63d Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Thu, 14 Jul 2022 10:56:56 +0800 Subject: fs/jfs: replace ternary operator with min_t() Fix the following coccicheck warning: fs/jfs/super.c:748: WARNING opportunity for min(). fs/jfs/super.c:788: WARNING opportunity for min(). min_t() macro is defined in include/linux/minmax.h. It avoids multiple evaluations of the arguments when non-constant and performs strict type-checking. Signed-off-by: Jiangshan Yi Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 85d4f44f2ac4..d2f82cb7db1b 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -745,8 +745,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); @@ -785,8 +784,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, inode_lock(inode); while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); -- cgit From b0a35efa0ebc50032ab81d60ed5a12de4324c5e2 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Fri, 2 Sep 2022 16:53:38 +0800 Subject: fs/jfs/jfs_xattr.h: Fix spelling typo in comment Fix spelling typo in comment. 
Reported-by: k2ci Signed-off-by: Jiangshan Yi Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_xattr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index c50167a7bc50..0d33816d251d 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -25,7 +25,7 @@ struct jfs_ea_list { struct jfs_ea ea[]; /* Variable length list */ }; -/* Macros for defining maxiumum number of bytes supported for EAs */ +/* Macros for defining maximum number of bytes supported for EAs */ #define MAXEASIZE 65535 #define MAXEALISTSIZE MAXEASIZE -- cgit From 1ea66d71b17608b49ddb907f6155bc47c6e33506 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 9 Sep 2022 14:50:55 +0800 Subject: jfs: remove unused declarations for jfs extRealloc(), xtRelocate(), xtDelete() and extFill() have been removed since commit e471e5942c00 ("fs/jfs: Remove dead code"), so remove them. Signed-off-by: Gaosheng Cui Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_extent.h | 2 -- fs/jfs/jfs_xtree.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h index 1c984214e95e..a0ee4ccea66e 100644 --- a/fs/jfs/jfs_extent.h +++ b/fs/jfs/jfs_extent.h @@ -10,9 +10,7 @@ (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) extern int extAlloc(struct inode *, s64, s64, xad_t *, bool); -extern int extFill(struct inode *, xad_t *); extern int extHint(struct inode *, s64, xad_t *); -extern int extRealloc(struct inode *, s64, xad_t *, bool); extern int extRecord(struct inode *, xad_t *); #endif /* _H_JFS_EXTENT */ diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index 142caafc73b1..ad7592191d76 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -96,12 +96,8 @@ extern int xtInsert(tid_t tid, struct inode *ip, extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, int flag); extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); -extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, - int flag); extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); -extern int xtRelocate(tid_t tid, struct inode *ip, - xad_t * oxad, s64 nxaddr, int xtype); extern int xtAppend(tid_t tid, struct inode *ip, int xflag, s64 xoff, int maxblocks, int *xlenp, s64 * xaddrp, int flag); -- cgit From dee8744524096514109a9680a9c45a4259dae2a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 18 Oct 2022 16:23:10 +0100 Subject: jfs: remove redundant assignments to ipaimap and ipaimap2 The pointers ipaimap and ipaimap2 are re-assigned with values a second time with the same values when they were initialized. The re-assignments are redundant and can be removed. 
Cleans up two clang scan build warnings: fs/jfs/jfs_umount.c:42:16: warning: Value stored to 'ipaimap' during its initialization is never read [deadcode.DeadStores] fs/jfs/jfs_umount.c:43:16: warning: Value stored to 'ipaimap2' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_umount.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 3e8b13e6aa01..95ebcd17ce75 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -68,7 +68,6 @@ int jfs_umount(struct super_block *sb) /* * close secondary aggregate inode allocation map */ - ipaimap2 = sbi->ipaimap2; if (ipaimap2) { diUnmount(ipaimap2, 0); diFreeSpecial(ipaimap2); @@ -78,7 +77,6 @@ int jfs_umount(struct super_block *sb) /* * close aggregate inode allocation map */ - ipaimap = sbi->ipaimap; diUnmount(ipaimap, 0); diFreeSpecial(ipaimap); sbi->ipaimap = NULL; -- cgit From 3350607dc5637be2563f484dcfe2fed456f3d4ff Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Tue, 18 Oct 2022 20:22:06 +0200 Subject: security: Create file_truncate hook from path_truncate hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like path_truncate, the file_truncate hook also restricts file truncation, but is called in the cases where truncation is attempted on an already-opened file. This is required in a subsequent commit to handle ftruncate() operations differently to truncate() operations. Acked-by: Tetsuo Handa Acked-by: John Johansen Acked-by: Paul Moore Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20221018182216.301684-2-gnoack3000@gmail.com Signed-off-by: Mickaël Salaün --- fs/namei.c | 2 +- fs/open.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 578c2110df02..85e9b9cc5c38 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3211,7 +3211,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) if (error) return error; - error = security_path_truncate(path); + error = security_file_truncate(filp); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, diff --git a/fs/open.c b/fs/open.c index a81319b6177f..c92f76ab341a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -188,7 +188,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); - error = security_path_truncate(&f.file->f_path); + error = security_file_truncate(f.file); if (!error) error = do_truncate(file_mnt_user_ns(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); -- cgit From 4053d2500beb0ca1e0757665af9e31da249a7a52 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Sep 2022 10:19:34 +0200 Subject: orangefs: rework posix acl handling when creating new filesystem objects When creating new filesytem objects orangefs used to create posix acls after it had created and inserted a new inode. This made it necessary to all posix_acl_chmod() on the newly created inode in case the mode of the inode would be changed by the posix acls. Instead of doing it this way calculate the correct mode directly before actually creating the inode. So we first create posix acls, then pass the mode that posix acls mandate into the orangefs getattr helper and calculate the correct mode. 
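Condensed from the patch below, the creation path now computes the final mode up front and applies the ACLs directly (error paths trimmed; posix_acl_create(), __orangefs_inode_getattr() and __orangefs_set_acl() are the functions the patch actually uses):

	error = posix_acl_create(dir, &mode, &default_acl, &acl);
	if (error)
		return ERR_PTR(error);

	/* mode already carries any ACL-mandated adjustment */
	error = __orangefs_inode_getattr(inode, mode, ORANGEFS_GETATTR_NEW);

	if (!error && default_acl)
		error = __orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
	if (!error && acl)
		error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);

	posix_acl_release(acl);		/* both releases tolerate NULL */
	posix_acl_release(default_acl);

so the forcible __orangefs_setattr() that previously corrected the mode after inode creation is no longer needed.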
This is needed so we can simply change posix_acl_chmod() to take a dentry instead of an inode argument in the next patch. Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- fs/orangefs/acl.c | 44 ++----------------------------------------- fs/orangefs/inode.c | 44 +++++++++++++++++++++++++++++++++++-------- fs/orangefs/orangefs-kernel.h | 6 ++++-- fs/orangefs/orangefs-utils.c | 10 ++++++++-- 4 files changed, 50 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 605e5a3506ec..0e2db840c217 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -64,8 +64,7 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu) return acl; } -static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, - int type) +int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error = 0; void *value = NULL; @@ -153,46 +152,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, rc = __orangefs_set_acl(inode, acl, type); if (!rc && (iattr.ia_valid == ATTR_MODE)) - rc = __orangefs_setattr(inode, &iattr); + rc = __orangefs_setattr_mode(inode, &iattr); return rc; } - -int orangefs_init_acl(struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - umode_t mode = inode->i_mode; - struct iattr iattr; - int error = 0; - - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) - return error; - - if (default_acl) { - error = __orangefs_set_acl(inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } else { - inode->i_default_acl = NULL; - } - - if (acl) { - if (!error) - error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); - } else { - inode->i_acl = NULL; - } - - /* If mode of the inode was changed, then do a forcible ->setattr */ - if (mode != inode->i_mode) { - memset(&iattr, 0, sizeof iattr); - inode->i_mode = mode; - iattr.ia_mode = mode; - iattr.ia_valid |= ATTR_MODE; - __orangefs_setattr(inode, &iattr); - } - - return error; -} diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7a8c0c6e698d..35788cde6d24 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -828,15 +828,22 @@ again: spin_unlock(&inode->i_lock); mark_inode_dirty(inode); - if (iattr->ia_valid & ATTR_MODE) - /* change mod on a file that has ACLs */ - ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); - ret = 0; out: return ret; } +int __orangefs_setattr_mode(struct inode *inode, struct iattr *iattr) +{ + int ret; + + ret = __orangefs_setattr(inode, iattr); + /* change mode on a file that has ACLs */ + if (!ret && (iattr->ia_valid & ATTR_MODE)) + ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + return ret; +} + /* * Change attributes of an object referenced by dentry. */ @@ -849,7 +856,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) goto out; - ret = __orangefs_setattr(d_inode(dentry), iattr); + ret = __orangefs_setattr_mode(d_inode(dentry), iattr); sync_inode_metadata(d_inode(dentry), 1); out: gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", @@ -1097,8 +1104,9 @@ struct inode *orangefs_iget(struct super_block *sb, * Allocate an inode for a newly created file and insert it into the inode hash. 
*/ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, - int mode, dev_t dev, struct orangefs_object_kref *ref) + umode_t mode, dev_t dev, struct orangefs_object_kref *ref) { + struct posix_acl *acl = NULL, *default_acl = NULL; unsigned long hash = orangefs_handle_hash(ref); struct inode *inode; int error; @@ -1115,16 +1123,33 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, if (!inode) return ERR_PTR(-ENOMEM); + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto out_iput; + orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); + error = __orangefs_inode_getattr(inode, mode, ORANGEFS_GETATTR_NEW); if (error) goto out_iput; orangefs_init_iops(inode); inode->i_rdev = dev; + if (default_acl) { + error = __orangefs_set_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + if (error) + goto out_iput; + } + + if (acl) { + error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_iput; + } + error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref); if (error < 0) goto out_iput; @@ -1132,10 +1157,13 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, gossip_debug(GOSSIP_INODE_DEBUG, "Initializing ACL's for inode %pU\n", get_khandle_from_ino(inode)); - orangefs_init_acl(inode, dir); + posix_acl_release(acl); + posix_acl_release(default_acl); return inode; out_iput: iput(inode); + posix_acl_release(acl); + posix_acl_release(default_acl); return ERR_PTR(error); } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index b5940ec1836a..3298b15684b7 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -103,13 +103,13 @@ enum orangefs_vfs_op_states { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif -extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); +int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); /* * orangefs data structures @@ -356,11 +356,12 @@ void fsid_key_table_finalize(void); vm_fault_t orangefs_page_mkwrite(struct vm_fault *); struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, - int mode, + umode_t mode, dev_t dev, struct orangefs_object_kref *ref); int __orangefs_setattr(struct inode *, struct iattr *); +int __orangefs_setattr_mode(struct inode *inode, struct iattr *iattr); int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, @@ -422,6 +423,7 @@ int orangefs_inode_setxattr(struct inode *inode, #define ORANGEFS_GETATTR_SIZE 2 int orangefs_inode_getattr(struct inode *, int); +int __orangefs_inode_getattr(struct inode *inode, umode_t mode, int flags); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 46b7dcff18ac..334a2fd98c37 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -233,7 +233,7 @@ static int orangefs_inode_is_stale(struct inode *inode, return 0; } -int orangefs_inode_getattr(struct inode *inode, int flags) +int __orangefs_inode_getattr(struct inode *inode, umode_t mode, 
int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -369,7 +369,8 @@ again2: /* special case: mark the root inode as sticky */ inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | - orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes); + orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes) | + mode; orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; @@ -381,6 +382,11 @@ out: return ret; } +int orangefs_inode_getattr(struct inode *inode, int flags) +{ + return __orangefs_inode_getattr(inode, 0, flags); +} + int orangefs_inode_check_changed(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); -- cgit From 138060ba92b3b0d77c8e6818d0f33398b23ea42e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Sep 2022 10:29:39 +0200 Subject: fs: pass dentry to set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. Since some filesystem rely on the dentry being available to them when setting posix acls (e.g., 9p and cifs) they cannot rely on set acl inode operation. But since ->set_acl() is required in order to use the generic posix acl xattr handlers filesystems that do not implement this inode operation cannot use the handler and need to implement their own dedicated posix acl handlers. Update the ->set_acl() inode method to take a dentry argument. This allows all filesystems to rely on ->set_acl(). As far as I can tell all codepaths can be switched to rely on the dentry instead of just the inode. Note that the original motivation for passing the dentry separate from the inode instead of just the dentry in the xattr handlers was because of security modules that call security_d_instantiate(). This hook is called during d_instantiate_new(), d_add(), __d_instantiate_anon(), and d_splice_alias() to initialize the inode's security context and possibly to set security.* xattrs. Since this only affects security.* xattrs this is completely irrelevant for posix acls. 
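The shape of the API change, visible in each conversion below, is just the second parameter of the ->set_acl() inode operation:

	/* before */
	int (*set_acl)(struct user_namespace *mnt_userns, struct inode *inode,
		       struct posix_acl *acl, int type);

	/* after */
	int (*set_acl)(struct user_namespace *mnt_userns, struct dentry *dentry,
		       struct posix_acl *acl, int type);

Implementations that still need the inode derive it locally with struct inode *inode = d_inode(dentry);, as the btrfs and ceph hunks below do.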
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- fs/bad_inode.c | 2 +- fs/btrfs/acl.c | 3 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/acl.c | 3 ++- fs/ceph/inode.c | 2 +- fs/ceph/super.h | 2 +- fs/ext2/acl.c | 3 ++- fs/ext2/acl.h | 2 +- fs/ext2/inode.c | 2 +- fs/ext4/acl.c | 3 ++- fs/ext4/acl.h | 2 +- fs/ext4/inode.c | 2 +- fs/f2fs/acl.c | 4 +++- fs/f2fs/acl.h | 2 +- fs/f2fs/file.c | 2 +- fs/fuse/acl.c | 3 ++- fs/fuse/fuse_i.h | 2 +- fs/gfs2/acl.c | 3 ++- fs/gfs2/acl.h | 2 +- fs/gfs2/inode.c | 2 +- fs/jffs2/acl.c | 3 ++- fs/jffs2/acl.h | 2 +- fs/jffs2/fs.c | 2 +- fs/jfs/acl.c | 3 ++- fs/jfs/file.c | 2 +- fs/jfs/jfs_acl.h | 2 +- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smbacl.c | 4 ++-- fs/ksmbd/vfs.c | 15 ++++++++------- fs/ksmbd/vfs.h | 4 ++-- fs/nfs/nfs3_fs.h | 2 +- fs/nfs/nfs3acl.c | 3 ++- fs/nfsd/nfs2acl.c | 4 ++-- fs/nfsd/nfs3acl.c | 4 ++-- fs/nfsd/vfs.c | 4 ++-- fs/ntfs3/file.c | 2 +- fs/ntfs3/ntfs_fs.h | 4 ++-- fs/ntfs3/xattr.c | 9 +++++---- fs/ocfs2/acl.c | 3 ++- fs/ocfs2/acl.h | 2 +- fs/orangefs/acl.c | 5 +++-- fs/orangefs/inode.c | 7 ++++--- fs/orangefs/orangefs-kernel.h | 4 ++-- fs/posix_acl.c | 22 +++++++++++++--------- fs/reiserfs/acl.h | 6 +++--- fs/reiserfs/inode.c | 2 +- fs/reiserfs/xattr_acl.c | 9 ++++++--- fs/xfs/xfs_acl.c | 3 ++- fs/xfs/xfs_acl.h | 2 +- fs/xfs/xfs_iops.c | 10 ++++++---- 51 files changed, 112 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9d1cde8066cf..bc9917d372ed 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -154,7 +154,7 @@ static int bad_inode_tmpfile(struct user_namespace *mnt_userns, } static int bad_inode_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, + struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 548d6a5477b4..1e47b3ec3989 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -110,10 +110,11 @@ out: return ret; } -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int ret; + struct inode *inode = d_inode(dentry); umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 727595eee973..d93a4d027706 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3987,7 +3987,7 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b0807c59e321..312ba03c56ae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5256,7 +5256,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); } return err; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 
f4fc8e0b847c..c7e8dd5b58d4 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -85,13 +85,14 @@ retry: return acl; } -int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int ret = 0, size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; + struct inode *inode = d_inode(dentry); struct timespec64 old_ctime = inode->i_ctime; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4af5e55abc15..ca8aef906dc1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2255,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); + err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode); return err; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 40630e6f691c..50e57a1fa32f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1117,7 +1117,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); struct posix_acl *ceph_get_acl(struct inode *, int, bool); int ceph_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, int type); + struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_init_inode_acls(struct inode *inode, diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index bf298967c5b8..440d5f1e9d47 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -219,11 +219,12 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 925ab6287d35..3841becb94ff 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); -extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 918ab2f9e4c0..e97e77be64f3 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1652,7 +1652,7 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } setattr_copy(&init_user_ns, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 57e82e25f8e2..a9f89539aeee 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -225,12 +225,13 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; int 
error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; int update_mode = 0; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 3219669732bf..09c4a8a3b716 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); -int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b5ef1b64249..a8e12ce6673d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5550,7 +5550,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); err_out: if (error) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 5bbc44a5216e..c1c74aa658ae 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -276,9 +276,11 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, return error; } -int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index a26e33cab4ff..ea2bbb3f264b 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); -extern int f2fs_set_acl(struct user_namespace *, struct inode *, +extern int f2fs_set_acl(struct user_namespace *, struct dentry *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82cda1258227..122339482bdc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1025,7 +1025,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, __setattr_copy(mnt_userns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..8edd0f313515 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -53,9 +53,10 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) return acl; } -int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); const char *name; int ret; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 98a9cf531873..26a7c524eb70 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1269,7 +1269,7 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); -int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int fuse_set_acl(struct user_namespace 
*mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); /* readdir.c */ diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 734d1f05d823..3dcde4912413 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -109,9 +109,10 @@ out: return error; } -int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; bool need_unlock = false; diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index cd180ca7c959..b8de8c148f5c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,7 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 04a201584fa7..314b9ce70682 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1997,7 +1997,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index e945e3484788..8bb58ce5c06c 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -229,10 +229,11 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int rc, xprefix; + struct inode *inode = d_inode(dentry); switch (type) { case ACL_TYPE_ACCESS: diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9d9fb7cf093e..ca36a6eca594 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); -int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 39cec28096a7..66af51c41619 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -202,7 +202,7 @@ int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return rc; } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a653f34c6e26..3b667eccc73b 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -94,12 +94,13 @@ out: return rc; } -int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int rc; tid_t tid; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; tid = txBegin(inode->i_sb, 0); diff --git 
a/fs/jfs/file.c b/fs/jfs/file.c index 332dc9ac47a9..e3eb9c36751f 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -123,7 +123,7 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return rc; } diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 3de40286d31f..f0704a25835f 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -8,7 +8,7 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); -int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d0..2466edc57424 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2956,7 +2956,7 @@ int smb2_open(struct ksmbd_work *work) struct inode *inode = d_inode(path.dentry); posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns, - inode, + path.dentry, d_inode(path.dentry->d_parent)); if (posix_acl_rc) ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); @@ -2972,7 +2972,7 @@ int smb2_open(struct ksmbd_work *work) if (rc) { if (posix_acl_rc) ksmbd_vfs_set_init_posix_acl(user_ns, - inode); + path.dentry); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index b05ff9b146b5..a1e05fe997fe 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1386,14 +1386,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, inode, + rc = set_posix_acl(user_ns, path->dentry, ACL_TYPE_ACCESS, fattr.cf_acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, inode, + rc = set_posix_acl(user_ns, path->dentry, ACL_TYPE_DEFAULT, fattr.cf_dacls); if (rc) ksmbd_debug(SMB, diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 8de970d6146f..7dee8b78762d 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1824,10 +1824,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, - struct inode *inode) + struct dentry *dentry) { struct posix_acl_state acl_state; struct posix_acl *acls; + struct inode *inode = d_inode(dentry); int rc; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) @@ -1856,14 +1857,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); else if (S_ISDIR(inode->i_mode)) { posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, - acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); @@ -1874,10 +1874,11 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, } 
int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, - struct inode *inode, struct inode *parent_inode) + struct dentry *dentry, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; + struct inode *inode = d_inode(dentry); int rc, i; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) @@ -1895,12 +1896,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, } } - rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode)) { - rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 593059ca8511..0d73d735cc39 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -160,8 +160,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, - struct inode *inode); + struct dentry *dentry); int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, - struct inode *inode, + struct dentry *dentry, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 03a4e679fd99..df9ca56db347 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,7 @@ */ #ifdef CONFIG_NFS_V3_ACL extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); -extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 93de0b58647a..22890d97a9e4 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -255,10 +255,11 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; + struct inode *inode = d_inode(dentry); int status; if (S_ISDIR(inode->i_mode)) { diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 13e6e6897f6c..b1839638500c 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); if (error) goto out_drop_lock; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 2fb9ee356455..da4a0d09bd84 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = 
set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); out_drop_lock: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f650afedd67f..4eded0729108 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -480,12 +480,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_seclabel->data, attr->na_seclabel->len); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) attr->na_aclerr = set_posix_acl(&init_user_ns, - inode, ACL_TYPE_ACCESS, + dentry, ACL_TYPE_ACCESS, attr->na_pacl); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode)) attr->na_aclerr = set_posix_acl(&init_user_ns, - inode, ACL_TYPE_DEFAULT, + dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); inode_unlock(inode); if (size_change) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4f2ffc7ef296..ee5101e6bd68 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -802,7 +802,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(mnt_userns, inode, attr); if (mode != inode->i_mode) { - err = ntfs_acl_chmod(mnt_userns, inode); + err = ntfs_acl_chmod(mnt_userns, dentry); if (err) goto out; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 2c791222c4e2..a4d292809a33 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -843,7 +843,7 @@ int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); -int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, struct inode *dir); @@ -852,7 +852,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, #define ntfs_set_acl NULL #endif -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode); +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry); int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 7de8718c68a9..aafe98ee0b21 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -619,10 +619,10 @@ out: /* * ntfs_set_acl - inode_operations::set_acl */ -int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); + return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false); } /* @@ -664,8 +664,9 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, /* * ntfs_acl_chmod - Helper for ntfs3_setattr(). 
*/ -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; if (!(sb->s_flags & SB_POSIXACL)) @@ -674,7 +675,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - return posix_acl_chmod(mnt_userns, inode, inode->i_mode); + return posix_acl_chmod(mnt_userns, dentry, inode->i_mode); } /* diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 23a72a423955..9f19cf9a5a9f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -260,12 +260,13 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; int status, had_lock; struct ocfs2_lock_holder oh; + struct inode *inode = d_inode(dentry); had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); if (had_lock < 0) diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 95a57c888ab6..a897c4e41b26 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -17,7 +17,7 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 0e2db840c217..c5da2091cefb 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -118,12 +118,13 @@ out: return error; } -int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct iattr iattr; int rc; + struct inode *inode = d_inode(dentry); memset(&iattr, 0, sizeof iattr); @@ -152,7 +153,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, rc = __orangefs_set_acl(inode, acl, type); if (!rc && (iattr.ia_valid == ATTR_MODE)) - rc = __orangefs_setattr_mode(inode, &iattr); + rc = __orangefs_setattr_mode(dentry, &iattr); return rc; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 35788cde6d24..825872d8d377 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -833,14 +833,15 @@ out: return ret; } -int __orangefs_setattr_mode(struct inode *inode, struct iattr *iattr) +int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr) { int ret; + struct inode *inode = d_inode(dentry); ret = __orangefs_setattr(inode, iattr); /* change mode on a file that has ACLs */ if (!ret && (iattr->ia_valid & ATTR_MODE)) - ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return ret; } @@ -856,7 +857,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) goto out; - ret = __orangefs_setattr_mode(d_inode(dentry), iattr); + ret = __orangefs_setattr_mode(dentry, iattr); sync_inode_metadata(d_inode(dentry), 1); out: gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", diff --git 
a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3298b15684b7..55cd6d50eea1 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -107,7 +107,7 @@ extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, + struct dentry *dentry, struct posix_acl *acl, int type); int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); @@ -361,7 +361,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct orangefs_object_kref *ref); int __orangefs_setattr(struct inode *, struct iattr *); -int __orangefs_setattr_mode(struct inode *inode, struct iattr *iattr); +int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr); int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 74dc0f571dc9..c4bc58a1160e 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -578,19 +578,20 @@ EXPORT_SYMBOL(__posix_acl_chmod); * posix_acl_chmod - chmod a posix acl * * @mnt_userns: user namespace of the mount @inode was found from - * @inode: inode to check permissions on + * @dentry: dentry to check permissions on * @mode: the new mode of @inode * - * If the inode has been found through an idmapped mount the user namespace of + * If the dentry has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then * take care to map the inode according to @mnt_userns before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs init_user_ns. 
*/ int - posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode, + posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode) { + struct inode *inode = d_inode(dentry); struct posix_acl *acl; int ret = 0; @@ -609,7 +610,7 @@ int ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; - ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS); + ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } @@ -1139,9 +1140,11 @@ posix_acl_xattr_get(const struct xattr_handler *handler, } int -set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode, +set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type, struct posix_acl *acl) { + struct inode *inode = d_inode(dentry); + if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) @@ -1157,14 +1160,14 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode, if (ret) return ret; } - return inode->i_op->set_acl(mnt_userns, inode, acl, type); + return inode->i_op->set_acl(mnt_userns, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); static int posix_acl_xattr_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, - struct dentry *unused, struct inode *inode, + struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { @@ -1186,7 +1189,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler, if (IS_ERR(acl)) return PTR_ERR(acl); } - ret = set_posix_acl(mnt_userns, inode, handler->flags, acl); + ret = set_posix_acl(mnt_userns, dentry, handler->flags, acl); posix_acl_release(acl); return ret; } @@ -1215,10 +1218,11 @@ const struct xattr_handler posix_acl_default_xattr_handler = { }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); -int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; + struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { error = posix_acl_update_mode(mnt_userns, inode, diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index d9052b8ce6dd..29c503a06db4 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,9 +49,9 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); -int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); -int reiserfs_acl_chmod(struct inode *inode); +int reiserfs_acl_chmod(struct dentry *dentry); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode); @@ -63,7 +63,7 @@ int reiserfs_cache_default_acl(struct inode *dir); #define reiserfs_get_acl NULL #define reiserfs_set_acl NULL -static inline int reiserfs_acl_chmod(struct inode *inode) +static inline int reiserfs_acl_chmod(struct dentry *dentry) { return 0; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index b9580a6515ee..c7d1fa526dea 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3404,7 +3404,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!error && reiserfs_posixacl(inode->i_sb)) { if (attr->ia_valid & ATTR_MODE) - error = reiserfs_acl_chmod(inode); + 
error = reiserfs_acl_chmod(dentry); } out: diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d6fcddc46f5b..966ba48e33ec 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error, error2; @@ -26,6 +26,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, size_t jcreate_blocks; int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; /* @@ -396,13 +397,15 @@ int reiserfs_cache_default_acl(struct inode *inode) /* * Called under i_mutex */ -int reiserfs_acl_chmod(struct inode *inode) +int reiserfs_acl_chmod(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); + if (IS_PRIVATE(inode)) return 0; if (get_inode_sd_version(inode) == STAT_DATA_V1 || !reiserfs_posixacl(inode->i_sb)) return 0; - return posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); } diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b744c62052b6..a05f44eb8178 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -242,12 +242,13 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { umode_t mode; bool set_mode = false; int error = 0; + struct inode *inode = d_inode(dentry); if (!acl) goto set_acl; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 263404d0bfda..dcd176149c7a 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); -extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2e10e1c66ad6..ab266ba65a84 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -651,6 +651,7 @@ xfs_vn_change_ok( static int xfs_setattr_nonsize( struct user_namespace *mnt_userns, + struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) { @@ -757,7 +758,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); if (error) return error; } @@ -779,6 +780,7 @@ out_dqrele: STATIC int xfs_setattr_size( struct user_namespace *mnt_userns, + struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) { @@ -810,7 +812,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. 
*/ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(mnt_userns, ip, iattr); + return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); } /* @@ -987,7 +989,7 @@ xfs_vn_setattr_size( error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (error) return error; - return xfs_setattr_size(mnt_userns, ip, iattr); + return xfs_setattr_size(mnt_userns, dentry, ip, iattr); } STATIC int @@ -1019,7 +1021,7 @@ xfs_vn_setattr( error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(mnt_userns, ip, iattr); + error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); } return error; -- cgit From 06b4e09aab6c1ae6b3991a729d2fcb31e358e7f3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:11 -0700 Subject: pstore/ram: Set freed addresses to NULL For good measure, set all the freed addresses to NULL when managing przs. Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed-and-tested-by: Guilherme G. Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-5-keescook@chromium.org --- fs/pstore/ram.c | 13 ++++++++----- fs/pstore/ram_core.c | 11 +++++++++-- fs/pstore/ram_internal.h | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f5bf360cf905..5a39fa8522b0 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -453,25 +453,27 @@ static void ramoops_free_przs(struct ramoops_context *cxt) int i; /* Free pmsg PRZ */ - persistent_ram_free(cxt->mprz); + persistent_ram_free(&cxt->mprz); /* Free console PRZ */ - persistent_ram_free(cxt->cprz); + persistent_ram_free(&cxt->cprz); /* Free dump PRZs */ if (cxt->dprzs) { for (i = 0; i < cxt->max_dump_cnt; i++) - persistent_ram_free(cxt->dprzs[i]); + persistent_ram_free(&cxt->dprzs[i]); kfree(cxt->dprzs); + cxt->dprzs = NULL; cxt->max_dump_cnt = 0; } /* Free ftrace PRZs */ if (cxt->fprzs) { for (i = 0; i < cxt->max_ftrace_cnt; i++) - persistent_ram_free(cxt->fprzs[i]); + persistent_ram_free(&cxt->fprzs[i]); kfree(cxt->fprzs); + cxt->fprzs = NULL; cxt->max_ftrace_cnt = 0; } } @@ -555,9 +557,10 @@ static int ramoops_init_przs(const char *name, while (i > 0) { i--; - persistent_ram_free(prz_ar[i]); + persistent_ram_free(&prz_ar[i]); } kfree(prz_ar); + prz_ar = NULL; goto fail; } *paddr += zone_sz; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 9e1047f4316d..97dde525150a 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -544,8 +544,14 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, return 0; } -void persistent_ram_free(struct persistent_ram_zone *prz) +void persistent_ram_free(struct persistent_ram_zone **_prz) { + struct persistent_ram_zone *prz; + + if (!_prz) + return; + + prz = *_prz; if (!prz) return; @@ -569,6 +575,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) persistent_ram_free_old(prz); kfree(prz->label); kfree(prz); + *_prz = NULL; } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, @@ -605,6 +612,6 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, return prz; err: - persistent_ram_free(prz); + persistent_ram_free(&prz); return ERR_PTR(ret); } diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h index 440ee7a35e10..5f694698351f 100644 --- a/fs/pstore/ram_internal.h +++ b/fs/pstore/ram_internal.h @@ -82,7 +82,7 @@ struct persistent_ram_zone { struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, u32 sig, struct 
persistent_ram_ecc_info *ecc_info, unsigned int memtype, u32 flags, char *label); -void persistent_ram_free(struct persistent_ram_zone *prz); +void persistent_ram_free(struct persistent_ram_zone **_prz); void persistent_ram_zap(struct persistent_ram_zone *prz); int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, -- cgit From 38b91847c314f49c80e30062549d4709a3754ea6 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 13 Oct 2022 18:06:46 -0300 Subject: pstore: Alert on backend write error The pstore dump function doesn't alert at all on errors. Although pstore is usually a last resort and, if it fails, users won't be able to read the kernel log, this is not the case for users with serial access to the server, for example. So, let's at least attempt to inform such advanced users of the first backend write error detected during the kmsg dump - this is also very useful for pstore debugging purposes. Signed-off-by: Guilherme G. Piccoli Acked-by: Ard Biesheuvel Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221013210648.137452-2-gpiccoli@igalia.com --- fs/pstore/platform.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 06c2c66af332..cbc0b468c1ab 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -393,6 +393,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, const char *why; unsigned int part = 1; unsigned long flags = 0; + int saved_ret = 0; int ret; why = kmsg_dump_reason_str(reason); @@ -463,12 +464,21 @@ static void pstore_dump(struct kmsg_dumper *dumper, if (ret == 0 && reason == KMSG_DUMP_OOPS) { pstore_new_entry = 1; pstore_timer_kick(); + } else { + /* Preserve only the first non-zero returned value. */ + if (!saved_ret) + saved_ret = ret; } total += record.size; part++; } spin_unlock_irqrestore(&psinfo->buf_lock, flags); + + if (saved_ret) { + pr_err_once("backend (%s) writing error (%d)\n", psinfo->name, + saved_ret); + } } static struct kmsg_dumper pstore_dumper = { -- cgit From cac2f8b8d8b50ef32b3e34f6dcbbf08937e4f616 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:00 +0200 Subject: fs: rename current get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fix up posix acls before storing or reporting them to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers, we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. The current inode operation for getting posix acls takes an inode argument but various filesystems (e.g., 9p, cifs, overlayfs) need access to the dentry. In contrast to the ->set_acl() inode operation, we cannot simply extend ->get_acl() to take a dentry argument. The ->get_acl() inode operation is called from: acl_permission_check() -> check_acl() -> get_acl() which is part of generic_permission() which in turn is part of inode_permission(). Both generic_permission() and inode_permission() are called in the ->permission() handler of various filesystems (e.g., overlayfs). So simply passing a dentry argument to ->get_acl() would amount to also having to pass a dentry argument to ->permission().
We should avoid this unnecessary change. So instead of extending the existing inode operation rename it from ->get_acl() to ->get_inode_acl() and add a ->get_acl() method later that passes a dentry argument and which filesystems that need access to the dentry can implement instead of ->get_inode_acl(). Filesystems like cifs which allow setting and getting posix acls but not using them for permission checking during lookup can simply not implement ->get_inode_acl(). This is intended to be a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Suggested-by/Inspired-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- fs/9p/vfs_inode_dotl.c | 4 ++-- fs/bad_inode.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/ceph/dir.c | 2 +- fs/ceph/inode.c | 2 +- fs/erofs/inode.c | 6 +++--- fs/erofs/namei.c | 2 +- fs/ext2/file.c | 2 +- fs/ext2/namei.c | 4 ++-- fs/ext4/file.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/namei.c | 4 ++-- fs/f2fs/file.c | 2 +- fs/f2fs/namei.c | 4 ++-- fs/fuse/dir.c | 4 ++-- fs/gfs2/inode.c | 4 ++-- fs/jffs2/dir.c | 2 +- fs/jffs2/file.c | 2 +- fs/jfs/file.c | 2 +- fs/jfs/namei.c | 2 +- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smbacl.c | 2 +- fs/ksmbd/vfs.c | 4 ++-- fs/namei.c | 4 ++-- fs/nfs/nfs3acl.c | 6 +++--- fs/nfs/nfs3proc.c | 4 ++-- fs/nfsd/nfs2acl.c | 4 ++-- fs/nfsd/nfs3acl.c | 4 ++-- fs/nfsd/nfs4acl.c | 4 ++-- fs/ntfs3/file.c | 2 +- fs/ntfs3/namei.c | 4 ++-- fs/ocfs2/file.c | 4 ++-- fs/ocfs2/namei.c | 2 +- fs/orangefs/inode.c | 2 +- fs/orangefs/namei.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/overlayfs/inode.c | 6 +++--- fs/posix_acl.c | 43 ++++++++++++++++++++++--------------------- fs/reiserfs/file.c | 2 +- fs/reiserfs/namei.c | 4 ++-- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/xfs_iops.c | 6 +++--- 42 files changed, 89 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5cfa4b4f070f..0d1a7f2c579d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -983,14 +983,14 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, - .get_acl = v9fs_iop_get_acl, + .get_inode_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, - .get_acl = v9fs_iop_get_acl, + .get_inode_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { diff --git a/fs/bad_inode.c b/fs/bad_inode.c index bc9917d372ed..92737166203f 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -177,7 +177,7 @@ static const struct inode_operations bad_inode_ops = .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, - .get_acl = bad_inode_get_acl, + .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 312ba03c56ae..7a3076ccbab4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -11288,7 +11288,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .mknod = btrfs_mknod, .listxattr = btrfs_listxattr, .permission = btrfs_permission, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, @@ -11341,7 
+11341,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .fileattr_get = btrfs_fileattr_get, @@ -11352,7 +11352,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e7e2ebac330d..6c7026cc8988 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2033,7 +2033,7 @@ const struct inode_operations ceph_dir_iops = { .getattr = ceph_getattr, .setattr = ceph_setattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ca8aef906dc1..31cd27843eca 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -126,7 +126,7 @@ const struct inode_operations ceph_file_iops = { .setattr = ceph_setattr, .getattr = ceph_getattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, }; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index ad2a82f2eb4c..2d571343deec 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -371,7 +371,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; @@ -379,12 +379,12 @@ const struct inode_operations erofs_symlink_iops = { .get_link = page_get_link, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, }; const struct inode_operations erofs_fast_symlink_iops = { .get_link = simple_get_link, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, }; diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 0dc34721080c..b64a108fac92 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -228,6 +228,6 @@ const struct inode_operations erofs_dir_iops = { .lookup = erofs_lookup, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index eb97aa3d700e..6b4bebe982ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -200,7 +200,7 @@ const struct inode_operations ext2_file_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, .fileattr_get = ext2_fileattr_get, diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 9125eab85146..c056957221a2 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -427,7 +427,7 @@ const struct inode_operations ext2_dir_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, .fileattr_get = ext2_fileattr_get, @@ -438,6 +438,6 @@ const struct inode_operations 
ext2_special_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a7a597c727e6..7ac0a81bd371 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -955,7 +955,7 @@ const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e9bc46684106..67a257a69758 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -870,7 +870,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d5daaf41e1fc..b8a91d74fdd1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -4186,7 +4186,7 @@ const struct inode_operations ext4_dir_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, @@ -4197,6 +4197,6 @@ const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 122339482bdc..83df6f6173d3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1046,7 +1046,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a389772fd212..c227113b0f26 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1379,7 +1379,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, @@ -1397,7 +1397,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, }; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bb97a384dc5d..25e6b0f7e73d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1935,7 +1935,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_acl = fuse_get_acl, + .get_inode_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, @@ -1957,7 +1957,7 @@ static const struct inode_operations fuse_common_inode_operations = { 
.permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_acl = fuse_get_acl, + .get_inode_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 314b9ce70682..1371e067d2a7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -2149,7 +2149,7 @@ static const struct inode_operations gfs2_file_iops = { .getattr = gfs2_getattr, .listxattr = gfs2_listxattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, + .get_inode_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .fileattr_get = gfs2_fileattr_get, @@ -2171,7 +2171,7 @@ static const struct inode_operations gfs2_dir_iops = { .getattr = gfs2_getattr, .listxattr = gfs2_listxattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, + .get_inode_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .atomic_open = gfs2_atomic_open, diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index c0aabbcbfd58..f399b390b5f6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -62,7 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .get_acl = jffs2_get_acl, + .get_inode_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index ba86acbe12d3..3cf71befa475 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -64,7 +64,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .get_acl = jffs2_get_acl, + .get_inode_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jfs/file.c b/fs/jfs/file.c index e3eb9c36751f..88663465aecd 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -133,7 +133,7 @@ const struct inode_operations jfs_file_inode_operations = { .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL - .get_acl = jfs_get_acl, + .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9db4f5789c0e..b50afaf7966f 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1525,7 +1525,7 @@ const struct inode_operations jfs_dir_inode_operations = { .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL - .get_acl = jfs_get_acl, + .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 2466edc57424..9306e10753f9 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2487,9 +2487,9 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr, fattr->cf_dacls = NULL; if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { - fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS); + fattr->cf_acls = get_inode_acl(inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT); + fattr->cf_dacls = get_inode_acl(inode, ACL_TYPE_DEFAULT); } } diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index a1e05fe997fe..ab5c68cc0e13 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1289,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, } if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { - posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); + posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); if 
(posix_acls && !found) { unsigned int id = -1; diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 7dee8b78762d..93f65f01a4a6 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1375,7 +1375,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) return NULL; - posix_acls = get_acl(inode, acl_type); + posix_acls = get_inode_acl(inode, acl_type); if (!posix_acls) return NULL; @@ -1884,7 +1884,7 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) return -EOPNOTSUPP; - acls = get_acl(parent_inode, ACL_TYPE_DEFAULT); + acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT); if (!acls) return -ENOENT; pace = acls->a_entries; diff --git a/fs/namei.c b/fs/namei.c index 578c2110df02..1c80fd23cda2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -297,13 +297,13 @@ static int check_acl(struct user_namespace *mnt_userns, acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; - /* no ->get_acl() calls in RCU mode... */ + /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(mnt_userns, inode, acl, mask); } - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 22890d97a9e4..74d11e3c4205 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -265,14 +265,14 @@ int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (S_ISDIR(inode->i_mode)) { switch(type) { case ACL_TYPE_ACCESS: - alloc = get_acl(inode, ACL_TYPE_DEFAULT); + alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(alloc)) goto fail; dfacl = alloc; break; case ACL_TYPE_DEFAULT: - alloc = get_acl(inode, ACL_TYPE_ACCESS); + alloc = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(alloc)) goto fail; dfacl = acl; @@ -313,7 +313,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, struct posix_acl *acl; char *p = data + *result; - acl = get_acl(inode, type); + acl = get_inode_acl(inode, type); if (IS_ERR_OR_NULL(acl)) return 0; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2e7579626cf0..4bf208a0a8e9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -998,7 +998,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; @@ -1009,7 +1009,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index b1839638500c..c43c25a8da2e 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -55,7 +55,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) goto out; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -69,7 +69,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! 
*/ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { resp->status = nfserrno(PTR_ERR(acl)); goto fail; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index da4a0d09bd84..9daa621817d8 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -47,7 +47,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -61,7 +61,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { resp->status = nfserrno(PTR_ERR(acl)); goto fail; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index bb8e2f6d7d03..518203821790 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -135,7 +135,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, unsigned int flags = 0; int size = 0; - pacl = get_acl(inode, ACL_TYPE_ACCESS); + pacl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (!pacl) pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -147,7 +147,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; - dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(dpacl)) { error = PTR_ERR(dpacl); goto rel_pacl; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index ee5101e6bd68..c5e4a886593d 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -1255,7 +1255,7 @@ const struct inode_operations ntfs_file_inode_operations = { .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, }; diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index bc22cc321a74..053cc0e0f8b5 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -367,7 +367,7 @@ const struct inode_operations ntfs_dir_inode_operations = { .mknod = ntfs_mknod, .rename = ntfs_rename, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .setattr = ntfs3_setattr, .getattr = ntfs_getattr, @@ -379,7 +379,7 @@ const struct inode_operations ntfs_special_inode_operations = { .setattr = ntfs3_setattr, .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, }; // clang-format on diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9c67edd215d5..af900aaa9275 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2712,7 +2712,7 @@ const struct inode_operations ocfs2_file_iops = { .permission = ocfs2_permission, .listxattr = ocfs2_listxattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, .fileattr_get = ocfs2_fileattr_get, .fileattr_set = ocfs2_fileattr_set, @@ -2722,7 +2722,7 @@ const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, }; 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 961d1cf54388..c5ffded7ac92 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2916,7 +2916,7 @@ const struct inode_operations ocfs2_dir_iops = { .permission = ocfs2_permission, .listxattr = ocfs2_listxattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, .fileattr_get = ocfs2_fileattr_get, .fileattr_set = ocfs2_fileattr_set, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 825872d8d377..8974b0fbf00d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -975,7 +975,7 @@ static int orangefs_fileattr_set(struct user_namespace *mnt_userns, /* ORANGEFS2 implementation of VFS inode operations for files */ static const struct inode_operations orangefs_file_inode_operations = { - .get_acl = orangefs_get_acl, + .get_inode_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .setattr = orangefs_setattr, .getattr = orangefs_getattr, diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 600e8eee541f..75c1a3dcf68c 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -430,7 +430,7 @@ static int orangefs_rename(struct user_namespace *mnt_userns, /* ORANGEFS implementation of VFS inode operations for directories */ const struct inode_operations orangefs_dir_inode_operations = { .lookup = orangefs_lookup, - .get_acl = orangefs_get_acl, + .get_inode_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .create = orangefs_create, .unlink = orangefs_unlink, diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6b03457f72bb..7bece7010c00 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1311,7 +1311,7 @@ const struct inode_operations ovl_dir_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_acl = ovl_get_acl, + .get_inode_acl = ovl_get_acl, .update_time = ovl_update_time, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 9e61511de7a7..6eefd8b7868e 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -517,7 +517,7 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) const struct cred *old_cred; old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); + acl = get_inode_acl(realinode, type); revert_creds(old_cred); } /* @@ -721,7 +721,7 @@ static const struct inode_operations ovl_file_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_acl = ovl_get_acl, + .get_inode_acl = ovl_get_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, .fileattr_get = ovl_fileattr_get, @@ -741,7 +741,7 @@ static const struct inode_operations ovl_special_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_acl = ovl_get_acl, + .get_inode_acl = ovl_get_acl, .update_time = ovl_update_time, }; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index c4bc58a1160e..8bbabe477cb2 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -64,7 +64,7 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) if (acl == ACL_DONT_CACHE) { struct posix_acl *ret; - ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU); + ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU); if (!IS_ERR(ret)) acl = ret; } @@ -106,7 +106,7 @@ void forget_all_cached_acls(struct inode *inode) } EXPORT_SYMBOL(forget_all_cached_acls); -struct posix_acl *get_acl(struct 
inode *inode, int type) +struct posix_acl *get_inode_acl(struct inode *inode, int type) { void *sentinel; struct posix_acl **p; @@ -114,7 +114,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) /* * The sentinel is used to detect when another operation like - * set_cached_acl() or forget_cached_acl() races with get_acl(). + * set_cached_acl() or forget_cached_acl() races with get_inode_acl(). * It is guaranteed that is_uncached_acl(sentinel) is true. */ @@ -133,24 +133,24 @@ struct posix_acl *get_acl(struct inode *inode, int type) * current value of the ACL will not be ACL_NOT_CACHED and so our own * sentinel will not be set; another task will update the cache. We * could wait for that other task to complete its job, but it's easier - * to just call ->get_acl to fetch the ACL ourself. (This is going to - * be an unlikely race.) + * to just call ->get_inode_acl to fetch the ACL ourself. (This is + * going to be an unlikely race.) */ cmpxchg(p, ACL_NOT_CACHED, sentinel); /* - * Normally, the ACL returned by ->get_acl will be cached. + * Normally, the ACL returned by ->get_inode_acl will be cached. * A filesystem can prevent that by calling - * forget_cached_acl(inode, type) in ->get_acl. + * forget_cached_acl(inode, type) in ->get_inode_acl. * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. + * If the filesystem doesn't have a get_inode_acl() function at all, + * we'll just create the negative cache entry. */ - if (!inode->i_op->get_acl) { + if (!inode->i_op->get_inode_acl) { set_cached_acl(inode, type, NULL); return NULL; } - acl = inode->i_op->get_acl(inode, type, false); + acl = inode->i_op->get_inode_acl(inode, type, false); if (IS_ERR(acl)) { /* @@ -169,7 +169,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) posix_acl_release(acl); return acl; } -EXPORT_SYMBOL(get_acl); +EXPORT_SYMBOL(get_inode_acl); /* * Init a fresh posix_acl @@ -600,7 +600,7 @@ int if (!inode->i_op->set_acl) return -EOPNOTSUPP; - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR_OR_NULL(acl)) { if (acl == ERR_PTR(-EOPNOTSUPP)) return 0; @@ -630,7 +630,7 @@ posix_acl_create(struct inode *dir, umode_t *mode, if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; - p = get_acl(dir, ACL_TYPE_DEFAULT); + p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (!p || p == ERR_PTR(-EOPNOTSUPP)) { *mode &= ~current_umask(); return 0; @@ -1045,7 +1045,8 @@ posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, * Filesystems that store POSIX ACLs in the unaltered uapi format should use * posix_acl_from_xattr() when reading them from the backing store and * converting them into the struct posix_acl VFS format. The helper is - * specifically intended to be called from the ->get_acl() inode operation. + * specifically intended to be called from the ->get_inode_acl() inode + * operation. * * The posix_acl_from_xattr() function will map the raw {g,u}id values stored * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The @@ -1053,11 +1054,11 @@ posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, * correct k{g,u}id_t. The returned struct posix_acl can be cached. * * Note that posix_acl_from_xattr() does not take idmapped mounts into account. - * If it did it calling is from the ->get_acl() inode operation would return - * POSIX ACLs mapped according to an idmapped mount which would mean that the - * value couldn't be cached for the filesystem. 
Idmapped mounts are taken into - * account on the fly during permission checking or right at the VFS - - * userspace boundary before reporting them to the user. + * If it did it calling is from the ->get_inode_acl() inode operation would + * return POSIX ACLs mapped according to an idmapped mount which would mean + * that the value couldn't be cached for the filesystem. Idmapped mounts are + * taken into account on the fly during permission checking or right at the VFS + * - userspace boundary before reporting them to the user. * * Return: Allocated struct posix_acl on success, NULL for a valid header but * without actual POSIX ACL entries, or ERR_PTR() encoded error code. @@ -1127,7 +1128,7 @@ posix_acl_xattr_get(const struct xattr_handler *handler, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - acl = get_acl(inode, handler->flags); + acl = get_inode_acl(inode, handler->flags); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 6e228bfbe7ef..467d13da198f 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -256,7 +256,7 @@ const struct inode_operations reiserfs_file_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 3d7a35d6a18b..4d428e8704bc 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1659,7 +1659,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, @@ -1683,6 +1683,6 @@ const struct inode_operations reiserfs_special_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 966ba48e33ec..93fe414fed18 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -372,7 +372,7 @@ int reiserfs_cache_default_acl(struct inode *inode) if (IS_PRIVATE(inode)) return 0; - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (acl && !IS_ERR(acl)) { int size = reiserfs_acl_size(acl->a_count); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ab266ba65a84..712238305bc3 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1103,7 +1103,7 @@ xfs_vn_tmpfile( } static const struct inode_operations xfs_inode_operations = { - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1130,7 +1130,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1157,7 +1157,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = 
xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, -- cgit From bd9684b042dcbb400ecb4d169b74c9adc84aa088 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:02 +0200 Subject: cifs: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far cifs wasn't able to implement get and set acl inode operations because it needs access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let cifs implement get and set acl inode operations. This is mostly a copy and paste of the codepaths currently used in cifs' posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the cifs specific posix acl xattr handler and associated code will be removed and the code duplication will go away. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/cifs/cifsacl.c | 67 ++++++++++++++++++ fs/cifs/cifsfs.c | 2 + fs/cifs/cifsproto.h | 6 ++ fs/cifs/cifssmb.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 271 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fa480d62f313..a6730e0eb57b 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "cifspdu.h" #include "cifsglob.h" @@ -20,6 +21,8 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "fs_context.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" /* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { @@ -1668,3 +1671,67 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, kfree(pntsd); return rc; } + +struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ +#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) + struct posix_acl *acl = NULL; + ssize_t rc = -EOPNOTSUPP; + unsigned int xid; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + const char *full_path; + void *page; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + page = alloc_dentry_path(); + + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + acl = ERR_CAST(full_path); + goto out; + } + + /* return alt name if available as pseudo attr */ + switch (type) { + case ACL_TYPE_ACCESS: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_get_acl(xid, 
pTcon, full_path, &acl, + ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + + case ACL_TYPE_DEFAULT: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_get_acl(xid, pTcon, full_path, &acl, + ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + } + + if (rc < 0) { + if (rc == -EINVAL) + acl = ERR_PTR(-EOPNOTSUPP); + else + acl = ERR_PTR(rc); + } + +out: + free_dentry_path(page); + free_xid(xid); + cifs_put_tlink(tlink); + return acl; +#else + return ERR_PTR(-EOPNOTSUPP); +#endif +} diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c6ac19223ddc..05d0f84ede95 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1133,6 +1133,7 @@ const struct inode_operations cifs_dir_inode_ops = { .symlink = cifs_symlink, .mknod = cifs_mknod, .listxattr = cifs_listxattr, + .get_acl = cifs_get_acl, }; const struct inode_operations cifs_file_inode_ops = { @@ -1141,6 +1142,7 @@ const struct inode_operations cifs_file_inode_ops = { .permission = cifs_permission, .listxattr = cifs_listxattr, .fiemap = cifs_fiemap, + .get_acl = cifs_get_acl, }; const struct inode_operations cifs_symlink_inode_ops = { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 83e83d8beabb..28e848495594 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -224,6 +224,8 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *, u32); extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); +extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); extern unsigned int setup_authusers_ACE(struct cifs_ace *pace); @@ -541,6 +543,10 @@ extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char *acl_inf, const int buflen, const int acl_type, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + struct posix_acl **acl, const int acl_type, + const struct nls_table *nls_codepage, int remap); extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *fileName, const char *local_acl, const int buflen, const int acl_type, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1724066c1536..d30535406885 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3212,6 +3212,202 @@ setACLerrorExit: return rc; } +#ifdef CONFIG_FS_POSIX_ACL +/** + * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format + * @ace: POSIX ACL entry to store converted ACL into + * @cifs_ace: ACL in cifs format + * + * Convert an Access Control Entry from wire format to local POSIX xattr + * format. + * + * Note that the @cifs_uid member is used to store both {g,u}id_t. 
+ */ +static void cifs_init_posix_acl(struct posix_acl_entry *ace, + struct cifs_posix_ace *cifs_ace) +{ + /* u8 cifs fields do not need le conversion */ + ace->e_perm = cifs_ace->cifs_e_perm; + ace->e_tag = cifs_ace->cifs_e_tag; + + switch (ace->e_tag) { + case ACL_USER: + ace->e_uid = make_kuid(&init_user_ns, + le64_to_cpu(cifs_ace->cifs_uid)); + break; + case ACL_GROUP: + ace->e_gid = make_kgid(&init_user_ns, + le64_to_cpu(cifs_ace->cifs_uid)); + break; + } + return; +} + +/** + * cifs_to_posix_acl - copy cifs ACL format to POSIX ACL format + * @acl: ACLs returned in POSIX ACL format + * @src: ACLs in cifs format + * @acl_type: type of POSIX ACL requested + * @size_of_data_area: size of SMB we got + * + * This function converts ACLs from cifs format to POSIX ACL format. + * If @acl is NULL then the size of the buffer required to store POSIX ACLs in + * their uapi format is returned. + */ +static int cifs_to_posix_acl(struct posix_acl **acl, char *src, + const int acl_type, const int size_of_data_area) +{ + int size = 0; + __u16 count; + struct cifs_posix_ace *pACE; + struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; + struct posix_acl *kacl = NULL; + struct posix_acl_entry *pa, *pe; + + if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) + return -EOPNOTSUPP; + + if (acl_type == ACL_TYPE_ACCESS) { + count = le16_to_cpu(cifs_acl->access_entry_count); + pACE = &cifs_acl->ace_array[0]; + size = sizeof(struct cifs_posix_acl); + size += sizeof(struct cifs_posix_ace) * count; + /* check if we would go beyond end of SMB */ + if (size_of_data_area < size) { + cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n", + size_of_data_area, size); + return -EINVAL; + } + } else if (acl_type == ACL_TYPE_DEFAULT) { + count = le16_to_cpu(cifs_acl->access_entry_count); + size = sizeof(struct cifs_posix_acl); + size += sizeof(struct cifs_posix_ace) * count; + /* skip past access ACEs to get to default ACEs */ + pACE = &cifs_acl->ace_array[count]; + count = le16_to_cpu(cifs_acl->default_entry_count); + size += sizeof(struct cifs_posix_ace) * count; + /* check if we would go beyond end of SMB */ + if (size_of_data_area < size) + return -EINVAL; + } else { + /* illegal type */ + return -EINVAL; + } + + /* Allocate number of POSIX ACLs to store in VFS format. 
*/ + kacl = posix_acl_alloc(count, GFP_NOFS); + if (!kacl) + return -ENOMEM; + + FOREACH_ACL_ENTRY(pa, kacl, pe) { + cifs_init_posix_acl(pa, pACE); + pACE++; + } + + *acl = kacl; + return 0; +} + +int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, struct posix_acl **acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ +/* SMB_QUERY_POSIX_ACL */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + + cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName); + +queryAclRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->FileName[name_len] = 0; + pSMB->FileName[name_len+1] = 0; + } else { + name_len = copy_path_name(pSMB->FileName, searchName); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max data count below from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_qpi_req, + InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); + if (rc) { + cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc); + } else { + /* decode response */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + rc = cifs_to_posix_acl(acl, + (char *)&pSMBr->hdr.Protocol+data_offset, + acl_type, count); + } + } + cifs_buf_release(pSMB); + /* + * The else branch after SendReceive() doesn't return EAGAIN so if we + * allocated @acl in cifs_to_posix_acl() we are guaranteed to return + * here and don't leak POSIX ACLs. 
+ */ + if (rc == -EAGAIN) + goto queryAclRetry; + return rc; +} +#else +int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, struct posix_acl **acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_FS_POSIX_ACL */ + int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask) -- cgit From dc1af4c4b4721abfa07b351063825acc7e69cc66 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:03 +0200 Subject: cifs: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fix up posix acls before storing or reporting them to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. So far cifs wasn't able to implement get and set acl inode operations because it needs access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let cifs implement get and set acl inode operations. This is mostly a copy and paste of the codepaths currently used in cifs' posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the cifs specific posix acl xattr handler and associated code will be removed and the code duplication will go away. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
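For illustration, the dentry-based set acl inode operation implemented here has the following general shape. This is only a hedged sketch; the myfs_* names and store helpers are hypothetical and not code from this series:

	/* Sketch of a dentry-based ->set_acl() hook; "myfs" is made up. */
	static int myfs_set_acl(struct user_namespace *mnt_userns,
				struct dentry *dentry, struct posix_acl *acl,
				int type)
	{
		struct inode *inode = d_inode(dentry);

		/* Symlinks never carry POSIX ACLs. */
		if (S_ISLNK(inode->i_mode))
			return -EOPNOTSUPP;

		/* A NULL @acl conventionally means "remove the ACL of this @type". */
		if (!acl)
			return myfs_store_remove_acl(dentry, type);

		/* Convert @acl and write it to the backing store. */
		return myfs_store_write_acl(dentry, type, acl);
	}

	static const struct inode_operations myfs_file_inode_ops = {
		.set_acl = myfs_set_acl,
	};
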
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/cifs/cifsacl.c | 72 +++++++++++++++++++++++ fs/cifs/cifsfs.c | 2 + fs/cifs/cifsproto.h | 6 ++ fs/cifs/cifssmb.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 240 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index a6730e0eb57b..6a9f03c882dc 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include "cifspdu.h" #include "cifsglob.h" @@ -1735,3 +1737,73 @@ out: return ERR_PTR(-EOPNOTSUPP); #endif } + +int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ +#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) + int rc = -EOPNOTSUPP; + unsigned int xid; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + const char *full_path; + void *page; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + page = alloc_dentry_path(); + + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto out; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + if (posix_acl_xattr_size(acl->a_count) > CIFSMaxBufSize) { + cifs_dbg(FYI, "size of EA value too large\n"); + rc = -EOPNOTSUPP; + goto out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + if (!acl) + goto out; + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_set_acl(xid, pTcon, full_path, acl, + ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + + case ACL_TYPE_DEFAULT: + if (!acl) + goto out; + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_set_acl(xid, pTcon, full_path, acl, + ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + } + +out: + free_dentry_path(page); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +#else + return -EOPNOTSUPP; +#endif +} diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 05d0f84ede95..9d8f077a23c4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1134,6 +1134,7 @@ const struct inode_operations cifs_dir_inode_ops = { .mknod = cifs_mknod, .listxattr = cifs_listxattr, .get_acl = cifs_get_acl, + .set_acl = cifs_set_acl, }; const struct inode_operations cifs_file_inode_ops = { @@ -1143,6 +1144,7 @@ const struct inode_operations cifs_file_inode_ops = { .listxattr = cifs_listxattr, .fiemap = cifs_fiemap, .get_acl = cifs_get_acl, + .set_acl = cifs_set_acl, }; const struct inode_operations cifs_symlink_inode_ops = { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 28e848495594..ce7f08a387b5 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -226,6 +226,8 @@ extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type); +extern int cifs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct posix_acl *acl, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); extern unsigned int setup_authusers_ACE(struct 
cifs_ace *pace); @@ -551,6 +553,10 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *fileName, const char *local_acl, const int buflen, const int acl_type, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const struct posix_acl *acl, const int acl_type, + const struct nls_table *nls_codepage, int remap); extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); #endif /* CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d30535406885..f119ca917947 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3308,6 +3308,83 @@ static int cifs_to_posix_acl(struct posix_acl **acl, char *src, return 0; } +/** + * cifs_init_ace - convert ACL entry from POSIX ACL to cifs format + * @cifs_ace: the cifs ACL entry to store into + * @local_ace: the POSIX ACL entry to convert + */ +static void cifs_init_ace(struct cifs_posix_ace *cifs_ace, + const struct posix_acl_entry *local_ace) +{ + cifs_ace->cifs_e_perm = local_ace->e_perm; + cifs_ace->cifs_e_tag = local_ace->e_tag; + + switch (local_ace->e_tag) { + case ACL_USER: + cifs_ace->cifs_uid = + cpu_to_le64(from_kuid(&init_user_ns, local_ace->e_uid)); + break; + case ACL_GROUP: + cifs_ace->cifs_uid = + cpu_to_le64(from_kgid(&init_user_ns, local_ace->e_gid)); + break; + default: + cifs_ace->cifs_uid = cpu_to_le64(-1); + } +} + +/** + * posix_acl_to_cifs - convert ACLs from POSIX ACL to cifs format + * @parm_data: ACLs in cifs format to conver to + * @acl: ACLs in POSIX ACL format to convert from + * @acl_type: the type of POSIX ACLs stored in @acl + * + * Return: the number cifs ACL entries after conversion + */ +static __u16 posix_acl_to_cifs(char *parm_data, const struct posix_acl *acl, + const int acl_type) +{ + __u16 rc = 0; + struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; + const struct posix_acl_entry *pa, *pe; + int count; + int i = 0; + + if ((acl == NULL) || (cifs_acl == NULL)) + return 0; + + count = acl->a_count; + cifs_dbg(FYI, "setting acl with %d entries\n", count); + + /* + * Note that the uapi POSIX ACL version is verified by the VFS and is + * independent of the cifs ACL version. Changing the POSIX ACL version + * is a uapi change and if it's changed we will pass down the POSIX ACL + * version in struct posix_acl from the VFS. For now there's really + * only one that all filesystems know how to deal with. 
+ */ + cifs_acl->version = cpu_to_le16(1); + if (acl_type == ACL_TYPE_ACCESS) { + cifs_acl->access_entry_count = cpu_to_le16(count); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); + } else if (acl_type == ACL_TYPE_DEFAULT) { + cifs_acl->default_entry_count = cpu_to_le16(count); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); + } else { + cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); + return 0; + } + FOREACH_ACL_ENTRY(pa, acl, pe) { + cifs_init_ace(&cifs_acl->ace_array[i++], pa); + } + if (rc == 0) { + rc = (__u16)(count * sizeof(struct cifs_posix_ace)); + rc += sizeof(struct cifs_posix_acl); + /* BB add check to make sure ACL does not overflow SMB */ + } + return rc; +} + int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, struct posix_acl **acl, const int acl_type, const struct nls_table *nls_codepage, @@ -3398,6 +3475,81 @@ queryAclRetry: goto queryAclRetry; return rc; } + +int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, const struct posix_acl *acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + char *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count, data_count, param_offset, offset; + + cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName); +setAclRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { + name_len = copy_path_name(pSMB->FileName, fileName); + } + params = 6 + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB size from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + parm_data = ((char *) &pSMB->hdr.Protocol) + offset; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + + /* convert to on the wire format for POSIX ACL */ + data_count = posix_acl_to_cifs(parm_data, acl, acl_type); + + if (data_count == 0) { + rc = -EOPNOTSUPP; + goto setACLerrorExit; + } + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_ACL); + byte_count = 3 /* pad */ + params + data_count; + pSMB->DataCount = cpu_to_le16(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc); + +setACLerrorExit: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setAclRetry; + return rc; +} #else int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, struct posix_acl **acl, @@ -3406,6 +3558,14 @@ int cifs_do_get_acl(const 
unsigned int xid, struct cifs_tcon *tcon, { return -EOPNOTSUPP; } + +int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, const struct posix_acl *acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FS_POSIX_ACL */ int -- cgit From 6cd4d4e8b6e1495eb0cafa3a59d1fde137a98d22 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:04 +0200 Subject: 9p: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fix up posix acls before storing or reporting them to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. So far 9p implemented a ->get_inode_acl() operation that didn't require access to the dentry in order to allow (limited) permission checking via posix acls in the vfs. Now that we have get and set acl inode operations that take a dentry argument we can give 9p get and set acl inode operations. This is mostly a refactoring of the codepaths currently used in 9p's posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the 9p specific posix acl xattr handler and associated code will be removed. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
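To make the split concrete: after this patch a filesystem wires up both read hooks side by side, the inode-based one for in-kernel permission checking and the dentry-based one for the new posix acl api. A minimal sketch with hypothetical myfs_* names; the signatures match the ones used in the diff below:

	/* Inode-based hook; may be called in RCU walk mode (@rcu). */
	struct posix_acl *myfs_iop_get_inode_acl(struct inode *inode,
						 int type, bool rcu);

	/* Dentry-based hook for the dedicated posix acl api. */
	struct posix_acl *myfs_iop_get_acl(struct user_namespace *mnt_userns,
					   struct dentry *dentry, int type);

	static const struct inode_operations myfs_file_inode_ops = {
		.get_inode_acl = myfs_iop_get_inode_acl,
		.get_acl       = myfs_iop_get_acl,
	};
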
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/9p/acl.c | 80 ++++++++++++++++++++++++++++++++++++++------------ fs/9p/acl.h | 5 +++- fs/9p/vfs_inode_dotl.c | 6 ++-- 3 files changed, 69 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 4dac4a0dc5f4..67f8b57c67e0 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -17,34 +17,64 @@ #include "v9fs_vfs.h" #include "fid.h" -static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) +static struct posix_acl *v9fs_fid_get_acl(struct p9_fid *fid, const char *name) { ssize_t size; void *value = NULL; struct posix_acl *acl = NULL; size = v9fs_fid_xattr_get(fid, name, NULL, 0); - if (size > 0) { - value = kzalloc(size, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - size = v9fs_fid_xattr_get(fid, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - goto err_out; - } - } else if (size == -ENODATA || size == 0 || - size == -ENOSYS || size == -EOPNOTSUPP) { - acl = NULL; - } else - acl = ERR_PTR(-EIO); - -err_out: + if (size < 0) + return ERR_PTR(size); + if (size == 0) + return ERR_PTR(-ENODATA); + + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + + size = v9fs_fid_xattr_get(fid, name, value, size); + if (size < 0) + acl = ERR_PTR(size); + else if (size == 0) + acl = ERR_PTR(-ENODATA); + else + acl = posix_acl_from_xattr(&init_user_ns, value, size); kfree(value); return acl; } +static struct posix_acl *v9fs_acl_get(struct dentry *dentry, const char *name) +{ + struct p9_fid *fid; + struct posix_acl *acl = NULL; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return ERR_CAST(fid); + + acl = v9fs_fid_get_acl(fid, name); + p9_fid_put(fid); + return acl; +} + +static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, const char *name) +{ + int retval; + struct posix_acl *acl = NULL; + + acl = v9fs_fid_get_acl(fid, name); + if (!IS_ERR(acl)) + return acl; + + retval = PTR_ERR(acl); + if (retval == -ENODATA || retval == -ENOSYS || retval == -EOPNOTSUPP) + return NULL; + + /* map everything else to -EIO */ + return ERR_PTR(-EIO); +} + int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { int retval = 0; @@ -89,7 +119,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) +struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu) { struct v9fs_session_info *v9ses; @@ -109,6 +139,18 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) } +struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + struct v9fs_session_info *v9ses; + + v9ses = v9fs_dentry2v9ses(dentry); + /* We allow set/get/list of acl when access=client is not specified. 
*/ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_acl_get(dentry, posix_acl_xattr_name(type)); + return v9fs_get_cached_acl(d_inode(dentry), type); +} + static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) { int retval; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index ce5175d463dd..359dab4da900 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -8,8 +8,10 @@ #ifdef CONFIG_9P_FS_POSIX_ACL int v9fs_get_acl(struct inode *inode, struct p9_fid *fid); -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, +struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, struct posix_acl *dacl, struct posix_acl *acl); @@ -17,6 +19,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); #else +#define v9fs_iop_get_inode_acl NULL #define v9fs_iop_get_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0d1a7f2c579d..a4211fcb9168 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -983,14 +983,16 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, - .get_inode_acl = v9fs_iop_get_acl, + .get_inode_acl = v9fs_iop_get_inode_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, - .get_inode_acl = v9fs_iop_get_acl, + .get_inode_acl = v9fs_iop_get_inode_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { -- cgit From 079da629383ea960dfa0615e18d51d8bd121bd0c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:05 +0200 Subject: 9p: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far 9p implemented a ->get_inode_acl() operation that didn't require access to the dentry in order to allow (limited) permission checking via posix acls in the vfs. Now that we have get and set acl inode operations that take a dentry argument we can give 9p get and set acl inode operations. This is mostly a light refactoring of the codepaths currently used in 9p posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the 9p specific posix acl xattr handler and associated code will be removed. 
Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/9p/acl.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/9p/acl.h | 3 ++ fs/9p/vfs_inode_dotl.c | 2 ++ 3 files changed, 99 insertions(+) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 67f8b57c67e0..135b26cee63a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -151,6 +151,100 @@ struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, return v9fs_get_cached_acl(d_inode(dentry), type); } +int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + int retval; + size_t size = 0; + void *value = NULL; + const char *acl_name; + struct v9fs_session_info *v9ses; + struct inode *inode = d_inode(dentry); + + if (acl) { + retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); + if (retval) + goto err_out; + + size = posix_acl_xattr_size(acl->a_count); + + value = kzalloc(size, GFP_NOFS); + if (!value) { + retval = -ENOMEM; + goto err_out; + } + + retval = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (retval < 0) + goto err_out; + } + + /* + * set the attribute on the remote. Without even looking at the + * xattr value. We leave it to the server to validate + */ + acl_name = posix_acl_xattr_name(type); + v9ses = v9fs_dentry2v9ses(dentry); + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); + goto err_out; + } + + if (S_ISLNK(inode->i_mode)) { + retval = -EOPNOTSUPP; + goto err_out; + } + + if (!inode_owner_or_capable(&init_user_ns, inode)) { + retval = -EPERM; + goto err_out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + if (acl) { + struct iattr iattr = {}; + struct posix_acl *acl_mode = acl; + + retval = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, + &acl_mode); + if (retval) + goto err_out; + if (!acl_mode) { + /* + * ACL can be represented by the mode bits. + * So don't update ACL below. + */ + kfree(value); + value = NULL; + size = 0; + } + iattr.ia_valid = ATTR_MODE; + /* + * FIXME should we update ctime ? + * What is the following setxattr update the mode ? + */ + v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + retval = acl ? 
-EINVAL : 0; + goto err_out; + } + break; + } + + retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); + if (!retval) + set_cached_acl(inode, type, acl); + +err_out: + kfree(value); + return retval; +} + static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) { int retval; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 359dab4da900..4c60a2bce5de 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -12,6 +12,8 @@ struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type); +int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, struct posix_acl *dacl, struct posix_acl *acl); @@ -21,6 +23,7 @@ void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); #else #define v9fs_iop_get_inode_acl NULL #define v9fs_iop_get_acl NULL +#define v9fs_iop_set_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { return 0; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index a4211fcb9168..03c1743c4aff 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -985,6 +985,7 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .listxattr = v9fs_listxattr, .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, + .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { @@ -993,6 +994,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { .listxattr = v9fs_listxattr, .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, + .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { -- cgit From 56851bc9b9f072dd738f25ed29c0d5abe9f2908b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 29 Sep 2022 10:47:36 +0200 Subject: internal: add may_write_xattr() Split out the generic checks whether an inode allows writing xattrs. Since security.* and system.* xattrs don't have any restrictions and we're going to split out posix acls into a dedicated api we will use this helper to check whether we can write posix acls. Signed-off-by: Christian Brauner (Microsoft) --- fs/internal.h | 1 + fs/xattr.c | 43 ++++++++++++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index 6f0386b34fae..aa5f240496d9 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -232,5 +232,6 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); +int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..31b5ac65ca34 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -80,6 +80,31 @@ xattr_resolve_name(struct inode *inode, const char **name) return ERR_PTR(-EOPNOTSUPP); } +/** + * may_write_xattr - check whether inode allows writing xattr + * @mnt_userns: User namespace of the mount the inode was found from + * @inode: the inode on which to set an xattr + * + * Check whether the inode allows writing xattrs. 
Specifically, we can never + * set or remove an extended attribute on a read-only filesystem or on an + * immutable / append-only inode. + * + * We also need to ensure that the inode has a mapping in the mount to + * not risk writing back invalid i_{g,u}id values. + * + * Return: On success zero is returned. On error a negative errno is returned. + */ +int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) +{ + if (IS_IMMUTABLE(inode)) + return -EPERM; + if (IS_APPEND(inode)) + return -EPERM; + if (HAS_UNMAPPED_ID(mnt_userns, inode)) + return -EPERM; + return 0; +} + /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. @@ -88,20 +113,12 @@ static int xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, const char *name, int mask) { - /* - * We can never set or remove an extended attribute on a read-only - * filesystem or on an immutable / append-only inode. - */ if (mask & MAY_WRITE) { - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - /* - * Updating an xattr will likely cause i_uid and i_gid - * to be writen back improperly if their true value is - * unknown to the vfs. - */ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) - return -EPERM; + int ret; + + ret = may_write_xattr(mnt_userns, inode); + if (ret) + return ret; } /* -- cgit From e4cc9163032fed6ff27dd03325ddc54f88863a24 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:06 +0200 Subject: acl: add vfs_set_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_set_acl() api that can be used to set posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. 
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bbabe477cb2..970250506f07 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -25,6 +25,11 @@ #include #include #include +#include +#include +#include + +#include "internal.h" static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -1257,3 +1262,105 @@ int simple_acl_create(struct inode *dir, struct inode *inode) posix_acl_release(acl); return 0; } + +static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + struct posix_acl *acl) +{ + for (int n = 0; n < acl->a_count; n++) { + struct posix_acl_entry *acl_e = &acl->a_entries[n]; + + switch (acl_e->e_tag) { + case ACL_USER: + acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns, + VFSUIDT_INIT(acl_e->e_uid)); + break; + case ACL_GROUP: + acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns, + VFSGIDT_INIT(acl_e->e_gid)); + break; + } + } + + return 0; +} + +/** + * vfs_set_acl - set posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to set the posix acls + * @acl_name: the name of the posix acl + * @kacl: the posix acls in the appropriate VFS format + * + * This function sets @kacl. The caller must all posix_acl_release() on @kacl + * afterwards. + * + * Return: On success 0, on error negative errno. + */ +int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, struct posix_acl *kacl) +{ + int acl_type; + int error; + struct inode *inode = d_inode(dentry); + struct inode *delegated_inode = NULL; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return -EINVAL; + + if (kacl) { + /* + * If we're on an idmapped mount translate from mount specific + * vfs{g,u}id_t into global filesystem k{g,u}id_t. + * Afterwards we can cache the POSIX ACLs filesystem wide and - + * if this is a filesystem with a backing store - ultimately + * translate them to backing store values. + */ + error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl); + if (error) + return error; + } + +retry_deleg: + inode_lock(inode); + + /* + * We only care about restrictions the inode struct itself places upon + * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
+ */ + error = may_write_xattr(mnt_userns, inode); + if (error) + goto out_inode_unlock; + + error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + if (error) + goto out_inode_unlock; + + error = try_break_deleg(inode, &delegated_inode); + if (error) + goto out_inode_unlock; + + if (inode->i_opflags & IOP_XATTR) + error = set_posix_acl(mnt_userns, dentry, acl_type, kacl); + else if (unlikely(is_bad_inode(inode))) + error = -EIO; + else + error = -EOPNOTSUPP; + if (!error) { + fsnotify_xattr(dentry); + evm_inode_post_set_acl(dentry, acl_name, kacl); + } + +out_inode_unlock: + inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + + return error; +} +EXPORT_SYMBOL_GPL(vfs_set_acl); -- cgit From 4f353ba4a9f42ad283dc6afdd84dae0b1d294842 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:13 +0200 Subject: acl: add vfs_get_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_get_acl() api that can be used to get posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 122 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 970250506f07..af04e177f019 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -111,7 +111,9 @@ void forget_all_cached_acls(struct inode *inode) } EXPORT_SYMBOL(forget_all_cached_acls); -struct posix_acl *get_inode_acl(struct inode *inode, int type) +static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + int type) { void *sentinel; struct posix_acl **p; @@ -144,19 +146,21 @@ struct posix_acl *get_inode_acl(struct inode *inode, int type) cmpxchg(p, ACL_NOT_CACHED, sentinel); /* - * Normally, the ACL returned by ->get_inode_acl will be cached. + * Normally, the ACL returned by ->get{_inode}_acl will be cached. * A filesystem can prevent that by calling - * forget_cached_acl(inode, type) in ->get_inode_acl. + * forget_cached_acl(inode, type) in ->get{_inode}_acl. * - * If the filesystem doesn't have a get_inode_acl() function at all, + * If the filesystem doesn't have a get{_inode}_ acl() function at all, * we'll just create the negative cache entry. 
*/ - if (!inode->i_op->get_inode_acl) { + if (dentry && inode->i_op->get_acl) { + acl = inode->i_op->get_acl(mnt_userns, dentry, type); + } else if (inode->i_op->get_inode_acl) { + acl = inode->i_op->get_inode_acl(inode, type, false); + } else { set_cached_acl(inode, type, NULL); return NULL; } - acl = inode->i_op->get_inode_acl(inode, type, false); - if (IS_ERR(acl)) { /* * Remove our sentinel so that we don't block future attempts @@ -174,6 +178,11 @@ struct posix_acl *get_inode_acl(struct inode *inode, int type) posix_acl_release(acl); return acl; } + +struct posix_acl *get_inode_acl(struct inode *inode, int type) +{ + return __get_acl(&init_user_ns, NULL, inode, type); +} EXPORT_SYMBOL(get_inode_acl); /* @@ -1120,6 +1129,67 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, } EXPORT_SYMBOL (posix_acl_to_xattr); +/** + * vfs_posix_acl_to_xattr - convert from kernel to userspace representation + * @mnt_userns: user namespace of the mount + * @inode: inode the posix acls are set on + * @acl: the posix acls as represented by the vfs + * @buffer: the buffer into which to convert @acl + * @size: size of @buffer + * + * This converts @acl from the VFS representation in the filesystem idmapping + * to the uapi form reportable to userspace. Mount and caller idmappings + * are handled appropriately. + * + * Return: On success, the size of the stored uapi posix acls, on error a + * negative errno. + */ +ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, + struct inode *inode, const struct posix_acl *acl, + void *buffer, size_t size) + +{ + struct posix_acl_xattr_header *ext_acl = buffer; + struct posix_acl_xattr_entry *ext_entry; + struct user_namespace *fs_userns, *caller_userns; + ssize_t real_size, n; + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_entry = (void *)(ext_acl + 1); + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + fs_userns = i_user_ns(inode); + caller_userns = current_user_ns(); + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid); + ext_entry->e_id = cpu_to_le32(from_kuid( + caller_userns, vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid); + ext_entry->e_id = cpu_to_le32(from_kgid( + caller_userns, vfsgid_into_kgid(vfsgid))); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; +} + static int posix_acl_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, @@ -1364,3 +1434,48 @@ out_inode_unlock: return error; } EXPORT_SYMBOL_GPL(vfs_set_acl); + +/** + * vfs_get_acl - get posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to retrieve the posix acls + * @acl_name: the name of the posix acl + * + * This function retrieves @kacl from the filesystem. The caller must call + * posix_acl_release() on @kacl. + * + * Return: On success POSIX ACLs in VFS format, on error negative errno.
+ */ +struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name) +{ + struct inode *inode = d_inode(dentry); + struct posix_acl *acl; + int acl_type, error; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return ERR_PTR(-EINVAL); + + /* + * The VFS has no restrictions on reading POSIX ACLs so calling + * something like xattr_permission() isn't needed. Only LSMs get a say. + */ + error = security_inode_get_acl(mnt_userns, dentry, acl_name); + if (error) + return ERR_PTR(error); + + if (!IS_POSIXACL(inode)) + return ERR_PTR(-EOPNOTSUPP); + if (S_ISLNK(inode->i_mode)) + return ERR_PTR(-EOPNOTSUPP); + + acl = __get_acl(mnt_userns, dentry, inode, acl_type); + if (IS_ERR(acl)) + return acl; + if (!acl) + return ERR_PTR(-ENODATA); + + return acl; +} +EXPORT_SYMBOL_GPL(vfs_get_acl); -- cgit From aeb7f00542af48ac63e448de46d672cfd79a7069 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:14 +0200 Subject: acl: add vfs_remove_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_remove_acl() api that can be used to remove posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index af04e177f019..3a141082b24c 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1479,3 +1479,68 @@ struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, return acl; } EXPORT_SYMBOL_GPL(vfs_get_acl); + +/** + * vfs_remove_acl - remove posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to remove the posix acls + * @acl_name: the name of the posix acl + * + * This function removes posix acls. + * + * Return: On success 0, on error negative errno. + */ +int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name) +{ + int acl_type; + int error; + struct inode *inode = d_inode(dentry); + struct inode *delegated_inode = NULL; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return -EINVAL; + +retry_deleg: + inode_lock(inode); + + /* + * We only care about restrictions the inode struct itself places upon + * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
+ */ + error = may_write_xattr(mnt_userns, inode); + if (error) + goto out_inode_unlock; + + error = security_inode_remove_acl(mnt_userns, dentry, acl_name); + if (error) + goto out_inode_unlock; + + error = try_break_deleg(inode, &delegated_inode); + if (error) + goto out_inode_unlock; + + if (inode->i_opflags & IOP_XATTR) + error = set_posix_acl(mnt_userns, dentry, acl_type, NULL); + else if (unlikely(is_bad_inode(inode))) + error = -EIO; + else + error = -EOPNOTSUPP; + if (!error) { + fsnotify_xattr(dentry); + evm_inode_post_remove_acl(mnt_userns, dentry, acl_name); + } + +out_inode_unlock: + inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + + return error; +} +EXPORT_SYMBOL_GPL(vfs_remove_acl); -- cgit From b82784a2f52a7a2a5491d36f0c257cf64d87abb5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:16 +0200 Subject: ksmbd: use vfs_remove_acl() The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. Now that we've switched all filesystems that can serve as the lower filesystem for ksmbd we can switch ksmbd over to rely on the posix acl api. Note that this is orthogonal to switching the vfs itself over. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/ksmbd/vfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 93f65f01a4a6..7ccda91b0dc4 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1321,7 +1321,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + err = vfs_remove_acl(user_ns, dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); -- cgit From af84016f1cfe78b23c8efbf6ba5c3c660d52a9ee Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:17 +0200 Subject: ecryptfs: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. So far ecryptfs didn't implement get and set acl inode operations because it wanted easy access to the dentry.
Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let ecryptfs implement get and set acl inode operations. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/ecryptfs/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c214fe0981bd..b4be9f6bafd2 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include "ecryptfs_kernel.h" @@ -1120,6 +1122,13 @@ static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns, return rc; } +static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry), + posix_acl_xattr_name(type)); +} + const struct inode_operations ecryptfs_symlink_iops = { .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, @@ -1143,6 +1152,7 @@ const struct inode_operations ecryptfs_dir_iops = { .listxattr = ecryptfs_listxattr, .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, + .get_acl = ecryptfs_get_acl, }; const struct inode_operations ecryptfs_main_iops = { @@ -1152,6 +1162,7 @@ const struct inode_operations ecryptfs_main_iops = { .listxattr = ecryptfs_listxattr, .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, + .get_acl = ecryptfs_get_acl, }; static int ecryptfs_xattr_get(const struct xattr_handler *handler, -- cgit From 86c261b9eb4ce0a4ae189c0317cd8eddb8302d5a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:18 +0200 Subject: ecryptfs: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. So far ecryptfs didn't implement get and set acl inode operations because it wanted easy access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let ecryptfs implement get and set acl inode operations. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
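The shape of the operation this series keeps wiring into stacking filesystems is small and regular. The following is an illustrative sketch only, not ecryptfs code: the "stackfs_" names and the trivial lower-dentry lookup are hypothetical stand-ins, and the real ecryptfs diff follows after the tags below.

	#include <linux/fs.h>
	#include <linux/fs_stack.h>
	#include <linux/posix_acl.h>
	#include <linux/posix_acl_xattr.h>

	/* Hypothetical: a stacking fs keeping its lower dentry in d_fsdata. */
	static struct dentry *stackfs_lower_dentry(struct dentry *dentry)
	{
		return dentry->d_fsdata;
	}

	static int stackfs_set_acl(struct user_namespace *mnt_userns,
				   struct dentry *dentry, struct posix_acl *acl,
				   int type)
	{
		struct dentry *lower = stackfs_lower_dentry(dentry);
		int rc;

		/* posix_acl_xattr_name() maps ACL_TYPE_{ACCESS,DEFAULT} back to a name. */
		rc = vfs_set_acl(&init_user_ns, lower,
				 posix_acl_xattr_name(type), acl);
		if (!rc)
			/* Keep the stacked inode's attributes in sync with the lower one. */
			fsstack_copy_attr_all(d_inode(dentry), d_inode(lower));
		return rc;
	}

Note how the parsed struct posix_acl is handed straight through; no uapi buffer is ever materialized in the stacking layer.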
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/ecryptfs/inode.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index b4be9f6bafd2..5802b93b2cda 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1129,6 +1129,21 @@ static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, posix_acl_xattr_name(type)); } +static int ecryptfs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct posix_acl *acl, + int type) +{ + int rc; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_inode = d_inode(lower_dentry); + + rc = vfs_set_acl(&init_user_ns, lower_dentry, + posix_acl_xattr_name(type), acl); + if (!rc) + fsstack_copy_attr_all(d_inode(dentry), lower_inode); + return rc; +} + const struct inode_operations ecryptfs_symlink_iops = { .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, @@ -1153,6 +1168,7 @@ const struct inode_operations ecryptfs_dir_iops = { .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, .get_acl = ecryptfs_get_acl, + .set_acl = ecryptfs_set_acl, }; const struct inode_operations ecryptfs_main_iops = { @@ -1163,6 +1179,7 @@ const struct inode_operations ecryptfs_main_iops = { .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, .get_acl = ecryptfs_get_acl, + .set_acl = ecryptfs_set_acl, }; static int ecryptfs_xattr_get(const struct xattr_handler *handler, -- cgit From 6c0a8bfb84af8f3d1960e6905eace59d6488d270 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:19 +0200 Subject: ovl: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. Now that we have added get and set acl inode operations that allow easy access to the dentry we give overlayfs its own get and set acl inode operations. Since overlayfs is a stacking filesystem it will use the newly added posix acl api when retrieving posix acls from the relevant layer. Overlayfs can also be mounted on top of idmapped layers. If idmapped layers are used, overlayfs must take the layer's idmapping into account after retrieving the posix acls from the relevant layer. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
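The idmapped-layer translation can be summarized in one helper. The sketch below is illustrative only (clone_and_idmap_acl() is a hypothetical name): it condenses what ovl_idmap_posix_acl() and ovl_get_acl_path() in the diff that follows implement, namely clone first, then shift only the id-bearing entries through the layer's mount idmapping.

	static struct posix_acl *clone_and_idmap_acl(struct user_namespace *mnt_userns,
						     struct user_namespace *fs_userns,
						     struct posix_acl *acl)
	{
		/* Never modify cached ACLs in place; they are shared filesystem wide. */
		struct posix_acl *clone = posix_acl_clone(acl, GFP_KERNEL);

		if (!clone)
			return ERR_PTR(-ENOMEM);

		for (int n = 0; n < clone->a_count; n++) {
			struct posix_acl_entry *e = &clone->a_entries[n];

			/* Only ACL_USER/ACL_GROUP entries carry ids that need mapping. */
			switch (e->e_tag) {
			case ACL_USER:
				e->e_uid = vfsuid_into_kuid(
					make_vfsuid(mnt_userns, fs_userns, e->e_uid));
				break;
			case ACL_GROUP:
				e->e_gid = vfsgid_into_kgid(
					make_vfsgid(mnt_userns, fs_userns, e->e_gid));
				break;
			}
		}
		return clone;
	}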
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/dir.c | 3 +- fs/overlayfs/inode.c | 109 +++++++++++++++++++++++++++++++++-------------- fs/overlayfs/overlayfs.h | 17 +++++++- 3 files changed, 95 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 7bece7010c00..eb49d5d7b56f 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1311,7 +1311,8 @@ const struct inode_operations ovl_dir_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_inode_acl = ovl_get_acl, + .get_inode_acl = ovl_get_inode_acl, + .get_acl = ovl_get_acl, .update_time = ovl_update_time, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 6eefd8b7868e..74edbd1c1159 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "overlayfs.h" @@ -460,7 +462,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) * of the POSIX ACLs retrieved from the lower layer to this function to not * alter the POSIX ACLs for the underlying filesystem. */ -static void ovl_idmap_posix_acl(struct inode *realinode, +static void ovl_idmap_posix_acl(const struct inode *realinode, struct user_namespace *mnt_userns, struct posix_acl *acl) { @@ -484,6 +486,64 @@ static void ovl_idmap_posix_acl(struct inode *realinode, } } +/* + * The @noperm argument is used to skip permission checking and is a temporary + * measure. Quoting Miklos from an earlier discussion: + * + * > So there are two paths to getting an acl: + * > 1) permission checking and 2) retrieving the value via getxattr(2). + * > This is a similar situation as reading a symlink vs. following it. + * > When following a symlink overlayfs always reads the link on the + * > underlying fs just as if it was a readlink(2) call, calling + * > security_inode_readlink() instead of security_inode_follow_link(). + * > This is logical: we are reading the link from the underlying storage, + * > and following it on overlayfs. + * > + * > Applying the same logic to acl: we do need to call the + * > security_inode_getxattr() on the underlying fs, even if just want to + * > check permissions on overlay. This is currently not done, which is an + * > inconsistency. + * > + * > Maybe adding the check to ovl_get_acl() is the right way to go, but + * > I'm a little afraid of a performance regression. Will look into that. + * + * Until we have made a decision allow this helper to take the @noperm + * argument. We should hopefully be able to remove it soon. + */ +static struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm) +{ + struct posix_acl *real_acl, *clone; + struct user_namespace *mnt_userns; + struct inode *realinode = d_inode(path->dentry); + + mnt_userns = mnt_user_ns(path->mnt); + + if (noperm) + real_acl = get_inode_acl(realinode, posix_acl_type(acl_name)); + else + real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name); + if (IS_ERR_OR_NULL(real_acl)) + return real_acl; + + if (!is_idmapped_mnt(path->mnt)) + return real_acl; + + /* + * We cannot alter the ACLs returned from the relevant layer as that + * would alter the cached values filesystem wide for the lower + * filesystem. Instead we can clone the ACLs and then apply the + * relevant idmapping of the layer. 
+ */ + clone = posix_acl_clone(real_acl, GFP_KERNEL); + posix_acl_release(real_acl); /* release original acl */ + if (!clone) + return ERR_PTR(-ENOMEM); + + ovl_idmap_posix_acl(realinode, mnt_userns, clone); + return clone; +} + /* * When the relevant layer is an idmapped mount we need to take the idmapping * of the layer into account and translate any ACL_{GROUP,USER} values @@ -495,10 +555,12 @@ static void ovl_idmap_posix_acl(struct inode *realinode, * * This is obviously only relevant when idmapped layers are used. */ -struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) +struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, + bool rcu, bool noperm) { struct inode *realinode = ovl_inode_real(inode); - struct posix_acl *acl, *clone; + struct posix_acl *acl; struct path realpath; if (!IS_POSIXACL(realinode)) @@ -512,40 +574,23 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) } if (rcu) { + /* + * If the layer is idmapped drop out of RCU path walk + * so we can clone the ACLs. + */ + if (is_idmapped_mnt(realpath.mnt)) + return ERR_PTR(-ECHILD); + acl = get_cached_acl_rcu(realinode, type); } else { const struct cred *old_cred; old_cred = ovl_override_creds(inode->i_sb); - acl = get_inode_acl(realinode, type); + acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); revert_creds(old_cred); } - /* - * If there are no POSIX ACLs, or we encountered an error, - * or the layer isn't idmapped we don't need to do anything. - */ - if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) - return acl; - - /* - * We only get here if the layer is idmapped. So drop out of RCU path - * walk so we can clone the ACLs. There's no need to release the ACLs - * since get_cached_acl_rcu() doesn't take a reference on the ACLs. - */ - if (rcu) - return ERR_PTR(-ECHILD); - clone = posix_acl_clone(acl, GFP_KERNEL); - if (!clone) - clone = ERR_PTR(-ENOMEM); - else - ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone); - /* - * Since we're not in RCU path walk we always need to release the - * original ACLs. 
- */ - posix_acl_release(acl); - return clone; + return acl; } #endif @@ -721,7 +766,8 @@ static const struct inode_operations ovl_file_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_inode_acl = ovl_get_acl, + .get_inode_acl = ovl_get_inode_acl, + .get_acl = ovl_get_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, .fileattr_get = ovl_fileattr_get, @@ -741,7 +787,8 @@ static const struct inode_operations ovl_special_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_inode_acl = ovl_get_acl, + .get_inode_acl = ovl_get_inode_acl, + .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index eee8f08d32b6..d334977030c8 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -594,9 +594,22 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, + bool rcu, bool noperm); +static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type, + bool rcu) +{ + return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true); +} +static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); +} #else -#define ovl_get_acl NULL +#define ovl_get_inode_acl NULL +#define ovl_get_acl NULL #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); -- cgit From 0e641857322f4d0a99aec3f49b21b732f3bb5ef9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:20 +0200 Subject: ovl: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. Now that we have added get and set acl inode operations that allow easy access to the dentry we give overlayfs its own get and set acl inode operations. The set acl inode operation duplicates most of the ovl posix acl xattr handler. The main difference is that the set acl inode operation relies on the new posix acl api. Once the vfs has been switched over the custom posix acl xattr handler will be removed completely. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
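One user-visible rule the new set acl path preserves (see the ACL_TYPE_DEFAULT check in the diff below) is that default ACLs are only meaningful on directories. A rough userspace sketch of that behavior, using libacl (link with -lacl; the path argument is a placeholder, and the EACCES expectation reflects the generic vfs check, not anything overlayfs-specific):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/acl.h>

	int main(int argc, char *argv[])
	{
		/* Pass a regular file; default ACLs are only valid on directories. */
		const char *path = argc > 1 ? argv[1] : "testfile";
		acl_t acl = acl_from_text("u::rwx,g::r-x,o::r-x");

		if (!acl) {
			perror("acl_from_text");
			return 1;
		}
		if (acl_set_file(path, ACL_TYPE_DEFAULT, acl))
			/* Expected to fail with EACCES on non-directories. */
			printf("default acl on %s rejected: %s\n",
			       path, strerror(errno));
		acl_free(acl);
		return 0;
	}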
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/dir.c | 1 + fs/overlayfs/inode.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/overlayfs.h | 17 +++++++++ 3 files changed, 112 insertions(+) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index eb49d5d7b56f..0e817ebce92c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1313,6 +1313,7 @@ const struct inode_operations ovl_dir_inode_operations = { .listxattr = ovl_listxattr, .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 74edbd1c1159..304a6dbb852a 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -592,6 +592,98 @@ struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, return acl; } + +static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, + struct posix_acl *acl, int type) +{ + int err; + struct path realpath; + const char *acl_name; + const struct cred *old_cred; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *upperdentry = ovl_dentry_upper(dentry); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + + err = ovl_want_write(dentry); + if (err) + return err; + + /* + * If ACL is to be removed from a lower file, check if it exists in + * the first place before copying it up. + */ + acl_name = posix_acl_xattr_name(type); + if (!acl && !upperdentry) { + struct posix_acl *real_acl; + + ovl_path_lower(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, + acl_name); + revert_creds(old_cred); + posix_acl_release(real_acl); + if (IS_ERR(real_acl)) { + err = PTR_ERR(real_acl); + goto out_drop_write; + } + } + + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + realdentry = ovl_dentry_upper(dentry); + } + + old_cred = ovl_override_creds(dentry->d_sb); + if (acl) + err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); + else + err = ovl_do_remove_acl(ofs, realdentry, acl_name); + revert_creds(old_cred); + + /* copy c/mtime */ + ovl_copyattr(inode); + +out_drop_write: + ovl_drop_write(dentry); + return err; +} + +int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + int err; + struct inode *inode = d_inode(dentry); + struct dentry *workdir = ovl_workdir(dentry); + struct inode *realinode = ovl_inode_real(inode); + + if (!IS_POSIXACL(d_inode(workdir))) + return -EOPNOTSUPP; + if (!realinode->i_op->set_acl) + return -EOPNOTSUPP; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + if (!inode_owner_or_capable(&init_user_ns, inode)) + return -EPERM; + + /* + * Check if sgid bit needs to be cleared (actual setacl operation will + * be done with mounter's capabilities and so that won't do it for us). 
+ */ + if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS && + !in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { + struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; + + err = ovl_setattr(&init_user_ns, dentry, &iattr); + if (err) + return err; + } + + return ovl_set_or_remove_acl(dentry, inode, acl, type); +} #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) @@ -768,6 +860,7 @@ static const struct inode_operations ovl_file_inode_operations = { .listxattr = ovl_listxattr, .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, .fileattr_get = ovl_fileattr_get, @@ -789,6 +882,7 @@ static const struct inode_operations ovl_special_inode_operations = { .listxattr = ovl_listxattr, .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d334977030c8..ab5061c9aa2a 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include "ovl_entry.h" #undef pr_fmt @@ -278,6 +280,18 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox)); } +static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry, + const char *acl_name, struct posix_acl *acl) +{ + return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl); +} + +static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, + const char *acl_name) +{ + return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name); +} + static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, struct dentry *olddentry, struct inode *newdir, struct dentry *newdentry, unsigned int flags) @@ -607,9 +621,12 @@ static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, { return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); } +int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); #else #define ovl_get_inode_acl NULL #define ovl_get_acl NULL +#define ovl_set_acl NULL #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); -- cgit From 31acceb97500dd6e9105526301d76488cd6ca21c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:21 +0200 Subject: ovl: use posix acl api Now that posix acls have a proper api, use it to copy them. All filesystems that can serve as lower or upper layers for overlayfs have gained support for the new posix acl api in previous patches. So switch all internal overlayfs codepaths for copying posix acls to the new posix acl api.
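In condensed form, the copy-up side of this switch follows a get/clone/set/release pattern. The sketch below uses a hypothetical helper name and trims the ENODATA/EOPNOTSUPP filtering; it mirrors the ovl_copy_acl() added in the copy-up diff that follows:

	static int copy_one_acl(struct ovl_fs *ofs, const struct path *lowerpath,
				struct dentry *upper, const char *acl_name)
	{
		struct posix_acl *acl, *clone;
		int err;

		/* Fetch the lower ACLs through the typed API (may be NULL). */
		acl = ovl_get_acl_path(lowerpath, acl_name, false);
		if (IS_ERR_OR_NULL(acl))
			return PTR_ERR_OR_ZERO(acl);

		/* Clone so the lower filesystem's cached ACLs are never mutated. */
		clone = posix_acl_clone(acl, GFP_KERNEL);
		posix_acl_release(acl);
		if (!clone)
			return -ENOMEM;

		err = ovl_do_set_acl(ofs, upper, acl_name, clone);
		posix_acl_release(clone);
		return err;
	}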
Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/copy_up.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/overlayfs/dir.c | 20 ++------------------ fs/overlayfs/inode.c | 4 ++-- fs/overlayfs/overlayfs.h | 8 ++++++++ fs/overlayfs/super.c | 6 ++---- fs/xattr.c | 6 ------ 6 files changed, 52 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index f436d8847f08..6e4e65ee050d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -44,6 +44,35 @@ static bool ovl_must_copy_xattr(const char *name) !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } +static int ovl_copy_acl(struct ovl_fs *ofs, const struct path *path, + struct dentry *dentry, const char *acl_name) +{ + int err; + struct posix_acl *clone, *real_acl = NULL; + + real_acl = ovl_get_acl_path(path, acl_name, false); + if (!real_acl) + return 0; + + if (IS_ERR(real_acl)) { + err = PTR_ERR(real_acl); + if (err == -ENODATA || err == -EOPNOTSUPP) + return 0; + return err; + } + + clone = posix_acl_clone(real_acl, GFP_KERNEL); + posix_acl_release(real_acl); /* release original acl */ + if (!clone) + return -ENOMEM; + + err = ovl_do_set_acl(ofs, dentry, acl_name, clone); + + /* release cloned acl */ + posix_acl_release(clone); + return err; +} + int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new) { struct dentry *old = oldpath->dentry; @@ -93,6 +122,15 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de error = 0; continue; /* Discard */ } + + if (is_posix_acl_xattr(name)) { + error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name); + if (!error) + continue; + /* POSIX ACLs must be copied. */ + break; + } + retry: size = ovl_do_getxattr(oldpath, name, value, value_size); if (size == -ERANGE) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 0e817ebce92c..cbb569d5d234 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -435,28 +435,12 @@ out: } static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry, - const char *name, const struct posix_acl *acl) + const char *acl_name, struct posix_acl *acl) { - void *buffer; - size_t size; - int err; - if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) return 0; - size = posix_acl_xattr_size(acl->a_count); - buffer = kmalloc(size, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - if (err < 0) - goto out_free; - - err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE); -out_free: - kfree(buffer); - return err; + return ovl_do_set_acl(ofs, upperdentry, acl_name, acl); } static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 304a6dbb852a..77a77fd7a77b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -510,8 +510,8 @@ static void ovl_idmap_posix_acl(const struct inode *realinode, * Until we have made a decision allow this helper to take the @noperm * argument. We should hopefully be able to remove it soon. 
*/ -static struct posix_acl *ovl_get_acl_path(const struct path *path, - const char *acl_name, bool noperm) +struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm) { struct posix_acl *real_acl, *clone; struct user_namespace *mnt_userns; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ab5061c9aa2a..480e6aabef27 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -623,10 +623,18 @@ static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, } int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); +struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm); #else #define ovl_get_inode_acl NULL #define ovl_get_acl NULL #define ovl_set_acl NULL +static inline struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, + bool noperm) +{ + return NULL; +} #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a29a8afe9b26..5c1b7971a9b3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -813,13 +813,11 @@ retry: * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. */ - err = ovl_do_removexattr(ofs, work, - XATTR_NAME_POSIX_ACL_DEFAULT); + err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; - err = ovl_do_removexattr(ofs, work, - XATTR_NAME_POSIX_ACL_ACCESS); + err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; diff --git a/fs/xattr.c b/fs/xattr.c index 31b5ac65ca34..9ed9eea4d1b9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -299,12 +299,6 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); -static inline bool is_posix_acl_xattr(const char *name) -{ - return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); -} - int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) -- cgit From 318e66856ddec05384f32d60b5598128289f4e7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:22 +0200 Subject: xattr: use posix acl api In previous patches we built a new posix api solely around get and set inode operations. Now that we have all the pieces in place we can switch the system calls and the vfs over to only rely on this api when interacting with posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. With the new posix acl api we immediately translate into an appropriate kernel internal struct posix_acl format both when getting and setting posix acls. This is a stark contrast to before, where we hacked unsafe raw values into the uapi struct stored in a void pointer, relying on filesystems and security modules to hack around in the uapi struct as well.
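For userspace nothing changes at the system call boundary: the wire format remains the uapi posix acl xattr layout that the do_get_acl()/do_set_acl() helpers below translate to and from. As a rough, runnable illustration of that format (the struct layout mirrors the uapi posix_acl_xattr header as I understand it; the le*toh() helpers are glibc's, and the path argument is a placeholder):

	#include <endian.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/xattr.h>

	/* Matches the uapi wire layout; all fields are little-endian on the wire. */
	struct pacl_entry {
		uint16_t e_tag;
		uint16_t e_perm;
		uint32_t e_id;
	};

	int main(int argc, char *argv[])
	{
		const char *path = argc > 1 ? argv[1] : ".";
		uint32_t buf[256]; /* keeps the header word naturally aligned */
		ssize_t len = getxattr(path, "system.posix_acl_access",
				       buf, sizeof(buf));

		if (len < (ssize_t)sizeof(uint32_t)) {
			perror("getxattr");
			return 1;
		}

		/* Header is a single version word; POSIX_ACL_XATTR_VERSION == 2. */
		struct pacl_entry *e = (struct pacl_entry *)&buf[1];
		size_t count = (len - sizeof(uint32_t)) / sizeof(*e);

		printf("version %u, %zu entries\n",
		       (unsigned)le32toh(buf[0]), count);
		for (size_t i = 0; i < count; i++)
			printf("  tag %#x perm %#o id %u\n",
			       (unsigned)le16toh(e[i].e_tag),
			       (unsigned)le16toh(e[i].e_perm),
			       (unsigned)le32toh(e[i].e_id));
		return 0;
	}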
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/internal.h | 20 ++++++++++++++++++++ fs/posix_acl.c | 37 +++++++++++++++++++++++++++++++++++++ fs/xattr.c | 31 ++++++++++++++++++++----------- 3 files changed, 77 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index aa5f240496d9..e377eb7bbe7f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -234,4 +234,24 @@ int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); +#ifdef CONFIG_FS_POSIX_ACL +int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, const void *kvalue, size_t size); +ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, void *kvalue, size_t size); +#else +static inline int do_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + const void *kvalue, size_t size) +{ + return -EOPNOTSUPP; +} +static inline ssize_t do_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + void *kvalue, size_t size) +{ + return -EOPNOTSUPP; +} +#endif + ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 3a141082b24c..14b87653b797 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1544,3 +1544,40 @@ out_inode_unlock: return error; } EXPORT_SYMBOL_GPL(vfs_remove_acl); + +int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, const void *kvalue, size_t size) +{ + int error; + struct posix_acl *acl = NULL; + + if (size) { + /* + * Note that posix_acl_from_xattr() uses GFP_NOFS when it + * probably doesn't need to here. 
+ */ + acl = posix_acl_from_xattr(current_user_ns(), kvalue, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + error = vfs_set_acl(mnt_userns, dentry, acl_name, acl); + posix_acl_release(acl); + return error; +} + +ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, void *kvalue, size_t size) +{ + ssize_t error; + struct posix_acl *acl; + + acl = vfs_get_acl(mnt_userns, dentry, acl_name); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + error = vfs_posix_acl_to_xattr(mnt_userns, d_inode(dentry), + acl, kvalue, size); + posix_acl_release(acl); + return error; +} diff --git a/fs/xattr.c b/fs/xattr.c index 9ed9eea4d1b9..77db5aa26d13 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -189,6 +189,9 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, { const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -410,6 +413,9 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, { const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -482,6 +488,9 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -591,17 +600,13 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) return error; } -static void setxattr_convert(struct user_namespace *mnt_userns, - struct dentry *d, struct xattr_ctx *ctx) -{ - if (ctx->size && is_posix_acl_xattr(ctx->kname->name)) - posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); -} - int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx) { - setxattr_convert(mnt_userns, dentry, ctx); + if (is_posix_acl_xattr(ctx->kname->name)) + return do_set_acl(mnt_userns, dentry, ctx->kname->name, + ctx->kvalue, ctx->size); + return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } @@ -708,10 +713,11 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, return -ENOMEM; } - error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); + if (is_posix_acl_xattr(ctx->kname->name)) + error = do_get_acl(mnt_userns, d, kname, ctx->kvalue, ctx->size); + else + error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); if (error > 0) { - if (is_posix_acl_xattr(kname)) - posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { @@ -886,6 +892,9 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d, if (error < 0) return error; + if (is_posix_acl_xattr(kname)) + return vfs_remove_acl(mnt_userns, d, kname); + return vfs_removexattr(mnt_userns, d, kname); } -- cgit From 04af28faae6760514f0302aa9db561e60ee53a20 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:23 +0200 Subject: ecryptfs: use stub posix acl handlers Now that ecryptfs supports the get and set acl inode operations and the vfs has been switched to the new posix api, ecryptfs can simply rely on the stub posix acl handlers.
Signed-off-by: Christian Brauner (Microsoft) --- fs/ecryptfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5802b93b2cda..f3cd00fac9c3 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1210,6 +1210,10 @@ static const struct xattr_handler ecryptfs_xattr_handler = { }; const struct xattr_handler *ecryptfs_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif &ecryptfs_xattr_handler, NULL }; -- cgit From 200afb77cde7e51364d4a8b3e176f4797a3b5ec7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:24 +0200 Subject: ovl: use stub posix acl handlers Now that ovl supports the get and set acl inode operations and the vfs has been switched to the new posix api, ovl can simply rely on the stub posix acl handlers. The custom xattr handlers and associated unused helpers can be removed. Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/super.c | 101 ++------------------------------------------------- 1 file changed, 4 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5c1b7971a9b3..2addafe4e14a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -999,83 +999,6 @@ static unsigned int ovl_split_lowerdirs(char *str) return ctr; } -static int __maybe_unused -ovl_posix_acl_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return ovl_xattr_get(dentry, inode, handler->name, buffer, size); -} - -static int __maybe_unused -ovl_posix_acl_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct dentry *workdir = ovl_workdir(dentry); - struct inode *realinode = ovl_inode_real(inode); - struct posix_acl *acl = NULL; - int err; - - /* Check that everything is OK before copy-up */ - if (value) { - /* The above comment can be understood in two ways: - * - * 1. We just want to check whether the basic POSIX ACL format - * is ok. For example, if the header is correct and the size - * is sane. - * 2. We want to know whether the ACL_{GROUP,USER} entries can - * be mapped according to the underlying filesystem. - * - * Currently, we only check 1. If we wanted to check 2. we - * would need to pass the mnt_userns and the fs_userns of the - * underlying filesystem. But frankly, I think checking 1. is - * enough to start the copy-up. - */ - acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - err = -EOPNOTSUPP; - if (!IS_POSIXACL(d_inode(workdir))) - goto out_acl_release; - if (!realinode->i_op->set_acl) - goto out_acl_release; - if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) { - err = acl ? -EACCES : 0; - goto out_acl_release; - } - err = -EPERM; - if (!inode_owner_or_capable(&init_user_ns, inode)) - goto out_acl_release; - - posix_acl_release(acl); - - /* - * Check if sgid bit needs to be cleared (actual setacl operation will - * be done with mounter's capabilities and so that won't do it for us).
*/ - if (unlikely(inode->i_mode & S_ISGID) && - handler->flags == ACL_TYPE_ACCESS && - !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { - struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - - err = ovl_setattr(&init_user_ns, dentry, &iattr); - if (err) - return err; - } - - err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); - return err; - -out_acl_release: - posix_acl_release(acl); - return err; -} - static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) @@ -1108,22 +1031,6 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler, return ovl_xattr_set(dentry, inode, name, value, size, flags); } -static const struct xattr_handler __maybe_unused -ovl_posix_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = ovl_posix_acl_xattr_get, - .set = ovl_posix_acl_xattr_set, -}; - -static const struct xattr_handler __maybe_unused -ovl_posix_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = ovl_posix_acl_xattr_get, - .set = ovl_posix_acl_xattr_set, -}; - static const struct xattr_handler ovl_own_trusted_xattr_handler = { .prefix = OVL_XATTR_TRUSTED_PREFIX, .get = ovl_own_xattr_get, @@ -1144,8 +1051,8 @@ static const struct xattr_handler ovl_other_xattr_handler = { static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL - &ovl_posix_acl_access_xattr_handler, - &ovl_posix_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, @@ -1154,8 +1061,8 @@ static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { static const struct xattr_handler *ovl_user_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL - &ovl_posix_acl_access_xattr_handler, - &ovl_posix_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, -- cgit From c39c07fce78439dfb0665b294753716297389179 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:25 +0200 Subject: cifs: use stub posix acl handlers Now that cifs supports the get and set acl inode operations and the vfs has been switched to the new posix api, cifs can simply rely on the stub posix acl handlers. The custom xattr handlers and associated unused helpers can be removed.
Signed-off-by: Christian Brauner (Microsoft) --- fs/cifs/cifsproto.h | 8 -- fs/cifs/cifssmb.c | 298 ---------------------------------------------------- fs/cifs/xattr.c | 68 +----------- 3 files changed, 4 insertions(+), 370 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ce7f08a387b5..f50f96e4ec30 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -541,18 +541,10 @@ extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, struct cifs_ntsd *, __u32, int); -extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *acl_inf, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap_special_chars); extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, struct posix_acl **acl, const int acl_type, const struct nls_table *nls_codepage, int remap); -extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *fileName, - const char *local_acl, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap_special_chars); extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *fileName, const struct posix_acl *acl, const int acl_type, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f119ca917947..23f10e0d6e7e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2914,304 +2914,6 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, #ifdef CONFIG_CIFS_POSIX -/*Convert an Access Control Entry from wire format to local POSIX xattr format*/ -static void cifs_convert_ace(struct posix_acl_xattr_entry *ace, - struct cifs_posix_ace *cifs_ace) -{ - /* u8 cifs fields do not need le conversion */ - ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); - ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); - ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); -/* - cifs_dbg(FYI, "perm %d tag %d id %d\n", - ace->e_perm, ace->e_tag, ace->e_id); -*/ - - return; -} - -/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */ -static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, - const int acl_type, const int size_of_data_area) -{ - int size = 0; - int i; - __u16 count; - struct cifs_posix_ace *pACE; - struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; - struct posix_acl_xattr_header *local_acl = (void *)trgt; - - if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) - return -EOPNOTSUPP; - - if (acl_type == ACL_TYPE_ACCESS) { - count = le16_to_cpu(cifs_acl->access_entry_count); - pACE = &cifs_acl->ace_array[0]; - size = sizeof(struct cifs_posix_acl); - size += sizeof(struct cifs_posix_ace) * count; - /* check if we would go beyond end of SMB */ - if (size_of_data_area < size) { - cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. 
%d\n", - size_of_data_area, size); - return -EINVAL; - } - } else if (acl_type == ACL_TYPE_DEFAULT) { - count = le16_to_cpu(cifs_acl->access_entry_count); - size = sizeof(struct cifs_posix_acl); - size += sizeof(struct cifs_posix_ace) * count; -/* skip past access ACEs to get to default ACEs */ - pACE = &cifs_acl->ace_array[count]; - count = le16_to_cpu(cifs_acl->default_entry_count); - size += sizeof(struct cifs_posix_ace) * count; - /* check if we would go beyond end of SMB */ - if (size_of_data_area < size) - return -EINVAL; - } else { - /* illegal type */ - return -EINVAL; - } - - size = posix_acl_xattr_size(count); - if ((buflen == 0) || (local_acl == NULL)) { - /* used to query ACL EA size */ - } else if (size > buflen) { - return -ERANGE; - } else /* buffer big enough */ { - struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); - - local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - for (i = 0; i < count ; i++) { - cifs_convert_ace(&ace[i], pACE); - pACE++; - } - } - return size; -} - -static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, - const struct posix_acl_xattr_entry *local_ace) -{ - cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm); - cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag); - /* BB is there a better way to handle the large uid? */ - if (local_ace->e_id == cpu_to_le32(-1)) { - /* Probably no need to le convert -1 on any arch but can not hurt */ - cifs_ace->cifs_uid = cpu_to_le64(-1); - } else - cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); -/* - cifs_dbg(FYI, "perm %d tag %d id %d\n", - ace->e_perm, ace->e_tag, ace->e_id); -*/ -} - -/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */ -static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, - const int buflen, const int acl_type) -{ - __u16 rc = 0; - struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; - struct posix_acl_xattr_header *local_acl = (void *)pACL; - struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); - int count; - int i; - - if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL)) - return 0; - - count = posix_acl_xattr_count((size_t)buflen); - cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n", - count, buflen, le32_to_cpu(local_acl->a_version)); - if (le32_to_cpu(local_acl->a_version) != 2) { - cifs_dbg(FYI, "unknown POSIX ACL version %d\n", - le32_to_cpu(local_acl->a_version)); - return 0; - } - cifs_acl->version = cpu_to_le16(1); - if (acl_type == ACL_TYPE_ACCESS) { - cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); - } else if (acl_type == ACL_TYPE_DEFAULT) { - cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); - } else { - cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); - return 0; - } - for (i = 0; i < count; i++) - convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]); - if (rc == 0) { - rc = (__u16)(count * sizeof(struct cifs_posix_ace)); - rc += sizeof(struct cifs_posix_acl); - /* BB add check to make sure ACL does not overflow SMB */ - } - return rc; -} - -int -CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *acl_inf, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap) -{ -/* SMB_QUERY_POSIX_ACL */ - TRANSACTION2_QPI_REQ *pSMB = NULL; - TRANSACTION2_QPI_RSP *pSMBr = NULL; - int rc = 0; - int bytes_returned; - int name_len; - 
__u16 params, byte_count; - - cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName); - -queryAclRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - cifsConvertToUTF16((__le16 *) pSMB->FileName, - searchName, PATH_MAX, nls_codepage, - remap); - name_len++; /* trailing null */ - name_len *= 2; - pSMB->FileName[name_len] = 0; - pSMB->FileName[name_len+1] = 0; - } else { - name_len = copy_path_name(pSMB->FileName, searchName); - } - - params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; - pSMB->TotalDataCount = 0; - pSMB->MaxParameterCount = cpu_to_le16(2); - /* BB find exact max data count below from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le16(4000); - pSMB->MaxSetupCount = 0; - pSMB->Reserved = 0; - pSMB->Flags = 0; - pSMB->Timeout = 0; - pSMB->Reserved2 = 0; - pSMB->ParameterOffset = cpu_to_le16( - offsetof(struct smb_com_transaction2_qpi_req, - InformationLevel) - 4); - pSMB->DataCount = 0; - pSMB->DataOffset = 0; - pSMB->SetupCount = 1; - pSMB->Reserved3 = 0; - pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); - byte_count = params + 1 /* pad */ ; - pSMB->TotalParameterCount = cpu_to_le16(params); - pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL); - pSMB->Reserved4 = 0; - inc_rfc1001_len(pSMB, byte_count); - pSMB->ByteCount = cpu_to_le16(byte_count); - - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); - if (rc) { - cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc); - } else { - /* decode response */ - - rc = validate_t2((struct smb_t2_rsp *)pSMBr); - /* BB also check enough total bytes returned */ - if (rc || get_bcc(&pSMBr->hdr) < 2) - rc = -EIO; /* bad smb */ - else { - __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - __u16 count = le16_to_cpu(pSMBr->t2.DataCount); - rc = cifs_copy_posix_acl(acl_inf, - (char *)&pSMBr->hdr.Protocol+data_offset, - buflen, acl_type, count); - } - } - cifs_buf_release(pSMB); - if (rc == -EAGAIN) - goto queryAclRetry; - return rc; -} - -int -CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *fileName, - const char *local_acl, const int buflen, - const int acl_type, - const struct nls_table *nls_codepage, int remap) -{ - struct smb_com_transaction2_spi_req *pSMB = NULL; - struct smb_com_transaction2_spi_rsp *pSMBr = NULL; - char *parm_data; - int name_len; - int rc = 0; - int bytes_returned = 0; - __u16 params, byte_count, data_count, param_offset, offset; - - cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName); -setAclRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); - name_len++; /* trailing null */ - name_len *= 2; - } else { - name_len = copy_path_name(pSMB->FileName, fileName); - } - params = 6 + name_len; - pSMB->MaxParameterCount = cpu_to_le16(2); - /* BB find max SMB size from sess */ - pSMB->MaxDataCount = cpu_to_le16(1000); - pSMB->MaxSetupCount = 0; - pSMB->Reserved = 0; - pSMB->Flags = 0; - pSMB->Timeout = 0; - pSMB->Reserved2 = 0; - param_offset = offsetof(struct smb_com_transaction2_spi_req, - InformationLevel) - 4; - offset 
= param_offset + params; - parm_data = ((char *) &pSMB->hdr.Protocol) + offset; - pSMB->ParameterOffset = cpu_to_le16(param_offset); - - /* convert to on the wire format for POSIX ACL */ - data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type); - - if (data_count == 0) { - rc = -EOPNOTSUPP; - goto setACLerrorExit; - } - pSMB->DataOffset = cpu_to_le16(offset); - pSMB->SetupCount = 1; - pSMB->Reserved3 = 0; - pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); - pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_ACL); - byte_count = 3 /* pad */ + params + data_count; - pSMB->DataCount = cpu_to_le16(data_count); - pSMB->TotalDataCount = pSMB->DataCount; - pSMB->ParameterCount = cpu_to_le16(params); - pSMB->TotalParameterCount = pSMB->ParameterCount; - pSMB->Reserved4 = 0; - inc_rfc1001_len(pSMB, byte_count); - pSMB->ByteCount = cpu_to_le16(byte_count); - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) - cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc); - -setACLerrorExit: - cifs_buf_release(pSMB); - if (rc == -EAGAIN) - goto setAclRetry; - return rc; -} - #ifdef CONFIG_FS_POSIX_ACL /** * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 998fa51f9b68..5f2fb2fd2e37 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -200,32 +200,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } break; } - -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case XATTR_ACL_ACCESS: -#ifdef CONFIG_CIFS_POSIX - if (!value) - goto out; - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - value, (const int)size, - ACL_TYPE_ACCESS, cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; - - case XATTR_ACL_DEFAULT: -#ifdef CONFIG_CIFS_POSIX - if (!value) - goto out; - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - value, (const int)size, - ACL_TYPE_DEFAULT, cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; -#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } out: @@ -366,27 +340,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler, } break; } -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case XATTR_ACL_ACCESS: -#ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - value, size, ACL_TYPE_ACCESS, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; - - case XATTR_ACL_DEFAULT: -#ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - value, size, ACL_TYPE_DEFAULT, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; -#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } /* We could add an additional check for streams ie @@ -525,21 +478,6 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = { .set = cifs_xattr_set, }; - -static const struct xattr_handler cifs_posix_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = XATTR_ACL_ACCESS, - .get = cifs_xattr_get, - .set = cifs_xattr_set, -}; - -static const struct xattr_handler cifs_posix_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = XATTR_ACL_DEFAULT, - .get = cifs_xattr_get, - .set = cifs_xattr_set, -}; - const struct xattr_handler *cifs_xattr_handlers[] = { &cifs_user_xattr_handler, &cifs_os2_xattr_handler, @@ -549,7 
+487,9 @@ const struct xattr_handler *cifs_xattr_handlers[] = { &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */ &cifs_cifs_ntsd_full_xattr_handler, &smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */ - &cifs_posix_acl_access_xattr_handler, - &cifs_posix_acl_default_xattr_handler, +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif NULL }; -- cgit From 39a6497a9bbb760c3c26a1bef7a0ab0b9fdefd9f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:26 +0200 Subject: 9p: use stub posix acl handlers Now that 9p supports the get and set acl inode operations and the vfs has been switched to the new posix acl api, 9p can simply rely on the stub posix acl handlers. The custom xattr handlers and associated unused helpers can be removed. Signed-off-by: Christian Brauner (Microsoft) --- fs/9p/acl.c | 121 ---------------------------------------------------------- fs/9p/xattr.c | 7 ++-- fs/9p/xattr.h | 2 - 3 files changed, 4 insertions(+), 126 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 135b26cee63a..c397c51f80d9 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -343,124 +343,3 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, *modep = mode; return 0; } - -static int v9fs_xattr_get_acl(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - struct v9fs_session_info *v9ses; - struct posix_acl *acl; - int error; - - v9ses = v9fs_dentry2v9ses(dentry); - /* - * We allow set/get/list of acl when access=client is not specified - */ - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_get(dentry, handler->name, buffer, size); - - acl = v9fs_get_cached_acl(inode, handler->flags); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int v9fs_xattr_set_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - int retval; - struct posix_acl *acl; - struct v9fs_session_info *v9ses; - - v9ses = v9fs_dentry2v9ses(dentry); - /* - * set the attribute on the remote. Without even looking at the - * xattr value. We leave it to the server to validate - */ - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_set(dentry, handler->name, value, size, - flags); - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - if (value) { - /* update the cached acl value */ - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); - if (retval) - goto err_out; - } - } else - acl = NULL; - - switch (handler->flags) { - case ACL_TYPE_ACCESS: - if (acl) { - struct iattr iattr = { 0 }; - struct posix_acl *old_acl = acl; - - retval = posix_acl_update_mode(&init_user_ns, inode, - &iattr.ia_mode, &acl); - if (retval) - goto err_out; - if (!acl) { - /* - * ACL can be represented - * by the mode bits. So don't - * update ACL. - */ - posix_acl_release(old_acl); - value = NULL; - size = 0; - } - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ?
- * What is the following setxattr update the - * mode ? - */ - v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); - } - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - retval = acl ? -EINVAL : 0; - goto err_out; - } - break; - default: - BUG(); - } - retval = v9fs_xattr_set(dentry, handler->name, value, size, flags); - if (!retval) - set_cached_acl(inode, handler->flags, acl); -err_out: - posix_acl_release(acl); - return retval; -} - -const struct xattr_handler v9fs_xattr_acl_access_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = v9fs_xattr_get_acl, - .set = v9fs_xattr_set_acl, -}; - -const struct xattr_handler v9fs_xattr_acl_default_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = v9fs_xattr_get_acl, - .set = v9fs_xattr_set_acl, -}; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 1f9298a4bd42..ae6a93871338 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -182,9 +183,9 @@ static struct xattr_handler v9fs_xattr_security_handler = { const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, -#ifdef CONFIG_9P_FS_POSIX_ACL - &v9fs_xattr_acl_access_handler, - &v9fs_xattr_acl_default_handler, +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_9P_FS_SECURITY &v9fs_xattr_security_handler, diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index 3e11fc3331eb..b5636e544c8a 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -11,8 +11,6 @@ #include extern const struct xattr_handler *v9fs_xattr_handlers[]; -extern const struct xattr_handler v9fs_xattr_acl_access_handler; -extern const struct xattr_handler v9fs_xattr_acl_default_handler; ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, void *buffer, size_t buffer_size); -- cgit From 0a26bde2c9db9817e2b4c0f890236f78d4d8ed7c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:27 +0200 Subject: acl: remove a slew of now unused helpers Now that the posix acl api is active we can remove all the hacky helpers we had to keep around for all these years and also remove the set and get posix acl xattr handler methods as they aren't needed anymore. 
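For illustration only, a minimal sketch of the filesystem-side pattern that remains after this cleanup, assuming the usual linux/posix_acl_xattr.h helper; demo_get_acl() and demo_read_acl_blob() are hypothetical names, not part of the patch:

    static struct posix_acl *demo_get_acl(struct inode *inode, int type)
    {
    	struct posix_acl *acl;
    	void *value = NULL;
    	ssize_t size;

    	/* hypothetical helper: read the raw uapi-format ACL xattr blob */
    	size = demo_read_acl_blob(inode, type, &value);
    	if (size < 0)
    		return ERR_PTR(size);

    	/*
    	 * Map raw {g,u}ids through the filesystem's idmapping only;
    	 * idmapped mounts are applied later, at the VFS boundary.
    	 */
    	acl = posix_acl_from_xattr(i_user_ns(inode), value, size);
    	kfree(value);
    	return acl;
    }
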
Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 363 ++++----------------------------------------------------- fs/xattr.c | 5 +- 2 files changed, 23 insertions(+), 345 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 14b87653b797..acb7f7d7f11d 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -747,118 +747,32 @@ static int posix_acl_fix_xattr_common(const void *value, size_t size) return count; } -void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, - void *value, size_t size) -{ - struct posix_acl_xattr_header *header = value; - struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; - struct user_namespace *fs_userns = i_user_ns(inode); - int count; - vfsuid_t vfsuid; - vfsgid_t vfsgid; - kuid_t uid; - kgid_t gid; - - if (no_idmapping(mnt_userns, i_user_ns(inode))) - return; - - count = posix_acl_fix_xattr_common(value, size); - if (count <= 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch (le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsuid = make_vfsuid(mnt_userns, fs_userns, uid); - entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, - vfsuid_into_kuid(vfsuid))); - break; - case ACL_GROUP: - gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsgid = make_vfsgid(mnt_userns, fs_userns, gid); - entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, - vfsgid_into_kgid(vfsgid))); - break; - default: - break; - } - } -} - -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) -{ - struct posix_acl_xattr_header *header = value; - struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; - int count; - kuid_t uid; - kgid_t gid; - - count = posix_acl_fix_xattr_common(value, size); - if (count <= 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch(le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); - break; - case ACL_GROUP: - gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kgid(to, gid)); - break; - default: - break; - } - } -} - -void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); -} - -void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); -} - /** - * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the - * provided callbacks to map ACL_{GROUP,USER} entries into the - * appropriate format - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping + * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format + * @userns: the filesystem's idmapping * @value: the uapi representation of POSIX ACLs * @size: the size of @void - * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries - * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries * - * The make_posix_acl() helper is an abstraction to translate from uapi format - * into the VFS format allowing the caller to specific callbacks to map - * ACL_{GROUP,USER} entries into the 
expected format. This is used in - * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code - * duplication. + * Filesystems that store POSIX ACLs in the unaltered uapi format should use + * posix_acl_from_xattr() when reading them from the backing store and + * converting them into the struct posix_acl VFS format. The helper is + * specifically intended to be called from the acl inode operation. + * + * The posix_acl_from_xattr() function will map the raw {g,u}id values stored + * in ACL_{GROUP,USER} entries into idmapping in @userns. + * + * Note that posix_acl_from_xattr() does not take idmapped mounts into account. + * If it did it calling it from the get acl inode operation would return POSIX + * ACLs mapped according to an idmapped mount which would mean that the value + * couldn't be cached for the filesystem. Idmapped mounts are taken into + * account on the fly during permission checking or right at the VFS - + * userspace boundary before reporting them to the user. * * Return: Allocated struct posix_acl on success, NULL for a valid header but * without actual POSIX ACL entries, or ERR_PTR() encoded error code. */ -static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, const void *value, size_t size, - kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *, - const struct posix_acl_xattr_entry *), - kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *, - const struct posix_acl_xattr_entry *)) +struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns, + const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; @@ -889,12 +803,14 @@ static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns, break; case ACL_USER: - acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry); + acl_e->e_uid = make_kuid(userns, + le32_to_cpu(entry->e_id)); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: - acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry); + acl_e->e_gid = make_kgid(userns, + le32_to_cpu(entry->e_id)); if (!gid_valid(acl_e->e_gid)) goto fail; break; @@ -909,182 +825,6 @@ fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } - -/** - * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and - * filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @e: a ACL_USER entry in POSIX ACL uapi format - * - * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id - * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through - * KUIDT_INIT() and then map it according to the idmapped mount. The resulting - * kuid_t is the value which the filesystem can map up into a raw backing store - * id in the filesystem's idmapping. - * - * This is used in vfs_set_acl_prepare() to generate the proper VFS - * representation of POSIX ACLs with ACL_USER entries during setxattr(). - * - * Return: A kuid in @fs_userns for the uid stored in @e. 
- */ -static inline kuid_t -vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id)); - return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid)); -} - -/** - * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and - * filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @e: a ACL_GROUP entry in POSIX ACL uapi format - * - * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id - * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through - * KGIDT_INIT() and then map it according to the idmapped mount. The resulting - * kgid_t is the value which the filesystem can map up into a raw backing store - * id in the filesystem's idmapping. - * - * This is used in vfs_set_acl_prepare() to generate the proper VFS - * representation of POSIX ACLs with ACL_GROUP entries during setxattr(). - * - * Return: A kgid in @fs_userns for the gid stored in @e. - */ -static inline kgid_t -vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id)); - return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid)); -} - -/** - * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking - * mount and filesystem idmappings into account - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @value: the uapi representation of POSIX ACLs - * @size: the size of @void - * - * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be - * mapped according to the relevant mount- and filesystem idmapping. It is - * important that the ACL_{GROUP,USER} entries in struct posix_acl will be - * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem - * idmapping. This is crucial since the resulting struct posix_acl might be - * cached filesystem wide. The vfs_set_acl_prepare() function will take care to - * perform all necessary idmappings. - * - * Note, that since basically forever the {g,u}id values encoded as - * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain - * values that have been mapped according to the caller's idmapping. In other - * words, POSIX ACLs passed in uapi format as @value during setxattr() contain - * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have - * been stored as k{g,u}id_t. - * - * This means, vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by - * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t - * through from_vfs{g,u}id() to account for any idmapped mounts. The - * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the - * correct k{g,u}id_t. - * - * The filesystem will then receive the POSIX ACLs ready to be cached - * filesystem wide and ready to be written to the backing store taking the - * filesystem's idmapping into account. - * - * Return: Allocated struct posix_acl on success, NULL for a valid header but - * without actual POSIX ACL entries, or ERR_PTR() encoded error code. 
- */ -struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const void *value, size_t size) -{ - return make_posix_acl(mnt_userns, fs_userns, value, size, - vfs_set_acl_prepare_kuid, - vfs_set_acl_prepare_kgid); -} -EXPORT_SYMBOL(vfs_set_acl_prepare); - -/** - * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping - * @mnt_userns: unused - * @fs_userns: the filesystem's idmapping - * @e: a ACL_USER entry in POSIX ACL uapi format - * - * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping. - * This is used in posix_acl_from_xattr() to generate the proper VFS - * representation of POSIX ACLs with ACL_USER entries. - * - * Return: A kuid in @fs_userns for the uid stored in @e. - */ -static inline kuid_t -posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - return make_kuid(fs_userns, le32_to_cpu(e->e_id)); -} - -/** - * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping - * @mnt_userns: unused - * @fs_userns: the filesystem's idmapping - * @e: a ACL_GROUP entry in POSIX ACL uapi format - * - * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping. - * This is used in posix_acl_from_xattr() to generate the proper VFS - * representation of POSIX ACLs with ACL_GROUP entries. - * - * Return: A kgid in @fs_userns for the gid stored in @e. - */ -static inline kgid_t -posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - return make_kgid(fs_userns, le32_to_cpu(e->e_id)); -} - -/** - * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format - * @fs_userns: the filesystem's idmapping - * @value: the uapi representation of POSIX ACLs - * @size: the size of @void - * - * Filesystems that store POSIX ACLs in the unaltered uapi format should use - * posix_acl_from_xattr() when reading them from the backing store and - * converting them into the struct posix_acl VFS format. The helper is - * specifically intended to be called from the ->get_inode_acl() inode - * operation. - * - * The posix_acl_from_xattr() function will map the raw {g,u}id values stored - * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The - * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the - * correct k{g,u}id_t. The returned struct posix_acl can be cached. - * - * Note that posix_acl_from_xattr() does not take idmapped mounts into account. - * If it did it calling is from the ->get_inode_acl() inode operation would - * return POSIX ACLs mapped according to an idmapped mount which would mean - * that the value couldn't be cached for the filesystem. Idmapped mounts are - * taken into account on the fly during permission checking or right at the VFS - * - userspace boundary before reporting them to the user. - * - * Return: Allocated struct posix_acl on success, NULL for a valid header but - * without actual POSIX ACL entries, or ERR_PTR() encoded error code. 
- */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *fs_userns, - const void *value, size_t size) -{ - return make_posix_acl(&init_user_ns, fs_userns, value, size, - posix_acl_from_xattr_kuid, - posix_acl_from_xattr_kgid); -} EXPORT_SYMBOL (posix_acl_from_xattr); /* @@ -1190,31 +930,6 @@ ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, return real_size; } -static int -posix_acl_xattr_get(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *name, void *value, size_t size) -{ - struct posix_acl *acl; - int error; - - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = get_inode_acl(inode, handler->flags); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return error; -} - int set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type, struct posix_acl *acl) @@ -1240,36 +955,6 @@ set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, } EXPORT_SYMBOL(set_posix_acl); -static int -posix_acl_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, - int flags) -{ - struct posix_acl *acl = NULL; - int ret; - - if (value) { - /* - * By the time we end up here the {g,u}ids stored in - * ACL_{GROUP,USER} have already been mapped according to the - * caller's idmapping. The vfs_set_acl_prepare() helper will - * recover them and take idmapped mounts into account. The - * filesystem will receive the POSIX ACLs in the correct - * format ready to be cached or written to the backing store - * taking the filesystem idmapping into account. - */ - acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode), - value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - ret = set_posix_acl(mnt_userns, dentry, handler->flags, acl); - posix_acl_release(acl); - return ret; -} - static bool posix_acl_xattr_list(struct dentry *dentry) { @@ -1280,8 +965,6 @@ const struct xattr_handler posix_acl_access_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, - .get = posix_acl_xattr_get, - .set = posix_acl_xattr_set, }; EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); @@ -1289,8 +972,6 @@ const struct xattr_handler posix_acl_default_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, - .get = posix_acl_xattr_get, - .set = posix_acl_xattr_set, }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); diff --git a/fs/xattr.c b/fs/xattr.c index 77db5aa26d13..df3af9fa8c77 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -454,10 +454,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - error = __vfs_getxattr(dentry, inode, name, value, size); - if (error > 0 && is_posix_acl_xattr(name)) - posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); - return error; + return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); -- cgit From 6a542d1d5f6c814fd3643b43e85b21757c1e363b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2020 12:21:07 -0400 Subject: kill signal_pt_regs() Once upon a time it was used on hot paths, but that had not been true since 2013.
IOW, there's no point in an arch-optimized equivalent of task_pt_regs(current); the remaining two users are not worth bothering with. Signed-off-by: Al Viro --- fs/coredump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..b4ec1bf889f9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -525,7 +525,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, - .regs = signal_pt_regs(), + .regs = task_pt_regs(current), .limit = rlimit(RLIMIT_CORE), /* * We must use the same mm->flags while dumping core to avoid -- cgit From 9a938eba8d284fba0daff62142dece74ae3c16de Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2020 12:25:39 -0400 Subject: kill coredump_params->regs it's always task_pt_regs(current) Signed-off-by: Al Viro --- fs/binfmt_elf.c | 4 ++-- fs/coredump.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 63c7ebb0da89..002fd713ac11 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2082,7 +2082,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); + elf_core_copy_regs(&info->prstatus->pr_reg, task_pt_regs(current)); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); @@ -2109,7 +2109,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* Try to dump the FPU. */ info->prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); + elf_core_copy_task_fpregs(current, task_pt_regs(current), info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); diff --git a/fs/coredump.c b/fs/coredump.c index b4ec1bf889f9..1a474de1e52b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -525,7 +525,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, - .regs = task_pt_regs(current), .limit = rlimit(RLIMIT_CORE), /* * We must use the same mm->flags while dumping core to avoid -- cgit From 922ef161b21e605bf803f65dc928fabefb735702 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Sep 2022 00:39:23 -0400 Subject: [elf][regset] clean fill_note_info() a bit *info is already initialized... Signed-off-by: Al Viro --- fs/binfmt_elf.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 002fd713ac11..4190dafd2ec4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1833,24 +1833,17 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; - unsigned int i; - - info->size = 0; - info->thread = NULL; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); - if (psinfo == NULL) { - info->psinfo.data = NULL; /* So we don't free this wrongly */ + if (!psinfo) return 0; - } - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); /* * Figure out how many notes we're going to need for each thread.
*/ info->thread_notes = 0; - for (i = 0; i < view->n; ++i) + for (int i = 0; i < view->n; ++i) if (view->regsets[i].core_note_type != 0) ++info->thread_notes; -- cgit From 4b0e21d64253f56fa5b177e08383934680957697 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2020 13:44:16 -0400 Subject: [elf][regset] simplify thread list handling in fill_note_info() fill_note_info() iterates through the list of threads collected in mm->core_state->dumper, allocating a struct elf_thread_core_info instance for each and linking those into a list. We need the entry corresponding to current to be first in the resulting list, so the logic for list insertion is: if it's for current or the list is empty, insert at the head; else, insert after the first element. However, in mm->core_state->dumper the entry for current is guaranteed to be the first one. This means that both parts of the condition will be true on the first iteration and neither will be true on any subsequent one. Taking the first iteration out of the loop simplifies things nicely... Signed-off-by: Al Viro --- fs/binfmt_elf.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4190dafd2ec4..e990075fb43d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1866,7 +1866,14 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* * Allocate a structure for each thread. */ - for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) { + info->thread = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!info->thread)) + return 0; + + info->thread->task = dump_task; + for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) { t = kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), GFP_KERNEL); @@ -1874,17 +1881,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; t->task = ct->task; - if (ct->task == dump_task || !info->thread) { - t->next = info->thread; - info->thread = t; - } else { - /* - * Make sure to keep the original task at - * the head of the list. - */ - t->next = info->thread->next; - info->thread->next = t; - } + t->next = info->thread->next; + info->thread->next = t; } /* -- cgit From 3aca47127a646165965ff52803e2b269eed91afc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 Sep 2022 13:25:25 -0400 Subject: fs: drop useless condition from inode_needs_update_time Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index b608528efd3a..762feb8d0c6c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2071,9 +2071,6 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; - if (!sync_it) - return 0; - return sync_it; } -- cgit From 0dafb7e671f0e769c854609548037754a68dcbc6 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 12 Oct 2022 18:42:35 +0800 Subject: fs: udf: Optimize udf_free_in_core_inode and udf_find_fileset functions These two functions get two small cleanups: 1. Drop the cast of the foo pointer; a void * converts to any object pointer type without a cast. 2. Drop the redundant initialization of the bh variable, which is assigned before it is first read.
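A standalone C illustration of the first cleanup (hypothetical code, not from udf): a void * converts implicitly to any object pointer type, so the cast adds nothing.

    struct item { int x; };

    static void init_item(void *p)
    {
    	struct item *it = p;	/* implicit conversion, no cast needed */

    	it->x = 0;
    }
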
Signed-off-by: Li zeming Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221012104235.3331-1-zeming@nfschina.com --- fs/udf/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 4042d9739fb7..06eda8177b5f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -162,7 +162,7 @@ static void udf_free_in_core_inode(struct inode *inode) static void init_once(void *foo) { - struct udf_inode_info *ei = (struct udf_inode_info *)foo; + struct udf_inode_info *ei = foo; ei->i_data = NULL; inode_init_once(&ei->vfs_inode); @@ -820,7 +820,7 @@ static int udf_find_fileset(struct super_block *sb, struct kernel_lb_addr *fileset, struct kernel_lb_addr *root) { - struct buffer_head *bh = NULL; + struct buffer_head *bh; uint16_t ident; int ret; -- cgit From c791730f2554a9ebb8f18df9368dc27d4ebc38c2 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 23 Oct 2022 18:57:41 +0900 Subject: udf: Avoid double brelse() in udf_rename() syzbot reported a warning like below [1]: VFS: brelse: Trying to free free buffer WARNING: CPU: 2 PID: 7301 at fs/buffer.c:1145 __brelse+0x67/0xa0 ... Call Trace: invalidate_bh_lru+0x99/0x150 smp_call_function_many_cond+0xe2a/0x10c0 ? generic_remap_file_range_prep+0x50/0x50 ? __brelse+0xa0/0xa0 ? __mutex_lock+0x21c/0x12d0 ? smp_call_on_cpu+0x250/0x250 ? rcu_read_lock_sched_held+0xb/0x60 ? lock_release+0x587/0x810 ? __brelse+0xa0/0xa0 ? generic_remap_file_range_prep+0x50/0x50 on_each_cpu_cond_mask+0x3c/0x80 blkdev_flush_mapping+0x13a/0x2f0 blkdev_put_whole+0xd3/0xf0 blkdev_put+0x222/0x760 deactivate_locked_super+0x96/0x160 deactivate_super+0xda/0x100 cleanup_mnt+0x222/0x3d0 task_work_run+0x149/0x240 ? task_work_cancel+0x30/0x30 do_exit+0xb29/0x2a40 ? reacquire_held_locks+0x4a0/0x4a0 ? do_raw_spin_lock+0x12a/0x2b0 ? mm_update_next_owner+0x7c0/0x7c0 ? rwlock_bug.part.0+0x90/0x90 ? zap_other_threads+0x234/0x2d0 do_group_exit+0xd0/0x2a0 __x64_sys_exit_group+0x3a/0x50 do_syscall_64+0x34/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd The cause of the issue is that brelse() is called on both ofibh.sbh and ofibh.ebh by udf_find_entry() when it returns NULL. However, brelse() is called by udf_rename(), too. So, the b_count on the buffer_head becomes unbalanced. This patch fixes the issue by not calling brelse() from udf_rename() when udf_find_entry() returns NULL.
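The ownership rule being enforced generalizes beyond udf; a tiny hypothetical C sketch (not udf code): when the lookup helper returns NULL it has already dropped its references, so the caller must not drop them again.

    #include <stdio.h>

    struct buf { int refs; };

    static void put_buf(struct buf *b)
    {
    	if (b->refs <= 0) {
    		fprintf(stderr, "double put\n");	/* the reported imbalance */
    		return;
    	}
    	b->refs--;
    }

    /* Mirrors udf_find_entry()'s contract: on the NULL ("not found")
     * return it has already released the buffer itself. */
    static void *find_entry(struct buf *b)
    {
    	put_buf(b);
    	return NULL;
    }

    int main(void)
    {
    	struct buf b = { .refs = 1 };

    	if (!find_entry(&b))
    		return 1;	/* correct: no second put_buf(&b) here */

    	put_buf(&b);
    	return 0;
    }
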
Link: https://syzkaller.appspot.com/bug?id=8297f45698159c6bca8a1f87dc983667c1a1c851 [1] Reported-by: syzbot+7902cd7684bc35306224@syzkaller.appspotmail.com Signed-off-by: Shigeru Yoshida Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221023095741.271430-1-syoshida@redhat.com --- fs/udf/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index fb4c30e05245..d6081538bfc0 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1091,8 +1091,9 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, return -EINVAL; ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (IS_ERR(ofi)) { - retval = PTR_ERR(ofi); + if (!ofi || IS_ERR(ofi)) { + if (IS_ERR(ofi)) + retval = PTR_ERR(ofi); goto end_rename; } @@ -1101,8 +1102,7 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, brelse(ofibh.sbh); tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) - != old_inode->i_ino) + if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) goto end_rename; nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); -- cgit From c3db3c2fd9992c08f49aa93752d3c103c3a4f6aa Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 24 Oct 2022 19:30:12 +0200 Subject: f2fs: should put a page when checking the summary info The commit introduces another bug. Cc: stable@vger.kernel.org Fixes: c6ad7fd16657e ("f2fs: fix to do sanity check on summary info") Signed-off-by: Pavel Machek Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4546e01b2ee0..dab794225cce 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1110,6 +1110,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", ofs_in_node, dni->ino, dni->nid, max_addrs); + f2fs_put_page(node_page, 1); return false; } -- cgit From 14dc00a0e2dbea4b685ab9723ff511fcfd223c18 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2022 17:52:05 -0700 Subject: f2fs: let's avoid to get cp_rwsem twice by f2fs_evict_inode by d_invalidate f2fs_unlink -> f2fs_lock_op -> d_invalidate -> shrink_dentry_list -> iput_final -> f2fs_evict_inode -> f2fs_lock_op Reviewed-by: Chao Yu Tested-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a389772fd212..e104409c3a0e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -632,6 +632,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. 
Eventually we'll want avoid @@ -642,8 +644,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - f2fs_unlock_op(sbi); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: -- cgit From 2b5f9dad32ed19e8db3b0f10a84aa824a219803b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 20 Sep 2022 17:31:19 -0700 Subject: fs/exec: switch timens when a task gets a new mm Changing a time namespace requires remapping a vvar page, so we don't want to allow doing that if any other tasks can use the same mm. Currently, we install a time namespace when a task is created with a new vm. exec() is another case where a task gets a new mm and so can switch a time namespace safely, but it isn't handled now. One more issue of the current interface is that clone() with CLONE_VM isn't allowed if the current task has unshared a time namespace (timens_for_children doesn't match the current timens). Both of these issues are inconvenient for users. For example, Alexey and Florian reported that posix_spawn() uses vfork+exec and this pattern doesn't work with time namespaces due to both of the described issues. LXC needed to work around the exec() issue by calling setns. In the commit 133e2d3e81de5 ("fs/exec: allow to unshare a time namespace on vfork+exec"), we tried to fix these issues with minimal impact on UAPI, but it added extra complexity and some undesirable side effects. Eric suggested fixing the issues properly, since there is every reason to believe that no users depend on the old behavior. Cc: Alexey Izbyshev Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Florian Weimer Cc: Kees Cook Suggested-by: "Eric W. Biederman" Origin-author: "Eric W. Biederman" Signed-off-by: Andrei Vagin Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220921003120.209637-1-avagin@google.com --- fs/exec.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 32dc8cf5fceb..34e6a2e53268 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1297,6 +1298,10 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; + retval = exec_task_namespaces(); + if (retval) + goto out_unlock; + #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); -- cgit From 275498a98b1fe77deebddfc4f8986c0cf2c3ced7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 00:17:24 -0700 Subject: exec: Add comments on check_unsafe_exec() fs counting Add some comments about what the fs counting is doing in check_unsafe_exec() and how it relates to the call graph.
Specifically, we can't force an unshare of the fs because of at least Chrome: https://lore.kernel.org/lkml/86CE201B-5632-4BB7-BCF6-7CB2C2895409@chromium.org/ Cc: Eric Biederman Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20221018071537.never.662-kees@kernel.org --- fs/exec.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 34e6a2e53268..b3e87bd50962 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1573,6 +1573,12 @@ static void check_unsafe_exec(struct linux_binprm *bprm) if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + /* + * If another task is sharing our fs, we cannot safely + * suid exec because the differently privileged task + * will be able to manipulate the current directory, etc. + * It would be nice to force an unshare instead... + */ t = p; n_fs = 1; spin_lock(&p->fs->lock); @@ -1753,6 +1759,7 @@ static int search_binary_handler(struct linux_binprm *bprm) return retval; } +/* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; @@ -1811,6 +1818,11 @@ static int bprm_execve(struct linux_binprm *bprm, if (retval) return retval; + /* + * Check for unsafe execution states before exec_binprm(), which + * will call back into begin_new_exec(), into bprm_creds_from_file(), + * where setuid-ness is evaluated. + */ check_unsafe_exec(bprm); current->in_execve = 1; -- cgit From 8f6e3f9e5a0f58e458a348b7e36af11d0e9702af Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 00:14:20 -0700 Subject: binfmt: Fix whitespace issues Fix the annoying whitespace issues that have been following these files around for years. Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20221018071350.never.230-kees@kernel.org --- fs/binfmt_elf.c | 14 +++++++------- fs/binfmt_elf_fdpic.c | 2 +- fs/exec.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6a11025e5850..38a9496f83f3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -248,7 +248,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, } while (0) #ifdef ARCH_DLINFO - /* + /* * ARCH_DLINFO must come first so PPC can do its special alignment of * AUXV. * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in @@ -1020,7 +1020,7 @@ out_free_interp: executable_stack); if (retval < 0) goto out_free_dentry; - + elf_bss = 0; elf_brk = 0; @@ -1043,7 +1043,7 @@ out_free_interp: if (unlikely (elf_brk > elf_bss)) { unsigned long nbyte; - + /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. Map anonymous pages, if needed, and clear the area. 
*/ @@ -1521,7 +1521,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_align = 0; } -static void fill_note(struct memelfnote *note, const char *name, int type, +static void fill_note(struct memelfnote *note, const char *name, int type, unsigned int sz, void *data) { note->name = name; @@ -2004,8 +2004,8 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) t->num_notes = 0; fill_prstatus(&t->prstatus.common, p, signr); - elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); t->num_notes++; @@ -2295,7 +2295,7 @@ static int elf_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!write_note_info(&info, cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d0c8797828..e90c1192dec6 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1603,7 +1603,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!writenote(thread_list->notes, cprm)) goto end_coredump; if (!writenote(&psinfo_note, cprm)) diff --git a/fs/exec.c b/fs/exec.c index b3e87bd50962..1644bf10fb9d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -172,7 +172,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) exit: fput(file); out: - return error; + return error; } #endif /* #ifdef CONFIG_USELIB */ -- cgit From bfb4a2b95875a47a01234f2de113ec089d524e71 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 19 Oct 2022 09:32:35 +0200 Subject: exec: simplify initial stack size expansion I had a hard time trying to understand completely why it is using vm_end in one side of the expression and vm_start in the other one, and using something in the "if" clause that is not an exact copy of what is used below. The whole point is that the stack_size variable that was used in the "if" clause is the difference between vm_start and vm_end, which is not far away but makes this thing harder to read than it must be. Signed-off-by: Rolf Eike Beer Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/2017429.gqNitNVd0C@mobilepool36.emlix.com --- fs/exec.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1644bf10fb9d..9585bc1bc970 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -843,16 +843,13 @@ int setup_arg_pages(struct linux_binprm *bprm, * will align it up. 
*/ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; + + stack_expand = min(rlim_stack, stack_size + stack_expand); + #ifdef CONFIG_STACK_GROWSUP - if (stack_size + stack_expand > rlim_stack) - stack_base = vma->vm_start + rlim_stack; - else - stack_base = vma->vm_end + stack_expand; + stack_base = vma->vm_start + stack_expand; #else - if (stack_size + stack_expand > rlim_stack) - stack_base = vma->vm_end - rlim_stack; - else - stack_base = vma->vm_start - stack_expand; + stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); -- cgit From cfc46ca4fdcaef6f0215f56d89162400aee402e3 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 19 Oct 2022 09:43:01 +0200 Subject: binfmt_elf: fix documented return value for load_elf_phdrs() This function has never returned anything but a plain NULL. Fixes: 6a8d38945cf4 ("binfmt_elf: Hoist ELF program header loading to a function") Signed-off-by: Rolf Eike Beer Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/2359389.EDbqzprbEW@mobilepool36.emlix.com --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 38a9496f83f3..71df1a3c2dac 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -456,7 +456,7 @@ static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) * * Loads ELF program headers from the binary file elf_file, which has the ELF * header pointed to by elf_ex, into a newly allocated array. The caller is - * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. + * responsible for freeing the allocated data. Returns NULL upon failure. */ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) -- cgit From ef20c5139c3157b5c6eda46f496952bddffe9ad6 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 19 Oct 2022 09:52:16 +0200 Subject: binfmt_elf: simplify error handling in load_elf_phdrs() The err variable was the same as retval, just capped to <= 0. The cap makes no difference, since elf_read() never returns positive values. Signed-off-by: Rolf Eike Beer Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/4137126.7Qn9TF0dmF@mobilepool36.emlix.com --- fs/binfmt_elf.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 71df1a3c2dac..528e2ac8931f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -462,7 +462,7 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; - int retval, err = -1; + int retval = -1; unsigned int size; /* @@ -484,15 +484,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, /* Read in the program headers */ retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff); - if (retval < 0) { - err = retval; - goto out; - } - /* Success! */ - err = 0; out: - if (err) { + if (retval) { kfree(elf_phdata); elf_phdata = NULL; } -- cgit From a2bd096fb2d7f50fb4db246b33e7bfcf5e2eda3a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Jun 2022 22:12:16 +0200 Subject: fs: use type safe idmapping helpers We already ported most parts and filesystems over for v6.0 to the new vfs{g,u}id_t type and associated helpers. Convert the remaining places so we can remove all the old helpers. This is a non-functional change.
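The conversion applied across these files follows one pattern; a minimal sketch, assuming the usual linux/mnt_idmapping.h helpers, with demo_is_owner_*() as made-up names (the helpers themselves are the ones visible in the diffs below):

    /* Before: compare raw kuid_t values obtained from the mount idmapping. */
    static bool demo_is_owner_old(struct user_namespace *mnt_userns,
    			      const struct inode *inode)
    {
    	return uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode));
    }

    /* After: the mapped id lives in the distinct vfsuid_t type, and any
     * comparison against a kuid_t goes through an explicit helper. */
    static bool demo_is_owner_new(struct user_namespace *mnt_userns,
    			      const struct inode *inode)
    {
    	vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);

    	return vfsuid_eq_kuid(vfsuid, current_fsuid());
    }
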
Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/coredump.c | 4 ++-- fs/exec.c | 16 ++++++++-------- fs/inode.c | 8 ++++---- fs/namei.c | 40 ++++++++++++++++++++-------------------- fs/remap_range.c | 2 +- fs/stat.c | 7 +++++-- 6 files changed, 40 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..a133103eb721 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -716,8 +716,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) * filesystem. */ mnt_userns = file_mnt_user_ns(cprm.file); - if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), - current_fsuid())) { + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + current_fsuid())) { pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", cn.corename); goto close_fail; diff --git a/fs/exec.c b/fs/exec.c index 349a5da91efe..dd91adec7a11 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1591,8 +1591,8 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) struct user_namespace *mnt_userns; struct inode *inode = file_inode(file); unsigned int mode; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (!mnt_may_suid(file->f_path.mnt)) return; @@ -1611,23 +1611,23 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - uid = i_uid_into_mnt(mnt_userns, inode); - gid = i_gid_into_mnt(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ - if (!kuid_has_mapping(bprm->cred->user_ns, uid) || - !kgid_has_mapping(bprm->cred->user_ns, gid)) + if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || + !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->euid = uid; + bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->egid = gid; + bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } diff --git a/fs/inode.c b/fs/inode.c index 8c4078889754..757cac29bd5a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2326,15 +2326,15 @@ EXPORT_SYMBOL(inode_init_owner); bool inode_owner_or_capable(struct user_namespace *mnt_userns, const struct inode *inode) { - kuid_t i_uid; + vfsuid_t vfsuid; struct user_namespace *ns; - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid)) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); - if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER)) + if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } diff --git a/fs/namei.c b/fs/namei.c index 578c2110df02..d5c5cb7dd023 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -336,11 +336,11 @@ static int acl_permission_check(struct user_namespace *mnt_userns, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - kuid_t i_uid; + vfsuid_t vfsuid; /* Are we the owner? If so, ACL's don't matter */ - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (likely(uid_eq(current_fsuid(), i_uid))) { + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? 
-EACCES : 0; @@ -362,8 +362,8 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(kgid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } @@ -581,7 +581,7 @@ struct nameidata { struct nameidata *saved; unsigned root_seq; int dfd; - kuid_t dir_uid; + vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; @@ -1095,15 +1095,15 @@ fs_initcall(init_fs_namei_sysctls); static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct user_namespace *mnt_userns; - kuid_t i_uid; + vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; mnt_userns = mnt_user_ns(nd->path.mnt); - i_uid = i_uid_into_mnt(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); /* Allowed if owner and follower match. */ - if (uid_eq(current_cred()->fsuid, i_uid)) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -1111,7 +1111,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod return 0; /* Allowed if parent directory and link owner match. */ - if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid)) + if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -1183,8 +1183,8 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1232,13 +1232,13 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; - kuid_t dir_uid = nd->dir_uid; + vfsuid_t dir_vfsuid = nd->dir_vfsuid; if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) || - uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) || + vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || @@ -2307,7 +2307,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode); + nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2885,9 +2885,9 @@ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, { kuid_t fsuid = current_fsuid(); - if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid)) return 0; - if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); } @@ -2926,8 +2926,8 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, 
BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); diff --git a/fs/remap_range.c b/fs/remap_range.c index 654912d06862..290743c8d226 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -429,7 +429,7 @@ static bool allow_file_dedupe(struct file *file) return true; if (file->f_mode & FMODE_WRITE) return true; - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) return true; if (!inode_permission(mnt_userns, inode, MAY_WRITE)) return true; diff --git a/fs/stat.c b/fs/stat.c index ef50573c72a2..d6cc74ca8486 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -44,12 +44,15 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, struct kstat *stat) { + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; -- cgit From a03a972b26da2d30d1f3b3a72963191cd2938835 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:40:21 +0200 Subject: fuse: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. Cc: Christoph Hellwig Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/fuse/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..84c1ca4bc1dc 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -98,7 +98,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; -- cgit From c12db92d62bf8cd4532ab4e572be0926d3a375be Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:07:47 +0200 Subject: ovl: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. 
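For readers following this series, a minimal sketch of the recurring before/after pattern (illustrative only; the example_is_owner_*() functions are made up, but the helpers are exactly the ones used in the hunks above and below):

  /* old style: raw kuid_t, easy to conflate with other id types */
  static bool example_is_owner_old(struct user_namespace *mnt_userns,
                                   const struct inode *inode)
  {
          return uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode));
  }

  /* new style: the dedicated vfsuid_t type makes such conflation a
   * compile-time error instead of a latent bug */
  static bool example_is_owner_new(struct user_namespace *mnt_userns,
                                   const struct inode *inode)
  {
          vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);

          return vfsuid_eq_kuid(vfsuid, current_fsuid());
  }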
Cc: Amir Goldstein Cc: Christoph Hellwig Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/util.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 81a57a8d80d9..c0c20d33691b 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1104,13 +1104,18 @@ void ovl_copyattr(struct inode *inode) struct path realpath; struct inode *realinode; struct user_namespace *real_mnt_userns; + vfsuid_t vfsuid; + vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); real_mnt_userns = mnt_user_ns(realpath.mnt); - inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); - inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode); + vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + + inode->i_uid = vfsuid_into_kuid(vfsuid); + inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; inode->i_atime = realinode->i_atime; inode->i_mtime = realinode->i_mtime; -- cgit From ebe060369f8d6e4588b115f252bebf5ba4d64350 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 22 Oct 2022 21:39:14 +0100 Subject: jfs: Fix fortify moan in symlink JFS has in jfs_incore.h:

  /* _inline may overflow into _inline_ea when needed */
  /* _inline_ea may overlay the last part of
   * file._xtroot if maxentry = XTROOTINITSLOT
   */
  union {
          struct {
                  /* 128: inline symlink */
                  unchar _inline[128];
                  /* 128: inline extended attr */
                  unchar _inline_ea[128];
          };
          unchar _inline_all[256];

and currently the symlink code copies into _inline; if the copy is larger than 128 bytes it triggers a fortify warning of the form:

  memcpy: detected field-spanning write (size 132) of single field "ip->i_link" at fs/jfs/namei.c:950 (size 18446744073709551615)

even though the write is actually fine. Copy it into _inline_all instead. Reported-by: syzbot+5fc38b2ddbbca7f5c680@syzkaller.appspotmail.com Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kees Cook Signed-off-by: Dave Kleikamp --- fs/jfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9db4f5789c0e..4fbbf88435e6 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -946,7 +946,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, if (ssize <= IDATASIZE) { ip->i_op = &jfs_fast_symlink_inode_operations; - ip->i_link = JFS_IP(ip)->i_inline; + ip->i_link = JFS_IP(ip)->i_inline_all; memcpy(ip->i_link, name, ssize); ip->i_size = ssize - 1; -- cgit From 25e70c6162f207828dd405b432d8f2a98dbf7082 Mon Sep 17 00:00:00 2001 From: Hoi Pok Wu Date: Tue, 25 Oct 2022 23:20:45 +0800 Subject: fs: jfs: fix shift-out-of-bounds in dbDiscardAG This should address most of the UBSAN bugs syzbot found recently in this area by guarding dbMount(), since syzbot feeds rubbish into the bmap descriptor.
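To see the intent of the guard, here is a hedged sketch (validate_bmap() is hypothetical; MAXAG and the db_* fields are the ones used in the hunk below):

  /* Reject a corrupted on-disk bmap descriptor at mount time, so that
   * fields like db_agl2size can be trusted in later shift expressions
   * such as the ones in dbDiscardAG(). */
  static int validate_bmap(u64 db_mapsize, int db_agl2size)
  {
          /* the derived allocation-group count must fit the
           * fixed-size per-AG arrays */
          if (((db_mapsize - 1) >> db_agl2size) > MAXAG)
                  return -EINVAL;
          return 0;
  }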
Signed-off-by: Hoi Pok Wu Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index e1cbfbb60303..765838578a72 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -198,6 +198,11 @@ int dbMount(struct inode *ipbmap) goto err_release_metapage; } + if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { + err = -EINVAL; + goto err_release_metapage; + } + for (i = 0; i < MAXAG; i++) bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); -- cgit From 28fc4e9077ce59ab28c89c20dc6be5154473218f Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 18 Oct 2022 10:45:32 +0800 Subject: f2fs: Fix the race condition of resize flag between resizefs Because the SBI_IS_RESIZEFS flag is set and cleared outside of any lock, the following race is possible:

  thread1                         thread2
  ->ioctl(resizefs)
  ->set RESIZEFS flag
                                  ->ioctl(resizefs)
  ...
                                  ->set RESIZEFS flag
  ->clear RESIZEFS flag
                                  ->resizefs stream
                                  # No RESIZEFS flag in the stream

Also, before freeze_super() the resize has not actually started, so we should not set the SBI_IS_RESIZEFS flag that early. Move the set/clear of SBI_IS_RESIZEFS inside the cp_mutex and gc_lock. Fixes: b4b10061ef98 ("f2fs: refactor resize_fs to avoid meta updates in progress") Signed-off-by: Zhang Xiaoxu Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index dab794225cce..7b4be412cec0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2134,8 +2134,6 @@ out_unlock: if (err) return err; - set_sbi_flag(sbi, SBI_IS_RESIZEFS); - freeze_super(sbi->sb); f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); @@ -2151,6 +2149,7 @@ out_unlock: if (err) goto out_err; + set_sbi_flag(sbi, SBI_IS_RESIZEFS); err = free_segment_range(sbi, secs, false); if (err) goto recover_out; @@ -2174,6 +2173,7 @@ out_unlock: f2fs_commit_super(sbi, false); } recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); @@ -2186,6 +2186,5 @@ out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); - clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; } -- cgit From a351b1f444187312bb42479cb26e82f26fc481d2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Oct 2022 10:00:26 +0200 Subject: acl: make vfs_posix_acl_to_xattr() static After reworking posix acls this helper isn't used anywhere outside the core posix acl paths. Make it static. Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index acb7f7d7f11d..989bbf280bfe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -884,9 +884,10 @@ EXPORT_SYMBOL (posix_acl_to_xattr); * Return: On success, the size of the stored uapi posix acls, on error a * negative errno.
*/ -ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, - struct inode *inode, const struct posix_acl *acl, - void *buffer, size_t size) +static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, + struct inode *inode, + const struct posix_acl *acl, void *buffer, + size_t size) { struct posix_acl_xattr_header *ext_acl = buffer; -- cgit From cb2144d66b0b24fd1b880fc72678ba21ca414dab Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Oct 2022 12:45:10 +0200 Subject: cifs: check whether acl is valid early Dan reported that acl is dereferenced before being checked and this is a valid problem. Fix it by erroring out early instead of doing it later after we've already relied on acl to be a valid pointer. Fixes: dc1af4c4b472 ("cifs: implement set acl method") Reported-by: Dan Carpenter Signed-off-by: Christian Brauner (Microsoft) --- fs/cifs/cifsacl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6a9f03c882dc..c647f0d56518 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1764,6 +1764,10 @@ int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, rc = PTR_ERR(full_path); goto out; } + + if (!acl) + goto out; + /* return dos attributes as pseudo xattr */ /* return alt name if available as pseudo attr */ @@ -1778,8 +1782,6 @@ int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, switch (type) { case ACL_TYPE_ACCESS: - if (!acl) - goto out; if (sb->s_flags & SB_POSIXACL) rc = cifs_do_set_acl(xid, pTcon, full_path, acl, ACL_TYPE_ACCESS, @@ -1788,8 +1790,6 @@ int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, break; case ACL_TYPE_DEFAULT: - if (!acl) - goto out; if (sb->s_flags & SB_POSIXACL) rc = cifs_do_set_acl(xid, pTcon, full_path, acl, ACL_TYPE_DEFAULT, -- cgit From 256c8aed2b420a7c57ed6469fbb0f8310f5aeec9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 26 Oct 2022 12:51:27 +0200 Subject: fs: introduce dedicated idmap type for mounts Last cycle we've already made the interaction with idmapped mounts more robust and type safe by introducing the vfs{g,u}id_t type. This cycle we concluded the conversion and removed the legacy helpers. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate filesystem and mount namespaces and the different roles they have to play. Especially for filesystem developers without much experience in this area this is an easy source of bugs. Instead of passing the plain namespace we introduce a dedicated type struct mnt_idmap and replace the pointer with a pointer to a struct mnt_idmap. There are no semantic or size changes for the mount struct caused by this. We then start converting all places aware of idmapped mounts to rely on struct mnt_idmap. Once the conversion is done all helpers down to the really low-level make_vfs{g,u}id() and from_vfs{g,u}id() will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two, removing and thus eliminating the possibility of any bugs. Fwiw, I fixed some issues in that area in ntfs3 and ksmbd in the past. Afterwards, only low-level code can ultimately use the associated namespace for any permission checks. Even most of the vfs can be ultimately completely oblivious about this and filesystems will never interact with it directly in any form in the future.
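To make the type-safety argument concrete, a small illustration (the check_perm_*() prototypes are hypothetical, not from the patch):

  /* before: the filesystem's namespace and the mount's idmapping share
   * one type, so callers can swap them without any diagnostic */
  int check_perm_old(struct user_namespace *mnt_userns,
                     struct inode *inode, int mask);

  /* after: a dedicated type; accidentally passing sb->s_user_ns (a plain
   * struct user_namespace *) where an idmapping is expected no longer
   * compiles */
  int check_perm_new(struct mnt_idmap *idmap,
                     struct inode *inode, int mask);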
A struct mnt_idmap currently encompasses a simple refcount and a pointer to the relevant namespace the mount is idmapped to. If a mount isn't idmapped then it will point to a static nop_mnt_idmap. If it is an idmapped mount it will point to a new struct mnt_idmap. As usual there are no allocations or anything happening for non-idmapped mounts. Everything is carefully written to be a nop for non-idmapped mounts as has always been the case. If an idmapped mount or mount tree is created a new struct mnt_idmap is allocated and a reference taken on the relevant namespace. For each mount in a mount tree that gets idmapped or a mount that inherits the idmap when it is cloned the reference count on the associated struct mnt_idmap is bumped. Just a reminder that we only allow a mount to change its idmapping a single time and only if it hasn't already been attached to the filesystem and has no active writers. The actual changes are fairly straightforward. This will have huge benefits for maintenance and security in the long run even if it causes some churn. I'm aware that there's some cost for all of you, and I'll commit to doing this work and making this as painless as I can. Note that this also makes it possible to extend struct mnt_idmap in the future. For example, it would be possible to place the namespace pointer in an anonymous union together with an idmapping struct. This would allow us to expose an api to userspace that would let it specify idmappings directly instead of having to go through the detour of setting up namespaces at all. This just adds the infrastructure and doesn't do any conversions. Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/namespace.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 142 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index df137ba19d37..e276c0e7ecea 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,6 +75,22 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mnt_idmap { + struct user_namespace *owner; + refcount_t count; +}; + +/* + * Carries the initial idmapping of 0:0:4294967295 which is an identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ +struct mnt_idmap nop_mnt_idmap = { + .owner = &init_user_ns, + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(nop_mnt_idmap); + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; @@ -82,6 +98,7 @@ struct mount_kattr { unsigned int lookup_flags; bool recurse; struct user_namespace *mnt_userns; + struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ @@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt) #endif } +/** + * mnt_idmap_owner - retrieve owner of the mount's idmapping + * @idmap: mount idmapping + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapping.
+ */ +struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap) +{ + return idmap->owner; +} +EXPORT_SYMBOL_GPL(mnt_idmap_owner); + +/** + * mnt_user_ns - retrieve owner of an idmapped mount + * @mnt: the relevant vfsmount + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapped. + */ +struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) +{ + struct mnt_idmap *idmap = mnt_idmap(mnt); + + /* Return the actual owner of the filesystem instead of the nop. */ + if (idmap == &nop_mnt_idmap && + !initial_idmapping(mnt->mnt_sb->s_user_ns)) + return mnt->mnt_sb->s_user_ns; + return mnt_idmap_owner(idmap); +} +EXPORT_SYMBOL_GPL(mnt_user_ns); + +/** + * alloc_mnt_idmap - allocate a new idmapping for the mount + * @mnt_userns: owning userns of the idmapping + * + * Allocate a new struct mnt_idmap which carries the idmapping of the mount. + * + * Return: On success a new idmap, on error an error pointer is returned. + */ +static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) +{ + struct mnt_idmap *idmap; + + idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); + if (!idmap) + return ERR_PTR(-ENOMEM); + + idmap->owner = get_user_ns(mnt_userns); + refcount_set(&idmap->count, 1); + return idmap; +} + +/** + * mnt_idmap_get - get a reference to an idmapping + * @idmap: the idmap to bump the reference on + * + * If @idmap is not the @nop_mnt_idmap bump the reference count. + * + * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. + */ +static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap) + refcount_inc(&idmap->count); + + return idmap; +} + +/** + * mnt_idmap_put - put a reference to an idmapping + * @idmap: the idmap to put the reference on + * + * If this is a non-initial idmapping, put the reference count when a mount is + * released and free it if we're the last user. 
+ */ +static inline void mnt_idmap_put(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { + put_user_ns(idmap->owner); + kfree(idmap); + } +} + static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); - mnt->mnt.mnt_userns = &init_user_ns; + mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(&mnt->mnt); - if (!initial_idmapping(mnt_userns)) - put_user_ns(mnt_userns); + mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p) struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); @@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - fs_userns = mnt->mnt.mnt_sb->s_user_ns; - if (!initial_idmapping(fs_userns)) - mnt->mnt.mnt_userns = get_user_ns(fs_userns); - lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); - mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); - if (!initial_idmapping(mnt->mnt.mnt_userns)) - mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); + mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); + mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; @@ -3981,14 +4086,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (kattr->mnt_userns == fs_userns) + if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns) return -EINVAL; /* @@ -4028,7 +4133,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && - !kattr->mnt_userns; + !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) @@ -4082,27 +4187,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { - struct user_namespace *mnt_userns, *old_mnt_userns; - - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return; /* - * We're the only ones able to change the mount's idmapping. So - * mnt->mnt.mnt_userns is stable and we can retrieve it directly. - */ - old_mnt_userns = mnt->mnt.mnt_userns; - - mnt_userns = get_user_ns(kattr->mnt_userns); - /* Pairs with smp_load_acquire() in mnt_user_ns(). 
*/ - smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); - - /* - * If this is an idmapped filesystem drop the reference we've taken - * in vfs_create_mount() before. + * Pairs with smp_load_acquire() in mnt_idmap(). + * + * Since we only allow a mount to change the idmapping once and + * verified this in can_idmap_mount() we know that the mount has + * @nop_mnt_idmap attached to it. So there's no need to drop any + * references. */ - if (!initial_idmapping(old_mnt_userns)) - put_user_ns(old_mnt_userns); + smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4136,6 +4232,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) if (path->dentry != mnt->mnt.mnt_root) return -EINVAL; + if (kattr->mnt_userns) { + struct mnt_idmap *mnt_idmap; + + mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); + if (IS_ERR(mnt_idmap)) + return PTR_ERR(mnt_idmap); + kattr->mnt_idmap = mnt_idmap; + } + if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing @@ -4323,6 +4428,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; + + if (kattr->mnt_idmap) + mnt_idmap_put(kattr->mnt_idmap); } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, -- cgit From 5a6f52d20ce3cd6d30103a27f18edff337da191b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Oct 2022 09:56:20 +0200 Subject: acl: conver higher-level helpers to rely on mnt_idmap Convert an initial portion to rely on struct mnt_idmap by converting the high level xattr helpers. Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/internal.h | 12 ++++++------ fs/posix_acl.c | 16 +++++++++------- fs/xattr.c | 39 ++++++++++++++++++++------------------- 3 files changed, 35 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index 0c8812fe7ca4..a803cc3cf716 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -227,28 +227,28 @@ struct xattr_ctx { }; -ssize_t do_getxattr(struct user_namespace *mnt_userns, +ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); -int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL -int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size); -ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size); #else -static inline int do_set_acl(struct user_namespace *mnt_userns, +static inline int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { return -EOPNOTSUPP; } -static inline ssize_t do_get_acl(struct user_namespace *mnt_userns, +static inline ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 989bbf280bfe..7260b59b26a0 100644 --- a/fs/posix_acl.c +++ 
b/fs/posix_acl.c @@ -871,7 +871,7 @@ EXPORT_SYMBOL (posix_acl_to_xattr); /** * vfs_posix_acl_to_xattr - convert from kernel to userspace representation - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @inode: inode the posix acls are set on * @acl: the posix acls as represented by the vfs * @buffer: the buffer into which to convert @acl @@ -884,7 +884,7 @@ EXPORT_SYMBOL (posix_acl_to_xattr); * Return: On success, the size of the stored uapi posix acls, on error a * negative errno. */ -static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, +static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, void *buffer, size_t size) @@ -893,6 +893,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; struct user_namespace *fs_userns, *caller_userns; + struct user_namespace *mnt_userns; ssize_t real_size, n; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -908,6 +909,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, fs_userns = i_user_ns(inode); caller_userns = current_user_ns(); + mnt_userns = mnt_idmap_owner(idmap); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); @@ -1227,7 +1229,7 @@ out_inode_unlock: } EXPORT_SYMBOL_GPL(vfs_remove_acl); -int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { int error; @@ -1243,22 +1245,22 @@ int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return PTR_ERR(acl); } - error = vfs_set_acl(mnt_userns, dentry, acl_name, acl); + error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl); posix_acl_release(acl); return error; } -ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { ssize_t error; struct posix_acl *acl; - acl = vfs_get_acl(mnt_userns, dentry, acl_name); + acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name); if (IS_ERR(acl)) return PTR_ERR(acl); - error = vfs_posix_acl_to_xattr(mnt_userns, d_inode(dentry), + error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry), acl, kvalue, size); posix_acl_release(acl); return error; diff --git a/fs/xattr.c b/fs/xattr.c index df3af9fa8c77..3641a0ce5380 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -597,19 +597,19 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) return error; } -int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) - return do_set_acl(mnt_userns, dentry, ctx->kname->name, + return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); - return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, + return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } static long -setxattr(struct user_namespace *mnt_userns, struct dentry *d, +setxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { @@ -627,7 +627,7 @@ setxattr(struct user_namespace *mnt_userns, struct dentry 
*d, if (error) return error; - error = do_setxattr(mnt_userns, d, &ctx); + error = do_setxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; @@ -646,7 +646,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(mnt_user_ns(path.mnt), path.dentry, name, + error = setxattr(mnt_idmap(path.mnt), path.dentry, name, value, size, flags); mnt_drop_write(path.mnt); } @@ -683,7 +683,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = setxattr(file_mnt_user_ns(f.file), + error = setxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size, flags); mnt_drop_write_file(f.file); @@ -696,7 +696,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, * Extended attribute GET operations */ ssize_t -do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, +do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx) { ssize_t error; @@ -711,9 +711,10 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, } if (is_posix_acl_xattr(ctx->kname->name)) - error = do_get_acl(mnt_userns, d, kname, ctx->kvalue, ctx->size); + error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); else - error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); + error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname, + ctx->kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; @@ -727,7 +728,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, } static ssize_t -getxattr(struct user_namespace *mnt_userns, struct dentry *d, +getxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, void __user *value, size_t size) { ssize_t error; @@ -746,7 +747,7 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error < 0) return error; - error = do_getxattr(mnt_userns, d, &ctx); + error = do_getxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; @@ -762,7 +763,7 @@ retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; - error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); + error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -792,7 +793,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, + error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size); fdput(f); return error; @@ -877,7 +878,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) * Extended attribute REMOVE operations */ static long -removexattr(struct user_namespace *mnt_userns, struct dentry *d, +removexattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name) { int error; @@ -890,9 +891,9 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d, return error; if (is_posix_acl_xattr(kname)) - return vfs_remove_acl(mnt_userns, d, kname); + return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname); - return vfs_removexattr(mnt_userns, d, kname); + return vfs_removexattr(mnt_idmap_owner(idmap), d, kname); } static int path_removexattr(const char __user *pathname, @@ -906,7 +907,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = 
removexattr(mnt_user_ns(path.mnt), path.dentry, name); + error = removexattr(mnt_idmap(path.mnt), path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); @@ -939,7 +940,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = removexattr(file_mnt_user_ns(f.file), + error = removexattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } -- cgit From 7ee47dcfff1835ff75a794d1075b6b5f5462cfed Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 31 Oct 2022 18:52:56 +0100 Subject: fs: use acquire ordering in __fget_light() We must prevent the CPU from reordering the files->count read with the FD table access like this, on architectures where read-read reordering is possible: files_lookup_fd_raw() close_fd() put_files_struct() atomic_read(&files->count) I would like to mark this for stable, but the stable rules explicitly say "no theoretical races", and given that the FD table pointer and files->count are explicitly stored in the same cacheline, this sort of reordering seems quite unlikely in practice... Signed-off-by: Jann Horn Signed-off-by: Al Viro --- fs/file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 5f9c802a5d8d..c942c89ca4cd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1003,7 +1003,16 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) struct files_struct *files = current->files; struct file *file; - if (atomic_read(&files->count) == 1) { + /* + * If another thread is concurrently calling close_fd() followed + * by put_files_struct(), we must not observe the old table + * entry combined with the new refcount - otherwise we could + * return a file that is concurrently being freed. + * + * atomic_read_acquire() pairs with atomic_dec_and_test() in + * put_files_struct(). + */ + if (atomic_read_acquire(&files->count) == 1) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; -- cgit From 5a17f040fa332e71a45ca9ff02d6979d9176a423 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 26 Oct 2022 16:31:11 -0700 Subject: cred: Do not default to init_cred in prepare_kernel_cred() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common exploit pattern for ROP attacks is to abuse prepare_kernel_cred() in order to construct escalated privileges[1]. Instead of providing a short-hand argument (NULL) to the "daemon" argument to indicate using init_cred as the base cred, require that "daemon" is always set to an actual task. Replace all existing callers that were passing NULL with &init_task. Future attacks will need to have sufficiently powerful read/write primitives to have found an appropriately privileged task and written it to the ROP stack as an argument to succeed, which is similarly difficult to the prior effort needed to escalate privileges before struct cred existed: locate the current cred and overwrite the uid member. This has the added benefit of meaning that prepare_kernel_cred() can no longer exceed the privileges of the init task, which may have changed from the original init_cred (e.g. dropping capabilities from the bounding set). [1] https://google.com/search?q=commit_creds(prepare_kernel_cred(0)) Cc: "Eric W. Biederman" Cc: David Howells Cc: "Rafael J. 
Wysocki" Cc: Steve French Cc: Ronnie Sahlberg Cc: Shyam Prasad N Cc: Tom Talpey Cc: Namjae Jeon Cc: Trond Myklebust Cc: Anna Schumaker Cc: Chuck Lever Cc: Jeff Layton Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: "Michal Koutný" Cc: Peter Zijlstra Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Luis Chamberlain Reviewed-by: Sergey Senozhatsky Acked-by: Russ Weight Acked-by: Greg Kroah-Hartman Acked-by: Paulo Alcantara (SUSE) Link: https://lore.kernel.org/r/20221026232943.never.775-kees@kernel.org --- fs/cifs/cifs_spnego.c | 2 +- fs/cifs/cifsacl.c | 2 +- fs/ksmbd/smb_common.c | 2 +- fs/nfs/flexfilelayout/flexfilelayout.c | 4 ++-- fs/nfs/nfs4idmap.c | 2 +- fs/nfsd/nfs4callback.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 342717bf1dc2..6f3285f1dfee 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -189,7 +189,7 @@ init_cifs_spnego(void) * spnego upcalls. */ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fa480d62f313..574de2b225ae 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -465,7 +465,7 @@ init_cifs_idmap(void) * this is used to prevent malicious redirections from being installed * with add_key(). */ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index d96da872d70a..2a4fbbd55b91 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -623,7 +623,7 @@ int ksmbd_override_fsids(struct ksmbd_work *work) if (share->force_gid != KSMBD_SHARE_INVALID_GID) gid = share->force_gid; - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 1ec79ccf89ad..7deb3cd76abe 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -493,10 +493,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, gid = make_kgid(&init_user_ns, id); if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); else { unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); memalloc_nofs_restore(nofs_flags); } rc = -ENOMEM; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index e3fdd2f45b01..25a7c771cfd8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -203,7 +203,7 @@ int nfs_idmap_init(void) printk(KERN_NOTICE "NFS: Registering the %s key type\n", key_type_id_resolver.name); - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f0e69edf5f0f..4a9e8d17e56a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -870,7 +870,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r } else { struct cred *kcred; - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); if (!kcred) return NULL; -- cgit From 905889bc6c842d18f369bf2834cf7219f32709ae Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:13 -0700 Subject: btrfs: 
send: Proactively round up to kmalloc bucket size Instead of discovering the kmalloc bucket size _after_ allocation, round up proactively so the allocation is explicitly made for the full size, allowing the compiler to correctly reason about the resulting size of the buffer through the existing __alloc_size() hint. Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Acked-by: David Sterba Link: https://lore.kernel.org/lkml/20220922133014.GI32411@suse.cz Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220923202822.2667581-8-keescook@chromium.org --- fs/btrfs/send.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ef4167072b8..f53e8049473d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -438,6 +438,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) path_len = p->end - p->start; old_buf_len = p->buf_len; + /* + * Allocate to the next largest kmalloc bucket size, to let + * the fast path happen most of the time. + */ + len = kmalloc_size_roundup(len); /* * First time the inline_buf does not suffice */ @@ -451,11 +456,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (!tmp_buf) return -ENOMEM; p->buf = tmp_buf; - /* - * The real size of the buffer is bigger, this will let the fast path - * happen most of the time - */ - p->buf_len = ksize(p->buf); + p->buf_len = len; if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; -- cgit From 6dd142d9013ca82155d0c069434c60a0d5755ec0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 20 Sep 2022 14:13:05 -0700 Subject: coredump: Proactively round up to kmalloc bucket size Instead of discovering the kmalloc bucket size _after_ allocation, round up proactively so the allocation is explicitly made for the full size, allowing the compiler to correctly reason about the resulting size of the buffer through the existing __alloc_size() hint. 
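Reduced to a sketch, the pattern in both of these patches looks like this (buf and usable are illustrative names; the helpers are the ones used in the hunks below):

  size = kmalloc_size_roundup(size);      /* round up first ... */
  buf = krealloc(buf, size, GFP_KERNEL);  /* ... then allocate the full size */
  if (!buf)
          return -ENOMEM;
  usable = size;  /* instead of discovering it afterwards via ksize(buf) */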
Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook --- fs/coredump.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..97eaee325251 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -68,7 +68,10 @@ struct core_name { static int expand_corename(struct core_name *cn, int size) { - char *corename = krealloc(cn->corename, size, GFP_KERNEL); + char *corename; + + size = kmalloc_size_roundup(size); + corename = krealloc(cn->corename, size, GFP_KERNEL); if (!corename) return -ENOMEM; @@ -76,7 +79,7 @@ static int expand_corename(struct core_name *cn, int size) if (size > core_name_size) /* racy but harmless */ core_name_size = size; - cn->size = ksize(corename); + cn->size = size; cn->corename = corename; return 0; } -- cgit From 91586ce0d39a05f88795aa8814fb99b1387236b3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 21 Oct 2022 10:34:22 +0800 Subject: f2fs: fix to invalidate dcc->f2fs_issue_discard in error path Syzbot reports a NULL pointer dereference issue as below: __refcount_add include/linux/refcount.h:193 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] get_task_struct include/linux/sched/task.h:110 [inline] kthread_stop+0x34/0x1c0 kernel/kthread.c:703 f2fs_stop_discard_thread+0x3c/0x5c fs/f2fs/segment.c:1638 kill_f2fs_super+0x5c/0x194 fs/f2fs/super.c:4522 deactivate_locked_super+0x70/0xe8 fs/super.c:332 deactivate_super+0xd0/0xd4 fs/super.c:363 cleanup_mnt+0x1f8/0x234 fs/namespace.c:1186 __cleanup_mnt+0x20/0x30 fs/namespace.c:1193 task_work_run+0xc4/0x14c kernel/task_work.c:177 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x26c/0xbe0 kernel/exit.c:795 do_group_exit+0x60/0xe8 kernel/exit.c:925 __do_sys_exit_group kernel/exit.c:936 [inline] __se_sys_exit_group kernel/exit.c:934 [inline] __wake_up_parent+0x0/0x40 kernel/exit.c:934 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 The root cause of this issue is in the error path of f2fs_start_discard_thread(): it fails to invalidate dcc->f2fs_issue_discard, so a later kthread_stop() may access a stale pointer.
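The bug class in miniature (an illustrative sketch, not f2fs code):

  worker = kthread_run(fn, data, "example");
  if (IS_ERR(worker)) {
          err = PTR_ERR(worker);
          worker = NULL;  /* the missing step: without this, a later
                           * kthread_stop(worker) dereferences the
                           * ERR_PTR() cookie */
  }
  ...
  if (worker)             /* teardown path */
          kthread_stop(worker);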
Fixes: 4d67490498ac ("f2fs: Don't create discard thread when device doesn't support realtime discard") Reported-by: syzbot+035a381ea1afb63f098d@syzkaller.appspotmail.com Reported-by: syzbot+729c925c2d9fc495ddee@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf3d3fa4363..7a4f7c88b8b9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2025,8 +2025,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) + if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + } return err; } -- cgit From 18792e64c86dd7e34ba28e4f61faba472b7bf5fc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Oct 2022 23:09:28 +0800 Subject: f2fs: support fault injection for f2fs_is_valid_blkaddr() This patch supports injecting a fault into f2fs_is_valid_blkaddr() to simulate a caller accessing inconsistent data/meta block addresses. Usage: a) echo 262144 > /sys/fs/f2fs/<dev>/inject_type or b) mount -o fault_type=262144 <dev> <mountpoint> Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + 3 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0c82dae082aa..c00694a50222 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -171,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { + if (time_to_inject(sbi, FAULT_BLKADDR)) { + f2fs_show_injection_info(sbi, FAULT_BLKADDR); + return false; + } + switch (type) { case META_NAT: break; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e6355a5683b7..f57cb49dc383 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -60,6 +60,7 @@ enum { FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, + FAULT_BLKADDR, FAULT_MAX, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3834ead04620..df26fbe2bf58 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -61,6 +61,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR] = "invalid blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, -- cgit From 3688cbe39b7a9ef3feb73234fb351de33fd1da52 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:08:31 +0800 Subject: f2fs: remove batched_trim_sections node Commit 377224c47118 ("f2fs: don't split checkpoint in fstrim") obsoleted batch mode and the related sysfs entry. Since this testing sysfs node has been deprecated for a long time, let's remove it.
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/sysfs.c | 5 ----- 2 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f57cb49dc383..e990870bdab9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1063,9 +1063,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index df27afd71ef4..926b7a844362 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -488,9 +488,6 @@ out: return -EINVAL; } - if (!strcmp(a->attr.name, "trim_sections")) - return -EINVAL; - if (!strcmp(a->attr.name, "gc_urgent")) { if (t == 0) { sbi->gc_mode = GC_NORMAL; @@ -790,7 +787,6 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -919,7 +915,6 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), ATTR_LIST(pending_discard), - ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), -- cgit From 6359a1aaca527311b7145ec6eb16890a5ddf5214 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:24 +0800 Subject: f2fs: fix gc mode when gc_urgent_high_remaining is 1 Under the current logic, when gc_urgent_high_remaining is set to 1, the mode will be switched to normal at the beginning, instead of running in gc_urgent mode. Let's switch the gc mode back to normal when the gc ends. Fixes: 265576181b4a ("f2fs: remove gc_urgent_high_limited for cleanup") Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7b4be412cec0..d2e9c280773f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -96,16 +96,6 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) - sbi->gc_mode = GC_NORMAL; - } - spin_unlock(&sbi->gc_urgent_high_lock); - } - if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; @@ -162,6 +152,15 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: + if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_urgent_high_lock); + } sb_end_write(sbi->sb); } while (!kthread_should_stop()); -- cgit From 44b9d01f2ee32884a7de270394b0fb7f75f87dba Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:05:26 +0800 Subject: f2fs: cleanup in f2fs_create_flush_cmd_control() Just a cleanup for readability, no functional changes. Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7a4f7c88b8b9..0df47ad80efb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -620,12 +620,12 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err = 0; + int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) - return err; + return 0; goto init_thread; } @@ -638,7 +638,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) - return err; + return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, @@ -650,7 +650,7 @@ init_thread: return err; } - return err; + return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) -- cgit From b5f1a218ae5e4339130d6e733f0e63d623e09a2c Mon Sep 17 00:00:00 2001 From: Dongdong Zhang Date: Tue, 25 Oct 2022 17:40:36 +0800 Subject: f2fs: fix normal discard process In DPOLICY_BG mode, the two conditions "i + 1 < dpolicy->granularity" and "i < DEFAULT_DISCARD_GRANULARITY" conflict. If i = 15, the first condition is false, so the code falls through to the second condition and dispatches all small-granularity discards in __issue_discard_cmd_orderly(). The restriction the first condition places on small discards is thereby bypassed. The two conditions should align.
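Walking through the arithmetic (illustrative, assuming the default granularity of 16):

  int i = 15;
  bool stop    = (i + 1 < 16);  /* false: the loop is not cut off at i = 15 */
  bool orderly = (i < 16);      /* true: small discards get dispatched anyway */
  /* after the fix, the orderly check becomes (i + 1 < 16), which is false
   * here and therefore consistent with the cut-off condition */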
Fixes: 20ee4382322c ("f2fs: issue small discard by LBA order") Signed-off-by: Dongdong Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0df47ad80efb..38f6a2bcb158 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1448,7 +1448,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; -- cgit From 6047de5482c33d5f912cdc907336fde9ebc5714e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 01:54:01 +0800 Subject: f2fs: add barrier mount option This patch adds a mount option, barrier, in f2fs. The barrier option is the opposite of nobarrier. If this option is set, cache_flush commands are allowed to be issued. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index df26fbe2bf58..a247027711d8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -111,6 +111,7 @@ enum { Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, + Opt_barrier, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, @@ -187,6 +188,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, @@ -807,6 +809,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; + case Opt_barrier: + clear_opt(sbi, NOBARRIER); + break; case Opt_fastboot: set_opt(sbi, FASTBOOT); break; @@ -1940,6 +1945,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); + else + seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); if (test_opt(sbi, EXTENT_CACHE)) -- cgit From a995627e6dd81d4485d40ce64880017a080d71e6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Oct 2022 16:00:35 -0700 Subject: f2fs: allow to set compression for inlined file The commit below disallows setting compression on a newly created empty file which has inline_data. Let's fix it.
Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82cda1258227..f96bbfa8b399 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1915,6 +1915,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } else { + /* try to convert inline_data to support compression */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (!f2fs_may_compress(inode)) return -EINVAL; if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) -- cgit From c46867e9b9b8e0cdd6a5212c2b5ae616583a3bfd Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:32:26 +0800 Subject: f2fs: introduce max_ordered_discard sysfs node max_ordered_discard is currently a fixed value; change it to be configurable through a sysfs node. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e990870bdab9..fa8dc00dfb2b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -331,6 +331,8 @@ struct discard_entry { /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 +/* default maximum discard granularity of ordered discard, unit: block count */ +#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* max discard pend list number */ #define MAX_PLIST_NUM 512 @@ -410,6 +412,7 @@ struct discard_cmd_control { unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max.
interval between discard issue */ unsigned int discard_granularity; /* discard granularity */ + unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 38f6a2bcb158..c470b443615f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1448,7 +1448,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; @@ -2048,6 +2048,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 926b7a844362..8095345ebdad 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -483,6 +483,15 @@ out: return count; } + if (!strcmp(a->attr.name, "max_ordered_discard")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -786,6 +795,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); @@ -914,6 +924,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), + ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), -- cgit From a5029a57a2f3f2e2711f4ac2d876c3c83d1758fe Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Thu, 27 Oct 2022 20:01:05 +0900 Subject: f2fs: Fix typo in comments Change "truncateion" to "truncation". 
Signed-off-by: Keoseong Park Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f96bbfa8b399..c605a4f2bce2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,7 +571,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - /* Assumption: truncateion starts with cluster */ + /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); -- cgit From 146dbcbf17a6d07169e75224d949cc2670de2e20 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 27 Oct 2022 18:24:46 +0800 Subject: f2fs: fix return val in f2fs_start_ckpt_thread() Return PTR_ERR(cprc->f2fs_issue_ckpt) instead of -ENOMEM; Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/gc.c | 15 +++++++-------- fs/f2fs/segment.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c00694a50222..56f7d0d6a8b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1902,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(cprc->f2fs_issue_ckpt)) { + int err = PTR_ERR(cprc->f2fs_issue_ckpt); + cprc->f2fs_issue_ckpt = NULL; - return -ENOMEM; + return err; } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d2e9c280773f..15f56859966c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -171,13 +171,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0; gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; - } + if (!gc_th) + return -ENOMEM; gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; @@ -192,12 +189,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + + return 0; } void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c470b443615f..c4270cd6eaab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -620,7 +620,6 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; @@ -644,7 +643,8 @@ init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { - err = PTR_ERR(fcc->f2fs_issue_flush); + int err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; -- cgit From 7b02b2201893a71b881026cf574902019ab00db5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Oct 2022 17:30:26 +0800 Subject: f2fs: fix to destroy sbi->post_read_wq in 
error path of f2fs_fill_super() In error path of f2fs_fill_super(), this patch fixes to call f2fs_destroy_post_read_wq() once if we fail in f2fs_start_ckpt_thread(). Fixes: 261eeb9c1585 ("f2fs: introduce checkpoint_merge mount option") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a247027711d8..e6365f040171 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4531,9 +4531,9 @@ free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); - f2fs_destroy_post_read_wq(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); -- cgit From a3951cd199a5d26138532d4e55af41262237632e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:32:16 +0800 Subject: f2fs: introduce gc_mode sysfs node Revert "f2fs: make gc_urgent and gc_segment_mode sysfs node readable". Add a gc_mode sysfs node to show the current gc_mode as a string. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa8dc00dfb2b..662b27c19de1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1319,6 +1319,7 @@ enum { MAX_TIME, }; +/* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8095345ebdad..1fbd41c48328 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -143,6 +143,12 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } +static ssize_t gc_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -332,13 +338,8 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif - if (!strcmp(a->attr.name, "gc_urgent")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_mode]); - if (!strcmp(a->attr.name, "gc_segment_mode")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_segment_mode]); + return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { return sysfs_emit(buf, "%u\n", @@ -844,6 +845,7 @@ F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); +F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -926,6 +928,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), + ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), -- cgit From 23ddc81b087c8bd9a73272afa076e204dc2e5410 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Oct 2022 09:49:53 -0700 Subject: f2fs: use sysfs_emit instead of sprintf Let's use sysfs_emit. 
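For context, sysfs_emit() is the PAGE_SIZE-aware replacement for sprintf() in sysfs show() callbacks; a minimal generic sketch of the pattern (illustrative only, not taken from this patch):

/* Minimal sketch of a sysfs show() callback using sysfs_emit(): it
 * verifies buf is a page-aligned sysfs buffer and bounds the output
 * to PAGE_SIZE, which an open-coded sprintf() does not.
 */
#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t example_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", 42);	/* hypothetical value */
}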
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1fbd41c48328..af23ed6121b0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -95,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(dirty_segments(sbi))); } static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(free_segments(sbi))); } static ssize_t ovp_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(overprovision_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1))); @@ -125,13 +125,13 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, static ssize_t sb_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%lx\n", sbi->s_flag); + return sysfs_emit(buf, "%lx\n", sbi->s_flag); } static ssize_t cp_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); + return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); } static ssize_t pending_discard_show(struct f2fs_attr *a, @@ -139,7 +139,7 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, { if (!SM_I(sbi)->dcc_info) return -EINVAL; - return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } @@ -205,7 +205,7 @@ static ssize_t features_show(struct f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%u\n", sbi->current_reserved_blocks); + return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks); } static ssize_t unusable_show(struct f2fs_attr *a, @@ -217,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, unusable = sbi->unusable_block_count; else unusable = f2fs_get_unusable_blocks(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)unusable); + return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable); } static ssize_t encoding_show(struct f2fs_attr *a, @@ -232,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sprintf(buf, "(none)"); + return sysfs_emit(buf, "(none)"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS @@ -247,7 +247,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long 
long)(si->tot_blks - (si->bg_data_blks + si->bg_node_blks))); } @@ -257,7 +257,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); } @@ -268,7 +268,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, { si->dirty_count = dirty_segments(sbi); f2fs_update_sit_info(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); } #endif @@ -363,7 +363,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, ui = (unsigned int *)(ptr + a->offset); - return sprintf(buf, "%u\n", *ui); + return sysfs_emit(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -728,7 +728,7 @@ static void f2fs_sb_release(struct kobject *kobj) static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "supported\n"); + return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ @@ -741,8 +741,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (F2FS_HAS_FEATURE(sbi, a->id)) - return sprintf(buf, "supported\n"); - return sprintf(buf, "unsupported\n"); + return sysfs_emit(buf, "supported\n"); + return sysfs_emit(buf, "unsupported\n"); } #define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ -- cgit From e5a0db6a9e2eafe50e3ebc73a8285ae561e7d850 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:25 +0800 Subject: f2fs: replace gc_urgent_high_remaining with gc_remaining_trials The user can set the trial count limit for GC urgent and idle modes through the renamed gc_remaining_trials node. Once the GC thread reaches the limit, the mode finally falls back to GC normal mode. This was applied only to GC_URGENT; this patch extends it to GC_IDLE as well.
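A hedged usage sketch of the renamed node (the device name and the value are assumptions):

/* Hypothetical usage: give background GC a budget of 128 trials.
 * Assumes an f2fs instance on sda1; adjust the sysfs path accordingly.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/f2fs/sda1/gc_remaining_trials", "w");

	if (!f)
		return 1;
	fprintf(f, "128\n");
	return fclose(f) ? 1 : 0;
}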
Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 12 ++++++------ fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 12 ++++++------ 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 662b27c19de1..04ef4cce3d7f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1736,8 +1736,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ - spinlock_t gc_urgent_high_lock; - unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ + spinlock_t gc_remaining_trials_lock; + /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ + unsigned int gc_remaining_trials; /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 15f56859966c..6466db75af5d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -152,14 +152,14 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) sbi->gc_mode = GC_NORMAL; } - spin_unlock(&sbi->gc_urgent_high_lock); + spin_unlock(&sbi->gc_remaining_trials_lock); } sb_end_write(sbi->sb); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6365f040171..a43d8a46a6e5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3624,7 +3624,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; - spin_lock_init(&sbi->gc_urgent_high_lock); + spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index af23ed6121b0..032c03e09580 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -538,10 +538,10 @@ out: return count; } - if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { - spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_remaining = t; - spin_unlock(&sbi->gc_urgent_high_lock); + if (!strcmp(a->attr.name, "gc_remaining_trials")) { + spin_lock(&sbi->gc_remaining_trials_lock); + sbi->gc_remaining_trials = t; + spin_unlock(&sbi->gc_remaining_trials_lock); return count; } @@ -832,7 +832,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -961,7 +961,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), - ATTR_LIST(gc_urgent_high_remaining), + ATTR_LIST(gc_remaining_trials), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), -- 
cgit From 3b21b794b5797d35f4fad930b53b1cd881c12dd3 Mon Sep 17 00:00:00 2001 From: "wangkailong@jari.cn" Date: Sat, 29 Oct 2022 22:49:30 +0800 Subject: f2fs: replace ternary operator with max() Fix the following coccicheck warning: ./fs/f2fs/segment.c:877:24-25: WARNING opportunity for max() Signed-off-by: KaiLong Wang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c4270cd6eaab..aa4be7f25963 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -856,7 +856,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) } mutex_unlock(&dirty_i->seglist_lock); - unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE]; + unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; -- cgit From 5b52aebef8954cadff29918bb61d7fdc7be07837 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Nov 2022 08:18:46 +0100 Subject: ovl: call posix_acl_release() after error checking The current placement of posix_acl_release() in ovl_set_or_remove_acl() means it can be called on an error pointer instead of actual acls. Fix this by moving the posix_acl_release() call after the error handling. Fixes: 0e641857322f ("ovl: implement set acl method") # mainline only Reported-by: syzbot+3f6ef1c4586bb6fd1f61@syzkaller.appspotmail.com Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 77a77fd7a77b..ee6dfa577c93 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -621,11 +621,11 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, acl_name); revert_creds(old_cred); - posix_acl_release(real_acl); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); goto out_drop_write; } + posix_acl_release(real_acl); } if (!upperdentry) { -- cgit From 118e021b4b66f758f8e8f21dc0e5e0a4c721e69e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 7 Nov 2022 10:09:11 +1100 Subject: xfs: write page faults in iomap are not buffered writes When we reserve a delalloc region in xfs_buffered_write_iomap_begin, we mark the iomap as IOMAP_F_NEW so that the write context understands that it allocated the delalloc region. If we then fail that buffered write, xfs_buffered_write_iomap_end() checks for the IOMAP_F_NEW flag and if it is set, it punches out the unused delalloc region that was allocated for the write. The assumption this code makes is that all buffered write operations that can allocate space are run under an exclusive lock (i_rwsem). This is an invalid assumption: page faults in mmap()d regions call through this same function pair to map the file range being faulted, and this runs only holding the inode->i_mapping->invalidate_lock in shared mode. IOWs, we can have races between page faults and write() calls that fail the nested page cache write operation and result in data loss. That is, the failing iomap_end call will punch out the data that the other racing iomap iteration brought into the page cache. This can be reproduced with generic/34[46] if we arbitrarily fail page cache copy-in operations from write() syscalls.
Code analysis tells us that the iomap_page_mkwrite() function holds the already instantiated and uptodate folio locked across the iomap mapping iterations. Hence the folio cannot be removed from memory whilst we are mapping the range it covers, and as such we do not care if the mapping changes state underneath the iomap iteration loop: 1. If the folio is not already dirty, no writeback races are possible. 2. If we allocated the mapping (delalloc or unwritten), the folio cannot already be dirty. See #1. 3. If the folio is already dirty, it must be up to date. As we hold it locked, it cannot be reclaimed from memory. Hence we always have valid data in the page cache while iterating the mapping. 4. Valid data in the page cache can exist when the underlying mapping is DELALLOC, UNWRITTEN or WRITTEN. Having the mapping change from DELALLOC->UNWRITTEN or UNWRITTEN->WRITTEN does not change the data in the page - it only affects actions if we are initialising a new page. Hence #3 applies and we don't care about these extent map transitions racing with iomap_page_mkwrite(). 5. iomap_page_mkwrite() checks for page invalidation races (truncate, hole punch, etc) after it locks the folio. We also hold the mapping->invalidate_lock here, and hence the mapping cannot change due to extent removal operations while we are iterating the folio. As such, filesystems that don't use bufferheads will never fail the iomap_folio_mkwrite_iter() operation on the current mapping, regardless of whether the iomap should be considered stale. Further, the range we are asked to iterate is limited to the range inside EOF that the folio spans. Hence, for XFS, we will only map the exact range we are asked for, and we will only do speculative preallocation with delalloc if we are mapping a hole at the EOF page. The iterator will consume the entire range of the folio that is within EOF, and anything beyond the EOF block cannot be accessed. We never need to truncate this post-EOF speculative prealloc away in the context of the iomap_page_mkwrite() iterator because if it remains unused we'll remove it when the last reference to the inode goes away. Hence we don't actually need an .iomap_end() cleanup/error handling path at all for iomap_page_mkwrite() for XFS. This means we can separate the page fault processing from the complexity of the .iomap_end() processing in the buffered write path. This also means that the buffered write path will be able to take the mapping->invalidate_lock as necessary. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J.
Wong --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_iomap.c | 9 +++++++++ fs/xfs/xfs_iomap.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e462d39c840e..595a5bcf46b9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1325,7 +1325,7 @@ __xfs_filemap_fault( if (write_fault) { xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, - &xfs_buffered_write_iomap_ops); + &xfs_page_mkwrite_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { ret = filemap_fault(vmf); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..5cea069a38b4 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1187,6 +1187,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, }; +/* + * iomap_page_mkwrite() will never fail in a way that requires delalloc extents + * that it allocated to be revoked. Hence we do not need an .iomap_end method + * for this operation. + */ +const struct iomap_ops xfs_page_mkwrite_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, +}; + static int xfs_read_iomap_begin( struct inode *inode, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c782e8c0479c..0f62ab633040 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -47,6 +47,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; -- cgit From 42da66ac7bcb19181385e851094ceedfe7c81984 Mon Sep 17 00:00:00 2001 From: Michael Weiß Date: Mon, 24 Oct 2022 21:15:52 +0200 Subject: squashfs: enable idmapped mounts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For squashfs all needed functionality for idmapped mounts is already implemented by the generic handlers in the VFS. Thus, it is sufficient to just enable the corresponding FS_ALLOW_IDMAP flag to support idmapped mounts. We use this for unprivileged (user namespaced) containers based on squashfs images as rootfs in GyroidOS. A simple test using the mount-idmapped tool executed as user with uid=1000 looks as follows: $ mkdir test $ echo "test" > test/test_file $ mksquashfs test/ fs.img $ sudo mkdir /mnt/test $ sudo mkdir /mnt/mapped $ sudo mount fs.img -o loop /mnt/test/ $ sudo ./mount-idmapped --map-mount b:1000:2000:1 /mnt/test/ /mnt/mapped/ $ mount | tail -n2 fs.img on /mnt/test type squashfs (ro,relatime,errors=continue) fs.img on /mnt/mapped type squashfs (ro,relatime,idmapped,errors=continue) $ ls -lan /mnt/test/ total 5 drwxr-xr-x 2 1000 1000 32 Okt 24 13:36 . drwxr-xr-x 6 0 0 4096 Okt 24 13:38 .. -rw-r--r-- 1 1000 1000 5 Okt 24 13:36 test_file $ ls -lan /mnt/mapped/ total 5 drwxr-xr-x 2 2000 2000 32 Okt 24 13:36 . drwxr-xr-x 6 0 0 4096 Okt 24 13:38 .. 
-rw-r--r-- 1 2000 2000 5 Okt 24 13:36 test_file Signed-off-by: Michael Weiß Reviewed-by: Christian Brauner Reviewed-by: Phillip Lougher Signed-off-by: Christian Brauner (Microsoft) --- fs/squashfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 32565dafa7f3..2636cb354435 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -568,7 +568,7 @@ static struct file_system_type squashfs_fs_type = { .init_fs_context = squashfs_init_fs_context, .parameters = squashfs_fs_parameters, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("squashfs"); -- cgit From e7eda157c4071cd1e69f4b1687b0fbe1ae5e6f46 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Thu, 3 Nov 2022 16:12:05 +0100 Subject: fs: don't audit the capability check in simple_xattr_list() The check being unconditional may lead to unwanted denials reported by LSMs when a process has the capability granted by DAC, but denied by an LSM. In the case of SELinux such denials are a problem, since they can't be effectively filtered out via the policy and when not silenced, they produce noise that may hide a true problem or an attack. Checking for the capability only if any trusted xattr is actually present wouldn't really address the issue, since calling listxattr(2) on such a node on its own doesn't indicate an explicit attempt to see the trusted xattrs. Additionally, it could potentially leak the presence of trusted xattrs to an unprivileged user if they can check for the denials (e.g. through dmesg). Therefore, it's best (and simplest) to keep the check unconditional and instead use ns_capable_noaudit(), which will silence any associated LSM denials.
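The capability result feeds the trusted.* filter in simple_xattr_list(); an abridged sketch of that loop (simplified from fs/xattr.c, error handling trimmed):

/* Abridged sketch: 'trusted' only decides whether trusted.* names are
 * listed; the _noaudit variant keeps the unprivileged case quiet.
 */
bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
struct simple_xattr *xattr;

list_for_each_entry(xattr, &xattrs->head, list) {
	/* skip "trusted." attributes for unprivileged callers */
	if (!trusted && xattr_is_trusted(xattr->name))
		continue;
	err = xattr_list_one(&buffer, &remaining_size, xattr->name);
	if (err)
		break;
}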
Fixes: 38f38657444d ("xattr: extract simple_xattr code from tmpfs") Reported-by: Martin Pitt Suggested-by: Christian Brauner (Microsoft) Signed-off-by: Ondrej Mosnacek Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: Paul Moore Signed-off-by: Christian Brauner (Microsoft) --- fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..427b8cea1f96 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1140,7 +1140,7 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size, ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { - bool trusted = capable(CAP_SYS_ADMIN); + bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; -- cgit From f6b1a1cf1c3ee430d3f5e47847047ce789a690aa Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Sep 2022 20:04:34 +0800 Subject: ext4: fix use-after-free in ext4_ext_shift_extents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the starting position of our insert range happens to fall in the hole between two ext4_extent_idx blocks, then, because the lblk of the ext4_extent in the previous ext4_extent_idx is always less than start, the "extent" variable is accessed out of bounds and the following UAF is triggered: ================================================================== BUG: KASAN: use-after-free in ext4_ext_shift_extents+0x257/0x790 Read of size 4 at addr ffff88819807a008 by task fallocate/8010 CPU: 3 PID: 8010 Comm: fallocate Tainted: G E 5.10.0+ #492 Call Trace: dump_stack+0x7d/0xa3 print_address_description.constprop.0+0x1e/0x220 kasan_report.cold+0x67/0x7f ext4_ext_shift_extents+0x257/0x790 ext4_insert_range+0x5b6/0x700 ext4_fallocate+0x39e/0x3d0 vfs_fallocate+0x26f/0x470 ksys_fallocate+0x3a/0x70 __x64_sys_fallocate+0x4f/0x60 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ================================================================== For right shifts, we can divide them into the following situations: 1. When the first ee_block of ext4_extent_idx is greater than or equal to start, make right shifts directly from the first ee_block. 1) If it is greater than start, we need to continue searching in the previous ext4_extent_idx. 2) If it is equal to start, we can exit the loop (iterator=NULL). 2. When the first ee_block of ext4_extent_idx is less than start, then traverse from the last extent to find the first extent whose ee_block is less than start. 1) If extent is still the last extent after traversal, it means that the last ee_block of ext4_extent_idx is less than start, that is, start is located in the hole between idx and (idx+1), so we can exit the loop directly (break) without right shifts. 2) Otherwise, make right shifts at the corresponding position of the found extent, and then exit the loop (iterator=NULL).
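A hypothetical user-space outline of the trigger (offsets are assumptions; actually hitting the bug also requires an extent tree whose index blocks leave a hole at the chosen start):

/* Hypothetical outline: an insert-range whose start falls into the
 * hole between two extent index blocks drove the out-of-bounds access.
 * Offsets must be block-aligned; the file layout is an assumption.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/ext4/victim", O_RDWR);

	if (fd < 0)
		return 1;
	/* start at 16MiB, assumed to sit in an inter-index hole */
	return fallocate(fd, FALLOC_FL_INSERT_RANGE, 16 << 20, 1 << 20);
}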
Fixes: 331573febb6a ("ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20220922120434.1294789-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f1956288307f..6c399a8b22b3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5184,6 +5184,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * and it is decreased till we reach start. */ again: + ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else @@ -5227,14 +5228,21 @@ again: ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); - if (le32_to_cpu(extent->ee_block) > 0) + if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; - else - /* Beginning is reached, end of the loop */ + else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; - /* Update path extent in case we need to stop */ - while (le32_to_cpu(extent->ee_block) < start) + else { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + while (le32_to_cpu(extent->ee_block) >= start) + extent--; + + if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) + break; + extent++; + iterator = NULL; + } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, -- cgit From d96d0f9617793b6cd95b9b9a8fef66f69f8f6b1b Mon Sep 17 00:00:00 2001 From: Paulo Miguel Almeida Date: Wed, 12 Oct 2022 09:23:14 +1300 Subject: dlm: replace one-element array with fixed size array One-element arrays are deprecated. So, replace the one-element array with a fixed size array member in struct dlm_ls, and refactor the rest of the code accordingly. Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/228 Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 Link: https://lore.kernel.org/lkml/Y0W5jkiXUkpNl4ap@mail.google.com/ Signed-off-by: Paulo Miguel Almeida Reviewed-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 2 +- fs/dlm/lockspace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index e34c3d2639a5..94fadb619ba0 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -670,7 +670,7 @@ struct dlm_ls { void *ls_ops_arg; int ls_namelen; - char ls_name[1]; + char ls_name[DLM_LOCKSPACE_LEN + 1]; }; /* diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index bae050df7abf..9479c8110979 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -473,7 +473,7 @@ static int new_lockspace(const char *name, const char *cluster, error = -ENOMEM; - ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); + ls = kzalloc(sizeof(*ls), GFP_NOFS); if (!ls) goto out; memcpy(ls->ls_name, name, namelen); -- cgit From 08ae0547e75ec3d062b6b6b9cf4830c730df68df Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:11 -0400 Subject: fs: dlm: fix sock release if listen fails This patch fixes a double sock_release() call when listen() fails for the dlm lowcomms listen socket. The caller of dlm_listen_for_all() should never need to care about releasing the socket when dlm_listen_for_all() fails; the release is now done exactly once, at the point where listen() fails.
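Condensed from the patch context, the resulting ownership rule looks roughly like this (illustrative sketch; sock and listen_con are the lowcomms file-scope variables):

/* Sketch of the single-release rule after this fix: the function that
 * created the listen socket releases it exactly once where listen()
 * fails, so the fail_proto_ops unwind path must not close it again.
 */
result = sock->ops->listen(sock, 5);
if (result < 0) {
	dlm_close_sock(&listen_con.sock);	/* the one and only release */
	return result;
}
return 0;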
Cc: stable@vger.kernel.org Fixes: 2dc6b1158c28 ("fs: dlm: introduce generic listen") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 59f64c596233..2cb9f3b49e05 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1820,7 +1820,7 @@ static int dlm_listen_for_all(void) result = sock->ops->listen(sock, 5); if (result < 0) { dlm_close_sock(&listen_con.sock); - goto out; + return result; } return 0; @@ -2023,7 +2023,6 @@ fail_listen: dlm_proto_ops = NULL; fail_proto_ops: dlm_allow_conn = 0; - dlm_close_sock(&listen_con.sock); work_stop(); fail_local: deinit_local(); -- cgit From f0f4bb431bd543ed7bebbaea3ce326cfcd5388bc Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:12 -0400 Subject: fs: dlm: retry accept() until -EAGAIN or error returns This patch fixes a race when we get a socket data-ready event twice while the listen connection worker is queued. Currently it is served only once, but we need to keep accepting (in this case twice) until we hit -EAGAIN, which tells us there is no pending accept going on. This patch wraps the accept in a do-while loop until we receive a return value different from 0, as was done before commit d11ccd451b65 ("fs: dlm: listen socket out of connection hash"). Cc: stable@vger.kernel.org Fixes: d11ccd451b65 ("fs: dlm: listen socket out of connection hash") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2cb9f3b49e05..871d4e9f49fb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1543,7 +1543,11 @@ static void process_recv_sockets(struct work_struct *work) static void process_listen_recv_socket(struct work_struct *work) { - accept_from_sock(&listen_con); + int ret; + + do { + ret = accept_from_sock(&listen_con); + } while (!ret); } static void dlm_connect(struct connection *con) -- cgit From 57a5724ef0b332eb6e78250157910a006b01bf6e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:13 -0400 Subject: fs: dlm: remove send repeat remove handling This patch removes the send repeat remove handling. This handling exists to repeatedly send DLM_MSG_REMOVE messages in cases where the dlm stack thinks the first one was not received. In cases of message drops this functionality is necessary, but since the DLM midcomms layer guarantees there are no message drops between cluster nodes, this feature is no longer strictly necessary. Due to message delays/processing, a second send_repeat_remove() could be sent out while the first is still on its way. We remove the repeat remove handling because we are sure that the message cannot be dropped due to communication errors.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 74 ----------------------------------------------------------- 1 file changed, 74 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 94a72ede5764..b246d71b5e17 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -4044,66 +4044,6 @@ out: return error; } -static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) -{ - char name[DLM_RESNAME_MAXLEN + 1]; - struct dlm_message *ms; - struct dlm_mhandle *mh; - struct dlm_rsb *r; - uint32_t hash, b; - int rv, dir_nodeid; - - memset(name, 0, sizeof(name)); - memcpy(name, ms_name, len); - - hash = jhash(name, len, 0); - b = hash & (ls->ls_rsbtbl_size - 1); - - dir_nodeid = dlm_hash2nodeid(ls, hash); - - log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name); - - spin_lock(&ls->ls_rsbtbl[b].lock); - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); - if (!rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "repeat_remove on keep %s", name); - return; - } - - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (!rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "repeat_remove on toss %s", name); - return; - } - - /* use ls->remove_name2 to avoid conflict with shrink? */ - - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = len; - memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - spin_unlock(&ls->ls_rsbtbl[b].lock); - - rv = _create_message(ls, sizeof(struct dlm_message) + len, - dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); - if (rv) - goto out; - - memcpy(ms->m_extra, name, len); - ms->m_hash = cpu_to_le32(hash); - - send_message(mh, ms); - -out: - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = 0; - memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - wake_up(&ls->ls_remove_wait); -} - static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; @@ -4173,25 +4113,11 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) ENOTBLK request failures when the lookup reply designating us as master is delayed. */ - /* We could repeatedly return -EBADR here if our send_remove() is - delayed in being sent/arriving/being processed on the dir node. - Another node would repeatedly lookup up the master, and the dir - node would continue returning our nodeid until our send_remove - took effect. - - We send another remove message in case our previous send_remove - was lost/ignored/missed somehow. */ - if (error != -ENOTBLK) { log_limit(ls, "receive_request %x from %d %d", le32_to_cpu(ms->m_lkid), from_nodeid, error); } - if (namelen && error == -EBADR) { - send_repeat_remove(ls, ms->m_extra, namelen); - msleep(1000); - } - setup_stub_lkb(ls, ms); send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); return error; -- cgit From 5b787667e87a373a2f8f70e6be2b5d99c408462f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:14 -0400 Subject: fs: dlm: use packet in dlm_mhandle To allow more than just dereferencing the inner header we directly point to the inner dlm packet which allows us to dereference the header, rcom or message structure. 
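In practice this gives tagged-union style access through one pointer; a rough sketch of the dereference pattern (field and constant names follow the patch):

/* Sketch: with inner_p pointing at the full union dlm_packet, the
 * commit path can branch on the inner command and reach message- or
 * rcom-specific fields, not just the common header.
 */
static void commit_sketch(const struct dlm_mhandle *mh)
{
	switch (mh->inner_p->header.h_cmd) {
	case DLM_MSG:
		/* message fields now reachable: mh->inner_p->message */
		break;
	case DLM_RCOM:
		/* recovery fields likewise: mh->inner_p->rcom */
		break;
	}
}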
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 6489bc22ad61..4eed40d66da1 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -194,7 +194,7 @@ struct midcomms_node { }; struct dlm_mhandle { - const struct dlm_header *inner_hd; + const union dlm_packet *inner_p; struct midcomms_node *node; struct dlm_opts *opts; struct dlm_msg *msg; @@ -1055,7 +1055,7 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node dlm_fill_opts_header(opts, len, mh->seq); *ppc += sizeof(*opts); - mh->inner_hd = (const struct dlm_header *)*ppc; + mh->inner_p = (const union dlm_packet *)*ppc; return msg; } @@ -1133,7 +1133,7 @@ err: static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) { /* nexthdr chain for fast lookup */ - mh->opts->o_nextcmd = mh->inner_hd->h_cmd; + mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; mh->committed = true; dlm_lowcomms_commit_msg(mh->msg); } -- cgit From e01c4b7bd41522ae0299c07e2ee8c721fee02595 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:15 -0400 Subject: fd: dlm: trace send/recv of dlm message and rcom This patch adds tracepoints for the send and recv cases of dlm messages and dlm rcom messages. In the send case of a dlm message, we add the name of the dlm rsb resource that the message belongs to. This has the advantage of being able to follow dlm messages on a per-lock basis. In the recv case, the resource name can be recovered by following the sequence number of the corresponding send message. The dlm message DLM_MSG_PURGE doesn't belong to a lock request and will not set the resource name in a dlm_message trace. The same holds for all rcom messages. The additional handling required for this debugging functionality is kept as small as possible. The midcomms layer also becomes aware of lock resource names; for now this is required to connect sequence numbers with lock resource names, and it is for debugging purposes only.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 21 +++++++++++---------- fs/dlm/midcomms.c | 45 +++++++++++++++++++++++++++++++++++++++++---- fs/dlm/midcomms.h | 3 ++- fs/dlm/rcom.c | 4 ++-- 4 files changed, 56 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b246d71b5e17..0b1bc24536ce 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3611,9 +3611,10 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, /* further lowcomms enhancements or alternate implementations may make the return value from this function useful at some point */ -static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) +static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms, + const void *name, int namelen) { - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, name, namelen); return 0; } @@ -3679,7 +3680,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) send_args(r, lkb, ms); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); if (error) goto fail; return 0; @@ -3742,7 +3743,7 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) ms->m_result = 0; - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3763,7 +3764,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) ms->m_bastmode = cpu_to_le32(mode); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3786,7 +3787,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) send_args(r, lkb, ms); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); if (error) goto fail; return 0; @@ -3811,7 +3812,7 @@ static int send_remove(struct dlm_rsb *r) memcpy(ms->m_extra, r->res_name, r->res_length); ms->m_hash = cpu_to_le32(r->res_hash); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3833,7 +3834,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, ms->m_result = cpu_to_le32(to_dlm_errno(rv)); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3874,7 +3875,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, ms->m_result = cpu_to_le32(to_dlm_errno(rv)); ms->m_nodeid = cpu_to_le32(ret_nodeid); - error = send_message(mh, ms); + error = send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in)); out: return error; } @@ -6300,7 +6301,7 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid) ms->m_nodeid = cpu_to_le32(nodeid); ms->m_pid = cpu_to_le32(pid); - return send_message(mh, ms); + return send_message(mh, ms, NULL, 0); } int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 4eed40d66da1..e10a6e97df44 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -132,6 +132,7 @@ */ #define DLM_DEBUG_FENCE_TERMINATION 0 +#include #include #include "dlm_internal.h" @@ -415,7 +416,7 @@ static int dlm_send_fin(struct midcomms_node *node, m_header->h_cmd = DLM_FIN; pr_debug("sending fin msg to node %d\n", node->nodeid); - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); return 0; @@ -474,6 +475,20 @@ static void dlm_pas_fin_ack_rcv(struct 
midcomms_node *node) spin_unlock(&node->state_lock); } +static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) +{ + switch (p->header.h_cmd) { + case DLM_MSG: + trace_dlm_recv_message(seq, &p->message); + break; + case DLM_RCOM: + trace_dlm_recv_rcom(seq, &p->rcom); + break; + default: + break; + } +} + static void dlm_midcomms_receive_buffer(union dlm_packet *p, struct midcomms_node *node, uint32_t seq) @@ -534,6 +549,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, break; default: WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer_3_2_trace(seq, p); dlm_receive_buffer(p, node->nodeid); set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); break; @@ -1130,11 +1146,30 @@ err: } #endif -static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) +static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, + const void *name, int namelen) +{ + switch (mh->inner_p->header.h_cmd) { + case DLM_MSG: + trace_dlm_send_message(mh->seq, &mh->inner_p->message, + name, namelen); + break; + case DLM_RCOM: + trace_dlm_send_rcom(mh->seq, &mh->inner_p->rcom); + break; + default: + /* nothing to trace */ + break; + } +} + +static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh, + const void *name, int namelen) { /* nexthdr chain for fast lookup */ mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; mh->committed = true; + dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen); dlm_lowcomms_commit_msg(mh->msg); } @@ -1142,8 +1177,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) * dlm_midcomms_get_mhandle */ #ifndef __CHECKER__ -void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, + const void *name, int namelen) { + switch (mh->node->version) { case DLM_VERSION_3_1: srcu_read_unlock(&nodes_srcu, mh->idx); @@ -1154,7 +1191,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: - dlm_midcomms_commit_msg_3_2(mh); + dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); break; default: diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 82bcd9661922..d6286e80b077 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -17,7 +17,8 @@ struct midcomms_node; int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); -void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh); +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, + int namelen); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); void dlm_midcomms_shutdown(void); diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f19860315043..b76d52e2f6bd 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -91,7 +91,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc) { - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); } static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc) @@ -516,7 +516,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rf = (struct rcom_config *) rc->rc_buf; rf->rf_lvblen = cpu_to_le32(~0U); - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); return 0; } -- cgit From 85839f27b17ddebe21d36a174050420783824ed3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: 
Thu, 27 Oct 2022 16:45:16 -0400 Subject: fs: dlm: let dlm_add_cb queue work after resume only We should allow dlm_add_cb() to call queue_work() only after recovery has queued the pending work for delayed lkbs. This patch moves the clearing of LSFL_CB_DELAY to after the delayed lkb work has been processed. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index d60a8d8f109d..6e07c151ad28 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -308,8 +308,6 @@ void dlm_callback_resume(struct dlm_ls *ls) if (!ls->ls_callback_wq) return; - clear_bit(LSFL_CB_DELAY, &ls->ls_flags); - more: mutex_lock(&ls->ls_cb_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { @@ -320,6 +318,8 @@ more: break; } empty = list_empty(&ls->ls_cb_delay); + if (empty) + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); mutex_unlock(&ls->ls_cb_mutex); sum += count; -- cgit From d3e4dc5d68c8fef719291cc9f3dc907aac494c55 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:17 -0400 Subject: fs: dlm: use list_first_entry macro Instead of using list_entry(), this patch moves to using the list_first_entry() macro. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/user.c b/fs/dlm/user.c index c5d27bccc3dc..6a5de0918a96 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -857,7 +857,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, without removing lkb_cb_list; so empty lkb_cb_list is always consistent with empty lkb_callbacks */ - lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); + lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list); /* rem_lkb_callback sets a new lkb_last_cast */ old_mode = lkb->lkb_last_cast.mode; -- cgit From a4c0352bb1094cbe242f4458e267de845790737a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:18 -0400 Subject: fs: dlm: convert ls_cb_mutex mutex to spinlock This patch converts the ls_cb_mutex mutex to a spinlock; there is no sleepable context while this lock is held.
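The conversion itself is mechanical; a condensed before/after sketch (valid only because nothing inside the critical section sleeps: no allocation, no blocking waits):

/* Condensed sketch of the mutex-to-spinlock conversion pattern. */
spin_lock_init(&ls->ls_cb_lock);	/* was: mutex_init(&ls->ls_cb_mutex) */

spin_lock(&ls->ls_cb_lock);		/* was: mutex_lock() */
list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
spin_unlock(&ls->ls_cb_lock);		/* was: mutex_unlock() */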
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 12 ++++++------ fs/dlm/dlm_internal.h | 2 +- fs/dlm/lockspace.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 6e07c151ad28..daaa0dff6ef4 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -200,13 +200,13 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, if (!prev_seq) { kref_get(&lkb->lkb_ref); - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); } else { queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); } - mutex_unlock(&ls->ls_cb_mutex); + spin_unlock(&ls->ls_cb_lock); } out: mutex_unlock(&lkb->lkb_cb_mutex); @@ -289,9 +289,9 @@ void dlm_callback_stop(struct dlm_ls *ls) void dlm_callback_suspend(struct dlm_ls *ls) { if (ls->ls_callback_wq) { - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); set_bit(LSFL_CB_DELAY, &ls->ls_flags); - mutex_unlock(&ls->ls_cb_mutex); + spin_unlock(&ls->ls_cb_lock); flush_workqueue(ls->ls_callback_wq); } @@ -309,7 +309,7 @@ void dlm_callback_resume(struct dlm_ls *ls) return; more: - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { list_del_init(&lkb->lkb_cb_list); queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); @@ -320,7 +320,7 @@ more: empty = list_empty(&ls->ls_cb_delay); if (empty) clear_bit(LSFL_CB_DELAY, &ls->ls_flags); - mutex_unlock(&ls->ls_cb_mutex); + spin_unlock(&ls->ls_cb_lock); sum += count; if (!empty) { diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 94fadb619ba0..fc4be8c35703 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -631,7 +631,7 @@ struct dlm_ls { /* recovery related */ - struct mutex ls_cb_mutex; + spinlock_t ls_cb_lock; struct list_head ls_cb_delay; /* save for queue_work later */ struct timer_list ls_timer; struct task_struct *ls_recoverd_task; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 9479c8110979..4965737705b7 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -567,7 +567,7 @@ static int new_lockspace(const char *name, const char *cluster, init_completion(&ls->ls_recovery_done); ls->ls_recovery_result = -1; - mutex_init(&ls->ls_cb_mutex); + spin_lock_init(&ls->ls_cb_lock); INIT_LIST_HEAD(&ls->ls_cb_delay); ls->ls_recoverd_task = NULL; -- cgit From 92e95733307e7b6dd352c12fa174089ed51e7208 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:19 -0400 Subject: fs: dlm: use spin lock instead of mutex There is no need to use a mutex in these hot path sections. Change it to a spinlock to serve callbacks faster by not allowing scheduling. The locked sections are not held for long.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 8 ++++---- fs/dlm/dlm_internal.h | 2 +- fs/dlm/lock.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index daaa0dff6ef4..3e76ec75bc55 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -190,7 +190,7 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, return; } - mutex_lock(&lkb->lkb_cb_mutex); + spin_lock(&lkb->lkb_cb_lock); prev_seq = lkb->lkb_callbacks[0].seq; rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); @@ -209,7 +209,7 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, spin_unlock(&ls->ls_cb_lock); } out: - mutex_unlock(&lkb->lkb_cb_mutex); + spin_unlock(&lkb->lkb_cb_lock); } void dlm_callback_work(struct work_struct *work) @@ -223,7 +223,7 @@ void dlm_callback_work(struct work_struct *work) memset(&callbacks, 0, sizeof(callbacks)); - mutex_lock(&lkb->lkb_cb_mutex); + spin_lock(&lkb->lkb_cb_lock); if (!lkb->lkb_callbacks[0].seq) { /* no callback work exists, shouldn't happen */ log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); @@ -244,7 +244,7 @@ void dlm_callback_work(struct work_struct *work) dlm_print_lkb(lkb); dlm_dump_lkb_callbacks(lkb); } - mutex_unlock(&lkb->lkb_cb_mutex); + spin_unlock(&lkb->lkb_cb_lock); castfn = lkb->lkb_astfn; bastfn = lkb->lkb_bastfn; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index fc4be8c35703..730808289a42 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -268,7 +268,7 @@ struct dlm_lkb { unsigned long lkb_timeout_cs; #endif - struct mutex lkb_cb_mutex; + spinlock_t lkb_cb_lock; struct work_struct lkb_cb_work; struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 0b1bc24536ce..40e4e4a1c582 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1218,7 +1218,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, INIT_LIST_HEAD(&lkb->lkb_time_list); #endif INIT_LIST_HEAD(&lkb->lkb_cb_list); - mutex_init(&lkb->lkb_cb_mutex); + spin_lock_init(&lkb->lkb_cb_lock); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); idr_preload(GFP_NOFS); -- cgit From 27d3994ebb5cea9c26f52064a3da8b0e606a8d11 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:20 -0400 Subject: fs: dlm: move last cast bast time to function call This patch moves recording of the last cast and bast debug timestamps to the point where the cast and bast callbacks are actually invoked.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 3e76ec75bc55..8393d2090c1c 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -158,15 +158,11 @@ int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, } } - if (cb->flags & DLM_CB_CAST) { + if (cb->flags & DLM_CB_CAST) memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); - lkb->lkb_last_cast_time = ktime_get(); - } - if (cb->flags & DLM_CB_BAST) { + if (cb->flags & DLM_CB_BAST) memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); - lkb->lkb_last_bast_time = ktime_get(); - } rv = 0; out: return rv; @@ -256,11 +252,13 @@ void dlm_callback_work(struct work_struct *work) continue; } else if (callbacks[i].flags & DLM_CB_BAST) { trace_dlm_bast(ls, lkb, callbacks[i].mode); + lkb->lkb_last_bast_time = ktime_get(); bastfn(lkb->lkb_astparam, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; trace_dlm_ast(ls, lkb); + lkb->lkb_last_cast_time = ktime_get(); castfn(lkb->lkb_astparam); } } -- cgit From 61bed0baa4dba17dd06cdfe20481a580718d6c7c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:21 -0400 Subject: fs: dlm: use a non-static queue for callbacks This patch introduces a queue implementation for callbacks using Linux lists. The current callback queue handling is implemented with a static limit of 6 entries, see DLM_CALLBACKS_SIZE. The sequence number inside the callback structure was used to see whether an entry inside the static array is valid or not. We don't need any sequence numbers anymore with a dynamic data structure that grows and shrinks during runtime to offer such functionality. We assume that every callback, once queued, will be delivered to the DLM user. Therefore the callback flag DLM_CB_SKIP was dropped, and the check for skipping a bast was moved before worker handling instead of being done while the callback worker executes. This reduces unnecessary queueing of the callback worker. All last-callback saves are pointers now and don't need to be copied over. A reference counter on the callback structures takes care of freeing them at the right time once they are no longer referenced.
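As a minimal, self-contained sketch of the new scheme (hypothetical names, illustrative only; the real structure is the reworked struct dlm_callback below):

    struct cb {
            struct list_head list;
            struct kref ref;        /* freed when the last holder drops it */
    };

    static void cb_release(struct kref *ref)
    {
            kfree(container_of(ref, struct cb, ref));
    }

    static int cb_enqueue(struct list_head *queue, gfp_t gfp)
    {
            struct cb *cb = kmalloc(sizeof(*cb), gfp);

            if (!cb)
                    return -ENOMEM;
            kref_init(&cb->ref);    /* the queue owns one reference */
            list_add_tail(&cb->list, queue);
            return 0;
    }

    static void cb_dequeue(struct list_head *queue)
    {
            struct cb *cb = list_first_entry_or_null(queue, struct cb, list);

            if (!cb)
                    return;
            list_del(&cb->list);
            kref_put(&cb->ref, cb_release); /* may free cb here */
    }

A "last callback" pointer simply takes an extra kref_get() and drops it later with kref_put(), which is how lkb_last_cast and lkb_last_cb can outlive the queue entry without copying it.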
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 294 +++++++++++++++++++++----------------------------- fs/dlm/ast.h | 17 ++- fs/dlm/debug_fs.c | 2 +- fs/dlm/dlm_internal.h | 15 +-- fs/dlm/lock.c | 8 +- fs/dlm/memory.c | 26 +++++ fs/dlm/memory.h | 2 + fs/dlm/user.c | 73 +++++++------ fs/dlm/user.h | 2 +- 9 files changed, 222 insertions(+), 217 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 8393d2090c1c..078bbbd43a53 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -12,55 +12,68 @@ #include #include "dlm_internal.h" +#include "memory.h" #include "lock.h" #include "user.h" #include "ast.h" -static uint64_t dlm_cb_seq; -static DEFINE_SPINLOCK(dlm_cb_seq_spin); +void dlm_release_callback(struct kref *ref) +{ + struct dlm_callback *cb = container_of(ref, struct dlm_callback, ref); + + dlm_free_cb(cb); +} + +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to) +{ + if (*from) + kref_put(&(*from)->ref, dlm_release_callback); + + if (to) + kref_get(&to->ref); + + *from = to; +} -static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) { - int i; - - log_print("last_bast %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_last_bast.seq, - lkb->lkb_last_bast.flags, - lkb->lkb_last_bast.mode, - lkb->lkb_last_bast.sb_status, - lkb->lkb_last_bast.sb_flags); - - log_print("last_cast %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_last_cast.seq, - lkb->lkb_last_cast.flags, - lkb->lkb_last_cast.mode, - lkb->lkb_last_cast.sb_status, - lkb->lkb_last_cast.sb_flags); - - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - log_print("cb %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_callbacks[i].seq, - lkb->lkb_callbacks[i].flags, - lkb->lkb_callbacks[i].mode, - lkb->lkb_callbacks[i].sb_status, - lkb->lkb_callbacks[i].sb_flags); + struct dlm_callback *cb, *safe; + + list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) { + list_del(&cb->list); + kref_put(&cb->ref, dlm_release_callback); } + + /* TODO */ + lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + + /* invalidate */ + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + lkb->lkb_last_bast_mode = -1; } -int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq) +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - uint64_t prev_seq; + int rv = DLM_ENQUEUE_CALLBACK_SUCCESS; + struct dlm_callback *cb; int prev_mode; - int i, rv; - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (lkb->lkb_callbacks[i].seq) - continue; + if (flags & DLM_CB_BAST) { + /* if cb is a bast, it should be skipped if the blocking mode is + * compatible with the last granted mode + */ + if (lkb->lkb_last_cast) { + if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) { + log_debug(ls, "skip %x bast mode %d for cast mode %d", + lkb->lkb_id, mode, + lkb->lkb_last_cast->mode); + goto out; + } + } /* * Suppress some redundant basts here, do more on removal. @@ -68,132 +81,75 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, * is a bast for the same mode or a more restrictive mode. 
* (the addional > PR check is needed for PR/CW inversion) */ - - if ((i > 0) && (flags & DLM_CB_BAST) && - (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) { - - prev_seq = lkb->lkb_callbacks[i-1].seq; - prev_mode = lkb->lkb_callbacks[i-1].mode; + if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) { + prev_mode = lkb->lkb_last_cb->mode; if ((prev_mode == mode) || (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { - - log_debug(ls, "skip %x add bast %llu mode %d " - "for bast %llu mode %d", - lkb->lkb_id, - (unsigned long long)seq, - mode, - (unsigned long long)prev_seq, - prev_mode); - rv = 0; + log_debug(ls, "skip %x add bast mode %d for bast mode %d", + lkb->lkb_id, mode, prev_mode); goto out; } } - - lkb->lkb_callbacks[i].seq = seq; - lkb->lkb_callbacks[i].flags = flags; - lkb->lkb_callbacks[i].mode = mode; - lkb->lkb_callbacks[i].sb_status = status; - lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); - rv = 0; - break; } - if (i == DLM_CALLBACKS_SIZE) { - log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, (unsigned long long)seq, - flags, mode, status, sbflags); - dlm_dump_lkb_callbacks(lkb); - rv = -1; + cb = dlm_allocate_cb(); + if (!cb) { + rv = DLM_ENQUEUE_CALLBACK_FAILURE; goto out; } - out: - return rv; -} - -int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_callback *cb, int *resid) -{ - int i, rv; - - *resid = 0; - - if (!lkb->lkb_callbacks[0].seq) { - rv = -ENOENT; - goto out; - } - - /* oldest undelivered cb is callbacks[0] */ - - memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback)); - memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback)); - /* shift others down */ - - for (i = 1; i < DLM_CALLBACKS_SIZE; i++) { - if (!lkb->lkb_callbacks[i].seq) - break; - memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i], - sizeof(struct dlm_callback)); - memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback)); - (*resid)++; + cb->flags = flags; + cb->mode = mode; + cb->sb_status = status; + cb->sb_flags = (sbflags & 0x000000FF); + kref_init(&cb->ref); + if (!(lkb->lkb_flags & DLM_IFL_NEED_SCHED)) { + lkb->lkb_flags |= DLM_IFL_NEED_SCHED; + rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED; } + list_add_tail(&cb->list, &lkb->lkb_callbacks); - /* if cb is a bast, it should be skipped if the blocking mode is - compatible with the last granted mode */ - - if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) { - if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) { - cb->flags |= DLM_CB_SKIP; - - log_debug(ls, "skip %x bast %llu mode %d " - "for cast %llu mode %d", - lkb->lkb_id, - (unsigned long long)cb->seq, - cb->mode, - (unsigned long long)lkb->lkb_last_cast.seq, - lkb->lkb_last_cast.mode); - rv = 0; - goto out; - } - } + if (flags & DLM_CB_CAST) + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb); - if (cb->flags & DLM_CB_CAST) - memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb); - if (cb->flags & DLM_CB_BAST) - memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); - rv = 0; out: return rv; } +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb) +{ + /* oldest undelivered cb is callbacks first entry */ + *cb = list_first_entry_or_null(&lkb->lkb_callbacks, + struct dlm_callback, list); + if (!*cb) + return DLM_DEQUEUE_CALLBACK_EMPTY; + + /* remove it from callbacks so shift others down */ + list_del(&(*cb)->list); + if (list_empty(&lkb->lkb_callbacks)) + return DLM_DEQUEUE_CALLBACK_LAST; + + return 
DLM_DEQUEUE_CALLBACK_SUCCESS; +} + void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - uint64_t new_seq, prev_seq; int rv; - spin_lock(&dlm_cb_seq_spin); - new_seq = ++dlm_cb_seq; - if (!dlm_cb_seq) - new_seq = ++dlm_cb_seq; - spin_unlock(&dlm_cb_seq_spin); - if (lkb->lkb_flags & DLM_IFL_USER) { - dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq); + dlm_user_add_ast(lkb, flags, mode, status, sbflags); return; } spin_lock(&lkb->lkb_cb_lock); - prev_seq = lkb->lkb_callbacks[0].seq; - - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); - if (rv < 0) - goto out; - - if (!prev_seq) { + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); spin_lock(&ls->ls_cb_lock); @@ -203,8 +159,16 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); } spin_unlock(&ls->ls_cb_lock); + break; + case DLM_ENQUEUE_CALLBACK_FAILURE: + WARN_ON(1); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON(1); + break; } - out: spin_unlock(&lkb->lkb_cb_lock); } @@ -214,53 +178,43 @@ void dlm_callback_work(struct work_struct *work) struct dlm_ls *ls = lkb->lkb_resource->res_ls; void (*castfn) (void *astparam); void (*bastfn) (void *astparam, int mode); - struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; - int i, rv, resid; - - memset(&callbacks, 0, sizeof(callbacks)); + struct dlm_callback *cb; + int rv; spin_lock(&lkb->lkb_cb_lock); - if (!lkb->lkb_callbacks[0].seq) { - /* no callback work exists, shouldn't happen */ - log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); - dlm_print_lkb(lkb); - dlm_dump_lkb_callbacks(lkb); - } - - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); - if (rv < 0) - break; - } - - if (resid) { - /* cbs remain, loop should have removed all, shouldn't happen */ - log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id, - resid); - dlm_print_lkb(lkb); - dlm_dump_lkb_callbacks(lkb); - } + rv = dlm_dequeue_lkb_callback(lkb, &cb); spin_unlock(&lkb->lkb_cb_lock); - castfn = lkb->lkb_astfn; - bastfn = lkb->lkb_bastfn; + if (WARN_ON(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) + return; - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (!callbacks[i].seq) - break; - if (callbacks[i].flags & DLM_CB_SKIP) { - continue; - } else if (callbacks[i].flags & DLM_CB_BAST) { - trace_dlm_bast(ls, lkb, callbacks[i].mode); + for (;;) { + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; + + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(ls, lkb, cb->mode); lkb->lkb_last_bast_time = ktime_get(); - bastfn(lkb->lkb_astparam, callbacks[i].mode); - } else if (callbacks[i].flags & DLM_CB_CAST) { - lkb->lkb_lksb->sb_status = callbacks[i].sb_status; - lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + lkb->lkb_last_bast_mode = cb->mode; + bastfn(lkb->lkb_astparam, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; trace_dlm_ast(ls, lkb); lkb->lkb_last_cast_time = ktime_get(); castfn(lkb->lkb_astparam); } + + kref_put(&cb->ref, dlm_release_callback); + + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_dequeue_lkb_callback(lkb, &cb); + if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) { + lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + spin_unlock(&lkb->lkb_cb_lock); + break; + } + 
spin_unlock(&lkb->lkb_cb_lock); } /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index e5e05fcc5813..880b11882495 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -11,13 +11,22 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq); -int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_callback *cb, int *resid); +#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1 +#define DLM_ENQUEUE_CALLBACK_SUCCESS 0 +#define DLM_ENQUEUE_CALLBACK_FAILURE -1 +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags); +#define DLM_DEQUEUE_CALLBACK_EMPTY 2 +#define DLM_DEQUEUE_CALLBACK_LAST 1 +#define DLM_DEQUEUE_CALLBACK_SUCCESS 0 +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb); void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags); +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to); +void dlm_release_callback(struct kref *ref); +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb); void dlm_callback_work(struct work_struct *work); int dlm_callback_start(struct dlm_ls *ls); void dlm_callback_stop(struct dlm_ls *ls); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8fb04ebbafb5..8a0e1b1f74ad 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -246,7 +246,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, - lkb->lkb_last_bast.mode, + lkb->lkb_last_bast_mode, rsb_lookup, lkb->lkb_wait_type, lkb->lkb_lvbseq, diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 730808289a42..69e3928c756b 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -211,6 +211,7 @@ struct dlm_args { #endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +#define DLM_IFL_NEED_SCHED 0x04000000 /* least significant 2 bytes are message changed, they are full transmitted * but at receive side only the 2 bytes LSB will be set. 
* @@ -222,18 +223,17 @@ struct dlm_args { #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 -#define DLM_CALLBACKS_SIZE 6 - #define DLM_CB_CAST 0x00000001 #define DLM_CB_BAST 0x00000002 -#define DLM_CB_SKIP 0x00000004 struct dlm_callback { - uint64_t seq; uint32_t flags; /* DLM_CBF_ */ int sb_status; /* copy to lksb status */ uint8_t sb_flags; /* copy to lksb flags */ int8_t mode; /* rq mode of bast, gr mode of cast */ + + struct list_head list; + struct kref ref; }; struct dlm_lkb { @@ -271,9 +271,10 @@ struct dlm_lkb { spinlock_t lkb_cb_lock; struct work_struct lkb_cb_work; struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ - struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; - struct dlm_callback lkb_last_cast; - struct dlm_callback lkb_last_bast; + struct list_head lkb_callbacks; + struct dlm_callback *lkb_last_cast; + struct dlm_callback *lkb_last_cb; + int lkb_last_bast_mode; ktime_t lkb_last_cast_time; /* for debugging */ ktime_t lkb_last_bast_time; /* for debugging */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 40e4e4a1c582..f4b2fb17bbb1 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1209,6 +1209,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, if (!lkb) return -ENOMEM; + lkb->lkb_last_bast_mode = -1; lkb->lkb_nodeid = -1; lkb->lkb_grmode = DLM_LOCK_IV; kref_init(&lkb->lkb_ref); @@ -1218,6 +1219,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, INIT_LIST_HEAD(&lkb->lkb_time_list); #endif INIT_LIST_HEAD(&lkb->lkb_cb_list); + INIT_LIST_HEAD(&lkb->lkb_callbacks); spin_lock_init(&lkb->lkb_cb_lock); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); @@ -6221,8 +6223,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - memset(&lkb->lkb_callbacks, 0, - sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + dlm_purge_lkb_callbacks(lkb); list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } @@ -6263,8 +6264,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) spin_lock(&proc->asts_spin); list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - memset(&lkb->lkb_callbacks, 0, - sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + dlm_purge_lkb_callbacks(lkb); list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index ce35c3c19aeb..61fe0d1f5646 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -14,12 +14,14 @@ #include "lowcomms.h" #include "config.h" #include "memory.h" +#include "ast.h" static struct kmem_cache *writequeue_cache; static struct kmem_cache *mhandle_cache; static struct kmem_cache *msg_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; +static struct kmem_cache *cb_cache; int __init dlm_memory_init(void) @@ -46,8 +48,16 @@ int __init dlm_memory_init(void) if (!rsb_cache) goto rsb; + cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback), + __alignof__(struct dlm_callback), 0, + NULL); + if (!cb_cache) + goto cb; + return 0; +cb: + kmem_cache_destroy(rsb_cache); rsb: kmem_cache_destroy(msg_cache); msg: @@ -67,6 +77,7 @@ void dlm_memory_exit(void) kmem_cache_destroy(msg_cache); kmem_cache_destroy(lkb_cache); kmem_cache_destroy(rsb_cache); + kmem_cache_destroy(cb_cache); } char *dlm_allocate_lvb(struct dlm_ls *ls) @@ -115,6 +126,11 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kfree(ua); } } + + /* drop references if they are set */ +
dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + kmem_cache_free(lkb_cache, lkb); } @@ -147,3 +163,13 @@ void dlm_free_msg(struct dlm_msg *msg) { kmem_cache_free(msg_cache, msg); } + +struct dlm_callback *dlm_allocate_cb(void) +{ + return kmem_cache_alloc(cb_cache, GFP_ATOMIC); +} + +void dlm_free_cb(struct dlm_callback *cb) +{ + kmem_cache_free(cb_cache, cb); +} diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 7bd3f1a391ca..c1583ec8b2cf 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -26,6 +26,8 @@ struct writequeue_entry *dlm_allocate_writequeue(void); void dlm_free_writequeue(struct writequeue_entry *writequeue); struct dlm_msg *dlm_allocate_msg(gfp_t allocation); void dlm_free_msg(struct dlm_msg *msg); +struct dlm_callback *dlm_allocate_cb(void); +void dlm_free_cb(struct dlm_callback *cb); #endif /* __MEMORY_DOT_H__ */ diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 6a5de0918a96..6b530db4bc0b 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -25,6 +25,7 @@ #include "user.h" #include "ast.h" #include "config.h" +#include "memory.h" static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; @@ -175,7 +176,7 @@ static int lkb_is_endoflife(int mode, int status) being removed and then remove that lkb from the orphans list and free it */ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq) + int status, uint32_t sbflags) { struct dlm_ls *ls; struct dlm_user_args *ua; @@ -209,16 +210,22 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, spin_lock(&proc->asts_spin); - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); - if (rv < 0) { + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_FAILURE: spin_unlock(&proc->asts_spin); + WARN_ON(1); goto out; - } - - if (list_empty(&lkb->lkb_cb_list)) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_cb_list, &proc->asts); wake_up_interruptible(&proc->wait); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON(1); + break; } spin_unlock(&proc->asts_spin); @@ -800,8 +807,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, struct dlm_user_proc *proc = file->private_data; struct dlm_lkb *lkb; DECLARE_WAITQUEUE(wait, current); - struct dlm_callback cb; - int rv, resid, copy_lvb = 0; + struct dlm_callback *cb; + int rv, copy_lvb = 0; int old_mode, new_mode; if (count == sizeof(struct dlm_device_version)) { @@ -860,50 +867,56 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list); /* rem_lkb_callback sets a new lkb_last_cast */ - old_mode = lkb->lkb_last_cast.mode; + old_mode = lkb->lkb_last_cast->mode; - rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); - if (rv < 0) { + rv = dlm_dequeue_lkb_callback(lkb, &cb); + switch (rv) { + case DLM_DEQUEUE_CALLBACK_EMPTY: /* this shouldn't happen; lkb should have been removed from - list when resid was zero */ + * list when last item was dequeued + */ log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); list_del_init(&lkb->lkb_cb_list); spin_unlock(&proc->asts_spin); /* removes ref for proc->asts, may cause lkb to be freed */ dlm_put_lkb(lkb); + WARN_ON(1); goto try_another; - } - if (!resid) + case DLM_DEQUEUE_CALLBACK_LAST: 
list_del_init(&lkb->lkb_cb_list); - spin_unlock(&proc->asts_spin); - - if (cb.flags & DLM_CB_SKIP) { - /* removes ref for proc->asts, may cause lkb to be freed */ - if (!resid) - dlm_put_lkb(lkb); - goto try_another; + /* TODO */ + lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + break; + case DLM_DEQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON(1); + break; } + spin_unlock(&proc->asts_spin); - if (cb.flags & DLM_CB_BAST) { - trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode); - } else if (cb.flags & DLM_CB_CAST) { - new_mode = cb.mode; + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + new_mode = cb->mode; - if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && + if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr && dlm_lvb_operations[old_mode + 1][new_mode + 1]) copy_lvb = 1; - lkb->lkb_lksb->sb_status = cb.sb_status; - lkb->lkb_lksb->sb_flags = cb.sb_flags; + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; trace_dlm_ast(lkb->lkb_resource->res_ls, lkb); } rv = copy_result_to_user(lkb->lkb_ua, test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), - cb.flags, cb.mode, copy_lvb, buf, count); + cb->flags, cb->mode, copy_lvb, buf, count); + + kref_put(&cb->ref, dlm_release_callback); /* removes ref for proc->asts, may cause lkb to be freed */ - if (!resid) + if (rv == DLM_DEQUEUE_CALLBACK_LAST) dlm_put_lkb(lkb); return rv; diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 6b9bce6b96e0..33059452d79e 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -7,7 +7,7 @@ #define __USER_DOT_H__ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq); + int status, uint32_t sbflags); int dlm_user_init(void); void dlm_user_exit(void); int dlm_device_deregister(struct dlm_ls *ls); -- cgit From e1711fe3fd59fa1e34e02add2e9c188441c12021 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:22 -0400 Subject: fs: dlm: allow different allocation context per _create_message This patch allows the caller to control the allocation context on a per-message basis. Currently all messages are forced to be created in GFP_NOFS context.
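The shape of the change, reduced to a hedged sketch (hypothetical helper, not the dlm code):

    /* before: the allocation context is fixed at the lowest layer */
    static void *msg_alloc(size_t len)
    {
            return kmalloc(len, GFP_NOFS);
    }

    /* after: the caller decides, e.g. GFP_NOFS or GFP_ATOMIC */
    static void *msg_alloc(size_t len, gfp_t allocation)
    {
            return kmalloc(len, allocation);
    }

This matters because a caller that holds a spinlock must pass GFP_ATOMIC, while ordinary paths keep GFP_NOFS; the next patch in the series (send_remove() under the rsb table lock) relies on exactly that.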
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 31 +++++++++++++++++++------------ fs/dlm/memory.c | 4 ++-- fs/dlm/memory.h | 2 +- fs/dlm/midcomms.c | 2 +- 4 files changed, 23 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f4b2fb17bbb1..a2930e33c134 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3554,7 +3554,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, static int _create_message(struct dlm_ls *ls, int mb_len, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret) + struct dlm_mhandle **mh_ret, + gfp_t allocation) { struct dlm_message *ms; struct dlm_mhandle *mh; @@ -3564,7 +3565,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, pass into midcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb); if (!mh) return -ENOBUFS; @@ -3586,7 +3587,8 @@ static int _create_message(struct dlm_ls *ls, int mb_len, static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret) + struct dlm_mhandle **mh_ret, + gfp_t allocation) { int mb_len = sizeof(struct dlm_message); @@ -3607,7 +3609,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, } return _create_message(r->res_ls, mb_len, to_nodeid, mstype, - ms_ret, mh_ret); + ms_ret, mh_ret, allocation); } /* further lowcomms enhancements or alternate implementations may make @@ -3676,7 +3678,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) if (error) return error; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); if (error) goto fail; @@ -3737,7 +3739,8 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh); + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3758,7 +3761,8 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3783,7 +3787,8 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) if (error) return error; - error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh, + GFP_NOFS); if (error) goto fail; @@ -3807,7 +3812,8 @@ static int send_remove(struct dlm_rsb *r) to_nodeid = dlm_dir_nodeid(r); - error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3828,7 +3834,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); if (error) goto out; @@ -3869,7 +3875,8 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid); - error = create_message(r, NULL, nodeid, 
DLM_MSG_LOOKUP_REPLY, &ms, &mh); + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -6295,7 +6302,7 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid) int error; error = _create_message(ls, sizeof(struct dlm_message), nodeid, - DLM_MSG_PURGE, &ms, &mh); + DLM_MSG_PURGE, &ms, &mh, GFP_NOFS); if (error) return error; ms->m_nodeid = cpu_to_le32(nodeid); diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 61fe0d1f5646..eb7a08641fcf 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -134,9 +134,9 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kmem_cache_free(lkb_cache, lkb); } -struct dlm_mhandle *dlm_allocate_mhandle(void) +struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation) { - return kmem_cache_alloc(mhandle_cache, GFP_NOFS); + return kmem_cache_alloc(mhandle_cache, allocation); } void dlm_free_mhandle(struct dlm_mhandle *mhandle) diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index c1583ec8b2cf..6b29563d24f7 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -20,7 +20,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); void dlm_free_lkb(struct dlm_lkb *l); char *dlm_allocate_lvb(struct dlm_ls *ls); void dlm_free_lvb(char *l); -struct dlm_mhandle *dlm_allocate_mhandle(void); +struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation); void dlm_free_mhandle(struct dlm_mhandle *mhandle); struct writequeue_entry *dlm_allocate_writequeue(void); void dlm_free_writequeue(struct writequeue_entry *writequeue); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e10a6e97df44..3a4e20d6cd2d 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1097,7 +1097,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, /* this is a bug, however we going on and hope it will be resolved */ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); - mh = dlm_allocate_mhandle(); + mh = dlm_allocate_mhandle(allocation); if (!mh) goto err; -- cgit From 3872f87b09e2f274ecf477895f9d1f9b9bdcf04b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:23 -0400 Subject: fs: dlm: remove ls_remove_wait waitqueue This patch removes the ls_remove_wait waitqueue handling. The current handling tries to wait, before a lookup is sent out, for an identical resource name that is going to be removed, so that the remove message is sent out before the new lookup message. The reason is that after a lookup request and response, the specific remote rsb will actually be used. A remove message that followed would delete the rsb on the remote side while it is still being used. To get similar behaviour we simply send the remove message out while the rsb lookup lock is held and the rsb is removed from the toss list. Other find_rsb() calls then never have the chance to bring an rsb back to life while a remove message is being sent out (without holding the lock). This behaviour requires a non-sleepable context, which should be provided now, and that might be the reason why it was not implemented this way in the first place.
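Schematically, an illustrative reduction of the shrink_bucket() change below (not the literal hunk):

    spin_lock(&ls->ls_rsbtbl[b].lock);
    rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
    send_remove(r);         /* must not sleep, hence GFP_ATOMIC inside */
    spin_unlock(&ls->ls_rsbtbl[b].lock);

Because the bucket lock is held across send_remove(), no find_rsb() call can revive the rsb between its removal from the toss list and the remove message, which is why the previous patch made the message allocation context selectable.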
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 4 ---- fs/dlm/lock.c | 56 ++------------------------------------------------- fs/dlm/lockspace.c | 3 --- 3 files changed, 2 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 69e3928c756b..6318e0f51bc9 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -592,11 +592,7 @@ struct dlm_ls { int ls_new_rsb_count; struct list_head ls_new_rsb; /* new rsb structs */ - spinlock_t ls_remove_spin; - wait_queue_head_t ls_remove_wait; - char ls_remove_name[DLM_RESNAME_MAXLEN+1]; char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; - int ls_remove_len; int ls_remove_lens[DLM_REMOVE_NAMES_MAX]; struct list_head ls_nodes; /* current nodes in ls */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index a2930e33c134..e1adfa5aed05 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1589,37 +1589,6 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } -/* If there's an rsb for the same resource being removed, ensure - * that the remove message is sent before the new lookup message. - */ - -#define DLM_WAIT_PENDING_COND(ls, r) \ - (ls->ls_remove_len && \ - !rsb_cmp(r, ls->ls_remove_name, \ - ls->ls_remove_len)) - -static void wait_pending_remove(struct dlm_rsb *r) -{ - struct dlm_ls *ls = r->res_ls; - restart: - spin_lock(&ls->ls_remove_spin); - if (DLM_WAIT_PENDING_COND(ls, r)) { - log_debug(ls, "delay lookup for remove dir %d %s", - r->res_dir_nodeid, r->res_name); - spin_unlock(&ls->ls_remove_spin); - wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r)); - goto restart; - } - spin_unlock(&ls->ls_remove_spin); -} - -/* - * ls_remove_spin protects ls_remove_name and ls_remove_len which are - * read by other threads in wait_pending_remove. ls_remove_names - * and ls_remove_lens are only used by the scan thread, so they do - * not need protection. - */ - static void shrink_bucket(struct dlm_ls *ls, int b) { struct rb_node *n, *next; @@ -1701,11 +1670,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b) * list and sending the removal. Keeping this gap small is * important to keep us (the master node) from being out of sync * with the remote dir node for very long. - * - * From the time the rsb is removed from toss until just after - * send_remove, the rsb name is saved in ls_remove_name. A new - * lookup checks this to ensure that a new lookup message for the - * same resource name is not sent just before the remove message. 
*/ for (i = 0; i < remote_count; i++) { @@ -1752,22 +1716,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b) } rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - - /* block lookup of same name until we've sent remove */ - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = len; - memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - spin_unlock(&ls->ls_rsbtbl[b].lock); + send_remove(r); - - /* allow lookup of name again */ - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = 0; - memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - wake_up(&ls->ls_remove_wait); + spin_unlock(&ls->ls_rsbtbl[b].lock); dlm_free_rsb(r); } @@ -2718,8 +2668,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) return 0; } - wait_pending_remove(r); - r->res_first_lkid = lkb->lkb_id; send_lookup(r, lkb); return 1; @@ -3813,7 +3761,7 @@ static int send_remove(struct dlm_rsb *r) to_nodeid = dlm_dir_nodeid(r); error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, - GFP_NOFS); + GFP_ATOMIC); if (error) goto out; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 4965737705b7..e7135883cf78 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -524,9 +524,6 @@ static int new_lockspace(const char *name, const char *cluster, spin_lock_init(&ls->ls_rsbtbl[i].lock); } - spin_lock_init(&ls->ls_remove_spin); - init_waitqueue_head(&ls->ls_remove_wait); - for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, GFP_KERNEL); -- cgit From 194a3fb488f2760eda67c3ab0ce3a095e8006d72 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:24 -0400 Subject: fs: dlm: relax sending to allow receiving This patch additionally drops the sock_mutex when there is a burst of messages to send. Since we have acknowledge handling, we free sending buffers only when we receive an ack back; but if we are stuck looping in send_to_sock() because dlm sends a lot of messages and we never leave the loop, the sending buffers fill up very quickly. We can't receive during this iteration because the sock_mutex is held. This patch unlocks the sock_mutex so it is possible to receive messages while a burst of sending messages happens. This allows memory to be freed up, because acks that have already been received can be processed.
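In outline, the send loop becomes the following (illustrative; have_queued_data() stands in for the real writequeue iteration):

    again:
            count = 0;
            mutex_lock(&con->sock_mutex);
            while (have_queued_data(con)) {
                    /* ... send one writequeue entry ... */
                    if (++count >= MAX_SEND_MSG_COUNT) {
                            mutex_unlock(&con->sock_mutex); /* let the rx path run */
                            cond_resched();
                            goto again;     /* resume sending afterwards */
                    }
            }
            mutex_unlock(&con->sock_mutex);

The window without sock_mutex lets the receive path process incoming acks, so the buffers of already-acknowledged messages can be freed in the middle of a burst.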
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 871d4e9f49fb..b05c6d9b5102 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1418,7 +1418,10 @@ static void send_to_sock(struct connection *con) const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset, ret; - int count = 0; + int count; + +again: + count = 0; mutex_lock(&con->sock_mutex); if (con->sock == NULL) @@ -1453,14 +1456,16 @@ static void send_to_sock(struct connection *con) } else if (ret < 0) goto out; + spin_lock(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + /* Don't starve people filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { + spin_unlock(&con->writequeue_lock); + mutex_unlock(&con->sock_mutex); cond_resched(); - count = 0; + goto again; } - - spin_lock(&con->writequeue_lock); - writequeue_entry_complete(e, ret); } spin_unlock(&con->writequeue_lock); -- cgit From 9c693d76abb3956433a28994924046abf1a73ef4 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:25 -0400 Subject: fs: dlm: catch dlm_add_member() error This patch catches a possible dlm_add_member() error and delivers it to the dlm recovery handling. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/member.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 2af2ccfe43a9..923c01a8a0aa 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -573,7 +573,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) node = &rv->nodes[i]; if (dlm_is_member(ls, node->nodeid)) continue; - dlm_add_member(ls, node); + error = dlm_add_member(ls, node); + if (error) + return error; + log_rinfo(ls, "add member %d", node->nodeid); } -- cgit From 3e54c9e80e68b765d8877023d93f1eea1b9d1c54 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:26 -0400 Subject: fs: dlm: fix log of lowcomms vs midcomms This patch fixes a small logging issue: when dlm_midcomms_start() failed, the message claimed that the dlm subcomponent lowcomms had failed, but lowcomms sits behind the midcomms layer. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index e7135883cf78..96fe8f7ca09c 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -391,7 +391,7 @@ static int threads_start(void) /* Thread for sending/receiving messages for all lockspace's */ error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm lowcomms %d", error); + log_print("cannot start dlm midcomms %d", error); goto scand_fail; } -- cgit From 775af207464bd28a2086f8399c0b2a3f1f40c7ae Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:27 -0400 Subject: fs: dlm: use WARN_ON_ONCE() instead of WARN_ON() To avoid spamming the console with WARN_ON() reports about invalid states in the dlm midcomms hot path handling, switch to WARN_ON_ONCE() so we are told only once that there might be an issue with the midcomms state handling.
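A hedged example of the difference (EXPECTED is a placeholder, not a dlm symbol):

    /* prints a backtrace every time the condition is true */
    WARN_ON(node->state != EXPECTED);

    /* prints a single backtrace for the lifetime of the kernel */
    WARN_ON_ONCE(node->state != EXPECTED);

Both still evaluate to the condition's value, so surrounding error handling is unchanged; only the reporting frequency differs.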
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 3a4e20d6cd2d..32194a750fe1 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -469,7 +469,7 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -540,7 +540,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -548,7 +548,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer_3_2_trace(seq, p); dlm_receive_buffer(p, node->nodeid); set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); @@ -770,7 +770,7 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) goto out; } - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer(p, nodeid); break; case DLM_OPTS: @@ -1095,7 +1095,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, } /* this is a bug, however we going on and hope it will be resolved */ - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); mh = dlm_allocate_mhandle(allocation); if (!mh) @@ -1127,7 +1127,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, break; default: dlm_free_mhandle(mh); - WARN_ON(1); + WARN_ON_ONCE(1); goto err; } @@ -1196,7 +1196,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, break; default: srcu_read_unlock(&nodes_srcu, mh->idx); - WARN_ON(1); + WARN_ON_ONCE(1); break; } } @@ -1238,7 +1238,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -1356,7 +1356,7 @@ static void midcomms_node_release(struct rcu_head *rcu) { struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); - WARN_ON(atomic_read(&node->send_queue_cnt)); + WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); kfree(node); } -- cgit From 149562f7509404c382c32c3fa8a6ba356135e5cf Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:05 -0500 Subject: mm/hugetlb: add hugetlb_folio_subpool() helpers Allow hugetlbfs_migrate_folio to check and read subpool information by passing in a folio. Link: https://lkml.kernel.org/r/20220922154207.1575343-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . 
Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dd54f67e47fd..c5137607e523 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1091,10 +1091,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(&src->page)) { - hugetlb_set_page_subpool(&dst->page, - hugetlb_page_subpool(&src->page)); - hugetlb_set_page_subpool(&src->page, NULL); + if (hugetlb_folio_subpool(src)) { + hugetlb_set_folio_subpool(dst, + hugetlb_folio_subpool(src)); + hugetlb_set_folio_subpool(src, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) -- cgit From ece62684dcfb714b7d8452056b4a33d426b16457 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:06 -0500 Subject: hugetlbfs: convert hugetlb_delete_from_page_cache() to use folios Remove the last caller of delete_from_page_cache() by converting the code to its folio equivalent. Link: https://lkml.kernel.org/r/20220922154207.1575343-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5137607e523..00495fc128c5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,11 +364,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void hugetlb_delete_from_page_cache(struct page *page) +static void hugetlb_delete_from_page_cache(struct folio *folio) { - ClearPageDirty(page); - ClearPageUptodate(page); - delete_from_page_cache(page); + folio_clear_dirty(folio); + folio_clear_uptodate(folio); + filemap_remove_folio(folio); } /* @@ -574,8 +574,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); + VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); + hugetlb_delete_from_page_cache(folio); ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, -- cgit From 5d89c224328bce791d051bf60aa92d90bae93c01 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:41 +0800 Subject: fs/proc/kcore.c: use hotplug_memory_notifier() directly Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). 
Link: https://lkml.kernel.org/r/20220923033347.3935160-3-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index dff921f7ca33..7692a360972d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -638,10 +637,6 @@ static int __meminit kcore_callback(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block kcore_callback_nb __meminitdata = { - .notifier_call = kcore_callback, - .priority = 0, -}; static struct kcore_list kcore_vmalloc; @@ -694,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - register_hotmemory_notifier(&kcore_callback_nb); + hotplug_memory_notifier(kcore_callback, 0); return 0; } -- cgit From 1eeaa4fd39b0b1b3e986f8eab6978e69b01e3c5e Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:47 +0800 Subject: memory: move hotplug memory notifier priority to same file for easy sorting The priority of hotplug memory callback is defined in a different file. And there are some callers using numbers directly. Collect them together into include/linux/memory.h for easy reading. This allows us to sort their priorities more intuitively without additional comments. Link: https://lkml.kernel.org/r/20220923033347.3935160-9-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Christoph Lameter Cc: David Hildenbrand Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 7692a360972d..98f3289556e4 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -689,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI); return 0; } -- cgit From e025ab842ec35225b1a8e163d1f311beb9e38ce9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 18 Oct 2022 15:40:14 +0800 Subject: mm: remove kern_addr_valid() completely Most architectures (except arm64/x86/sparc) simply return 1 for kern_addr_valid(), which is only used in read_kcore(), and it calls copy_from_kernel_nofault() which could check whether the address is a valid kernel address. So as there is no need for kern_addr_valid(), let's remove it. Link: https://lkml.kernel.org/r/20221018074014.185687-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Geert Uytterhoeven [m68k] Acked-by: Heiko Carstens [s390] Acked-by: Christoph Hellwig Acked-by: Helge Deller [parisc] Acked-by: Michael Ellerman [powerpc] Acked-by: Guo Ren [csky] Acked-by: Catalin Marinas [arm64] Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Anton Ivanov Cc: Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Greg Ungerer Cc: H. 
Peter Anvin Cc: Huacai Chen Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 98f3289556e4..71157ee35c1a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -540,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) fallthrough; case KCORE_VMEMMAP: case KCORE_TEXT: - if (kern_addr_valid(start)) { - /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. - */ - if (copy_from_kernel_nofault(buf, (void *)start, - tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else { - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; - } + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; } } else { - if (clear_user(buffer, tsz)) { + if (copy_to_user(buffer, buf, tsz)) { ret = -EFAULT; goto out; } } -- cgit From 26215b7ee923b9251f7bb12c4e5f09dc465d35f2 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Fri, 21 Oct 2022 07:16:08 +0800 Subject: hugetlbfs: fix null-ptr-deref in hugetlbfs_parse_param() Syzkaller reports a null-ptr-deref bug as follows: ====================================================== KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:hugetlbfs_parse_param+0x1dd/0x8e0 fs/hugetlbfs/inode.c:1380 [...] Call Trace: vfs_parse_fs_param fs/fs_context.c:148 [inline] vfs_parse_fs_param+0x1f9/0x3c0 fs/fs_context.c:129 vfs_parse_fs_string+0xdb/0x170 fs/fs_context.c:191 generic_parse_monolithic+0x16f/0x1f0 fs/fs_context.c:231 do_new_mount fs/namespace.c:3036 [inline] path_mount+0x12de/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] ====================================================== According to commit "vfs: parse: deal with zero length string value", the kernel will set param->string to a null pointer in vfs_parse_fs_string() if the fs string has zero length. Yet the problem is that hugetlbfs_parse_param() will dereference param->string without checking whether it is a null pointer. To be more specific, if hugetlbfs_parse_param() parses an illegal mount parameter, such as "size=,", the kernel will construct a struct fs_parameter with a null pointer in vfs_parse_fs_string(), then pass this struct fs_parameter to hugetlbfs_parse_param(), which triggers the above null-ptr-deref bug. This patch solves it by adding a sanity check on param->string in hugetlbfs_parse_param().
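The core of the fix, reduced (illustrative):

    /* param->string may be NULL for a zero-length value, e.g. "size=," */
    if (!param->string || !isdigit(param->string[0]))
            goto bad_val;

The NULL test must come first; the short-circuit of || is what prevents the dereference.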
Link: https://lkml.kernel.org/r/20221020231609.4810-1-yin31149@gmail.com Reported-by: syzbot+a3e6acd85ded5c16a709@syzkaller.appspotmail.com Tested-by: syzbot+a3e6acd85ded5c16a709@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005ad00405eb7148c6@google.com/ Signed-off-by: Hawkins Jiawei Reviewed-by: Mike Kravetz Cc: Hawkins Jiawei Cc: Muchun Song Cc: Ian Kent Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 00495fc128c5..09e644f80a4a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1378,7 +1378,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; @@ -1388,7 +1388,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; @@ -1404,7 +1404,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; -- cgit From d0e482c45c50117bfb568825a41f0693e5f33c0f Mon Sep 17 00:00:00 2001 From: Oleg Kanatov Date: Fri, 28 Oct 2022 15:16:39 +0300 Subject: jfs: Fix a typo in function jfs_umount When closing the block allocation map, an incorrect pointer was NULL'ed. This commit fixes that. Signed-off-by: Oleg Kanatov Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_umount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 95ebcd17ce75..8ec43f53f686 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -87,7 +87,7 @@ int jfs_umount(struct super_block *sb) dbUnmount(ipbmap, 0); diFreeSpecial(ipbmap); - sbi->ipimap = NULL; + sbi->ipbmap = NULL; /* * Make sure all metadata makes it to disk before we mark -- cgit From a60dca73a1a8079d867b2c2e9549440346c1ba83 Mon Sep 17 00:00:00 2001 From: Oleg Kanatov Date: Fri, 28 Oct 2022 15:22:54 +0300 Subject: jfs: makes diUnmount/diMount in jfs_mount_rw atomic jfs_mount_rw can call diUnmount and then diMount. These calls change the imap pointer. Between these two calls there may be calls to jfs_lookup(). The jfs_lookup() function calls jfs_iget(), which, in turn, calls diRead(). The latter references the imap pointer. That may cause diRead() to refer to a pointer freed in diUnmount(). This commit makes the calls to diUnmount()/diMount() atomic so that nothing will read the imap pointer until the whole remount is completed.
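Schematically, the remount-critical section becomes (a reduction of the hunks below):

    IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP);  /* blocks concurrent diRead() */
    diUnmount(sbi->ipimap, 1);                /* frees the old imap */
    rc = diMount(sbi->ipimap);                /* installs the new imap */
    IWRITE_UNLOCK(sbi->ipimap);

and diRead() now takes the matching IREAD_LOCK before it loads the imap pointer, so it can never observe the freed value.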
Signed-off-by: Oleg Kanatov Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_imap.c | 2 +- fs/jfs/jfs_mount.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 799d3837e7c2..390cbfce391f 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -310,8 +310,8 @@ int diRead(struct inode *ip) iagno = INOTOIAG(ip->i_ino); /* read the iag */ - imap = JFS_IP(ipimap)->i_imap; IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + imap = JFS_IP(ipimap)->i_imap; rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) { diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 48d1f70f786c..b83aae56a1f2 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -234,11 +234,15 @@ int jfs_mount_rw(struct super_block *sb, int remount) truncate_inode_pages(sbi->ipimap->i_mapping, 0); truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + + IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP); diUnmount(sbi->ipimap, 1); if ((rc = diMount(sbi->ipimap))) { + IWRITE_UNLOCK(sbi->ipimap); jfs_err("jfs_mount_rw: diMount failed!"); return rc; } + IWRITE_UNLOCK(sbi->ipimap); dbUnmount(sbi->ipbmap, 1); if ((rc = dbMount(sbi->ipbmap))) { -- cgit From d030bd1a66580b7b198a085e7220c794bcdc2770 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 11 Nov 2022 01:43:52 -0500 Subject: ext2: Fix some kernel-doc warnings The current code provokes some kernel-doc warnings: fs/ext2/dir.c:417: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Signed-off-by: Bo Liu Acked-by: Randy Dunlap Signed-off-by: Jan Kara --- fs/ext2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 8f597753ac12..27873733ed8c 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -413,7 +413,7 @@ found: return de; } -/** +/* * Return the '..' directory entry and the page in which the entry was found * (as a parameter - p). * -- cgit From e6ecb142429183cef4835f31d4134050ae660032 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 8 Nov 2022 17:59:34 -0800 Subject: f2fs: allow to read node block after shutdown If the block address is still alive, we should give a valid node block even after shutdown. Otherwise, we can see zero data when reading out a file. Cc: stable@vger.kernel.org Fixes: 83a3bfdb5a8a ("f2fs: indicate shutdown f2fs to allow unmount successfully") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 983572f23896..b9ee5a1176a0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1360,8 +1360,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags) return err; /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ - if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || - is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) { ClearPageUptodate(page); return -ENOENT; } -- cgit From 225d6795abf47c3340214ca1b4c22728e463db4f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 21:26:38 +0800 Subject: f2fs: add proc entry to show discard_plist info This patch adds a new proc entry to show discard_plist information in more detail, which is very helpful for seeing the discard pending list counts clearly.
Such as: Discard pend list(Show diacrd_cmd count on each entry, .:not exist): 0 390 156 85 67 46 37 26 14 8 17 12 9 9 6 12 11 10 16 5 9 2 4 8 3 4 1 24 3 2 2 5 2 4 5 4 32 3 3 2 3 . 3 3 1 40 . 4 1 3 2 1 2 1 48 1 . 1 1 . 1 1 . 56 . 1 1 1 . 2 . 1 64 1 2 . . . . . . 72 . 1 . . . . . . 80 3 1 . . 1 1 . . 88 1 . . . 1 . . 1 ...... Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 032c03e09580..97bf0dbb0974 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1252,6 +1252,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i, count; + + seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n"); + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + if (dcc) { + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + + if (i % 8 == 0) + seq_printf(seq, " %-3d", i); + count = 0; + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + count++; + if (count) + seq_printf(seq, " %7d", count); + else + seq_puts(seq, " ."); + if (i % 8 == 7) + seq_putc(seq, '\n'); + } + seq_putc(seq, '\n'); + mutex_unlock(&dcc->cmd_lock); + } + + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -1322,6 +1360,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) #endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + discard_plist_seq_show, sb); } return 0; put_feature_list_kobj: @@ -1345,6 +1385,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry("discard_plist_info", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } -- cgit From 4d8d45df2252980f800c1b2fde941a103a18a70e Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 31 Oct 2022 12:24:15 -0700 Subject: f2fs: correct i_size change for atomic writes We need to make sure i_size doesn't change until the atomic write commit is successful, and restore it when the commit fails.
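As a rough illustration of the save/restore idea (a hypothetical sketch, not the f2fs code; original_i_size merely mirrors the field the patch adds):

  struct file_state {
          long long i_size;
          long long original_i_size;      /* captured when the atomic write starts */
          int committed;
  };

  static void start_atomic_write(struct file_state *f)
  {
          f->original_i_size = f->i_size; /* remember the pre-write size */
          f->committed = 0;
  }

  static void commit_atomic_write(struct file_state *f)
  {
          f->committed = 1;               /* i_size changes may now be kept */
  }

  static void abort_atomic_write(struct file_state *f)
  {
          if (!f->committed)
                  f->i_size = f->original_i_size; /* roll back on failure */
  }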
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 18 +++++++++++------- fs/f2fs/inode.c | 5 ++++- fs/f2fs/segment.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04ef4cce3d7f..11c475beca2c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -768,6 +768,7 @@ enum { FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ FI_MAX, /* max flag, never be used */ }; @@ -826,6 +827,7 @@ struct f2fs_inode_info { unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ }; static inline void get_extent_info(struct extent_info *ext, @@ -3075,6 +3077,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode, set_inode_flag(inode, FI_AUTO_RECOVER); } +static inline bool f2fs_is_atomic_file(struct inode *inode); + static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); @@ -3084,6 +3088,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c605a4f2bce2..28f586e77999 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2041,6 +2041,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; + loff_t isize; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2099,7 +2100,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } - f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + + f2fs_write_inode(inode, NULL); + + isize = i_size_read(inode); + fi->original_i_size = isize; + f2fs_i_size_write(fi->cow_inode, isize); stat_inc_atomic_inode(inode); @@ -2137,16 +2143,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); - if (ret) - goto unlock_out; - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_abort_atomic_write(inode, false); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9f0d3864d9f1..577f109b4e1d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -621,9 +621,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); - ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + if (et) { read_lock(&et->lock); set_raw_extent(&et->largest, &ri->i_ext); diff --git a/fs/f2fs/segment.c 
b/fs/f2fs/segment.c index aa4be7f25963..8aa81238c770 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -192,14 +192,18 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (!f2fs_is_atomic_file(inode)) return; - if (clean) - truncate_inode_pages_final(inode->i_mapping); clear_inode_flag(fi->cow_inode, FI_COW_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); + + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, fi->original_i_size); + } } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -335,10 +339,12 @@ next: } out: - if (ret) + if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; - else + } else { sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + } __complete_revoke_list(inode, &revoke_list, ret ? true : false); -- cgit From cc249e4cba9a6002c9d9e1438daf8440a160bc9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 6 Nov 2022 21:25:44 +0800 Subject: f2fs: fix to avoid accessing uninitialized spinlock syzbot reports a kernel bug: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e3/0x2cb lib/dump_stack.c:106 assign_lock_key+0x22a/0x240 kernel/locking/lockdep.c:981 register_lock_class+0x287/0x9b0 kernel/locking/lockdep.c:1294 __lock_acquire+0xe4/0x1f60 kernel/locking/lockdep.c:4934 lock_acquire+0x1a7/0x400 kernel/locking/lockdep.c:5668 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:350 [inline] f2fs_save_errors fs/f2fs/super.c:3868 [inline] f2fs_handle_error+0x29/0x230 fs/f2fs/super.c:3896 f2fs_iget+0x215/0x4bb0 fs/f2fs/inode.c:516 f2fs_fill_super+0x47d3/0x7b50 fs/f2fs/super.c:4222 mount_bdev+0x26c/0x3a0 fs/super.c:1401 legacy_get_tree+0xea/0x180 fs/fs_context.c:610 vfs_get_tree+0x88/0x270 fs/super.c:1531 do_new_mount+0x289/0xad0 fs/namespace.c:3040 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount+0x2e3/0x3d0 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd F2FS-fs (loop1): Failed to read F2FS meta data inode The root cause is that sbi->error_lock may be accessed before its initialization; fix it.
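The underlying rule is that a lock must be initialized before any code path can possibly take it. A hypothetical user-space sketch of the corrected ordering (a pthread mutex stands in for the spinlock):

  #include <pthread.h>

  struct sb_stub {
          pthread_mutex_t error_lock;
          unsigned char errors[16];
  };

  static void handle_error(struct sb_stub *sbi)
  {
          pthread_mutex_lock(&sbi->error_lock);   /* lock is guaranteed initialized */
          sbi->errors[0] = 1;
          pthread_mutex_unlock(&sbi->error_lock);
  }

  static int fill_super(struct sb_stub *sbi)
  {
          /* initialize right after allocation, before anything can fail */
          pthread_mutex_init(&sbi->error_lock, NULL);

          /* ... any error path from here on may safely call handle_error() ... */
          return 0;
  }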
Link: https://lore.kernel.org/linux-f2fs-devel/0000000000007edb6605ecbb6442@google.com/T/#u Reported-by: syzbot+40642be9b7e0bb28e0df@syzkaller.appspotmail.com Fixes: 95fa90c9e5a7 ("f2fs: support recording errors into superblock") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a43d8a46a6e5..68a6c2eedcac 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4196,6 +4196,9 @@ try_onemore: if (err) goto free_bio_info; + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); @@ -4263,9 +4266,6 @@ try_onemore: goto free_devices; } - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, -- cgit From 59237a21776f70ffb0420611c23e7158e1317037 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Nov 2022 22:33:21 +0800 Subject: f2fs: optimize iteration over sparse directories Wei Chen reports a kernel bug as below: INFO: task syz-executor.0:29056 blocked for more than 143 seconds. Not tainted 5.15.0-rc5 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor.0 state:D stack:14632 pid:29056 ppid: 6574 flags:0x00000004 Call Trace: __schedule+0x4a1/0x1720 schedule+0x36/0xe0 rwsem_down_write_slowpath+0x322/0x7a0 fscrypt_ioctl_set_policy+0x11f/0x2a0 __f2fs_ioctl+0x1a9f/0x5780 f2fs_ioctl+0x89/0x3a0 __x64_sys_ioctl+0xe8/0x140 do_syscall_64+0x34/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Eric did some investigation on this issue; quoting from Eric's reply: "Well, the quality of this bug report has a lot to be desired (not on upstream kernel, reproducer is full of totally irrelevant stuff, not sent to the mailing list of the filesystem whose disk image is being fuzzed, etc.). But what is going on is that f2fs_empty_dir() doesn't consider the case of a directory with an extremely large i_size on a malicious disk image. Specifically, the reproducer mounts an f2fs image with a directory that has an i_size of 14814520042850357248, then calls FS_IOC_SET_ENCRYPTION_POLICY on it. That results in a call to f2fs_empty_dir() to check whether the directory is empty. f2fs_empty_dir() then iterates through all 3616826182336513 blocks the directory allegedly contains to check whether any contain anything. i_rwsem is held during this, so anything else that tries to take it will hang." In order to solve this issue, let's use f2fs_get_next_page_offset() to speed up iteration by skipping holes in all of the functions below: - f2fs_empty_dir - f2fs_readdir - find_in_level The way this speeds up iteration was described in 'commit 3cf4574705b4 ("f2fs: introduce get_next_page_offset to speed up SEEK_DATA")'. Meanwhile, in f2fs_empty_dir(), let's use f2fs_find_data_page() instead of f2fs_get_lock_data_page(): since i_rwsem is held by the caller of f2fs_empty_dir(), there shouldn't be any races, so it's fine not to lock the dentry page while looking up dirents in it.
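A hypothetical sketch of the iteration pattern the patch switches to (find_block() is an illustrative stand-in for f2fs_find_data_page() with its new *next_pgofs out-parameter):

  typedef unsigned long pgoff_t;

  /* stand-in for f2fs_find_data_page(): pretend even indices are holes and
   * report the first index after the hole via *next; in the real code *next
   * can skip an arbitrarily large hole in one step */
  static int find_block(pgoff_t idx, pgoff_t *next)
  {
          if (idx % 2 == 0) {
                  *next = idx + 1;
                  return -1;      /* -ENOENT in the real code */
          }
          return 0;
  }

  static void scan_dir(pgoff_t nblocks)
  {
          pgoff_t bidx = 0;

          while (bidx < nblocks) {
                  pgoff_t next;

                  if (find_block(bidx, &next) < 0) {
                          bidx = next;    /* jump over the whole hole at once */
                          continue;
                  }
                  /* ... check the dentry block at bidx ... */
                  bidx++;
          }
  }

Instead of visiting all 3616826182336513 alleged block indices of the crafted directory one by one, the loop can now jump straight past unallocated ranges.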
Link: https://lore.kernel.org/lkml/536944df-a0ae-1dd8-148f-510b476e1347@kernel.org/T/ Reported-by: Wei Chen Cc: Eric Biggers Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++++++----- fs/f2fs/dir.c | 34 ++++++++++++++++++++++++---------- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 4 ++-- 4 files changed, 41 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a71e818cd67b..9b47ded653d1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1206,7 +1206,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write) + blk_opf_t op_flags, bool for_write, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -1232,12 +1233,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; + } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && @@ -1281,7 +1287,8 @@ put_err: return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1291,7 +1298,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = f2fs_get_read_data_page(inode, index, 0, false); + page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; @@ -1317,7 +1324,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = f2fs_get_read_data_page(inode, index, 0, for_write); + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 21960a899b6a..030b7fd4142f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; bool room = false; int max_slots; @@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; - for (; bidx < end_block; bidx++) { + while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = f2fs_find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; + bidx = next_pgofs; continue; } else { *res_page = dentry_page; @@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); + + bidx++; } if (!de && room && F2FS_I(dir)->chash != fname->hash) { @@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_dir(struct inode *dir) { - unsigned long bidx; + unsigned long bidx = 0; struct page 
*dentry_page; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; @@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir) if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); - for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = f2fs_get_lock_data_page(dir, bidx, false); + while (bidx < nblock) { + pgoff_t next_pgofs; + + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) + if (PTR_ERR(dentry_page) == -ENOENT) { + bidx = next_pgofs; continue; - else + } else { return false; + } } dentry_blk = page_address(dentry_page); @@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir) NR_DENTRY_IN_BLOCK, bit_pos); - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; + + bidx++; } return true; } @@ -1104,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -1118,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_find_data_page(inode, n); + dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { err = 0; + n = next_pgofs; continue; } else { goto out_free; @@ -1141,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } f2fs_put_page(dentry_page, 0); + + n++; } out_free: fscrypt_fname_free_buffer(&fstr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 11c475beca2c..6a8cbf5bb187 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3807,8 +3807,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6466db75af5d..f1a46519a5fe 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1562,8 +1562,8 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, - start_bidx, REQ_RAHEAD, true); + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); -- cgit From 92b4cf5b48955a4bdd15fe4e2067db8ebd87f04c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 9 Nov 2022 07:04:42 +0900 Subject: f2fs: initialize locks earlier in f2fs_fill_super() syzbot is reporting a lockdep warning at f2fs_handle_error() [1], because spin_lock(&sbi->error_lock) is called before spin_lock_init() is called. For safe locking in error handling, move the initialization of locks (and obvious structures) in f2fs_fill_super() to immediately after memory allocation.
Link: https://syzkaller.appspot.com/bug?extid=40642be9b7e0bb28e0df [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Tested-by: syzbot Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 68a6c2eedcac..8f4fc3ad6765 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4103,6 +4103,24 @@ try_onemore: sbi->sb = sb; + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -4126,6 +4144,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -4182,26 +4202,14 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_f2fs_rwsem(&sbi->gc_lock); - mutex_init(&sbi->writepages); - init_f2fs_rwsem(&sbi->cp_global_sem); - init_f2fs_rwsem(&sbi->node_write); - init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); - spin_lock_init(&sbi->stat_lock); err = f2fs_init_write_merge_io(sbi); if (err) goto free_bio_info; - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - - init_f2fs_rwsem(&sbi->cp_rwsem); - init_f2fs_rwsem(&sbi->quota_sem); - init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); err = f2fs_init_iostat(sbi); @@ -4279,12 +4287,6 @@ try_onemore: limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); - for (i = 0; i < NR_INODE_TYPE; i++) { - INIT_LIST_HEAD(&sbi->inode_list[i]); - spin_lock_init(&sbi->inode_lock[i]); - } - mutex_init(&sbi->flush_lock); - f2fs_init_extent_cache_info(sbi); f2fs_init_ino_entry_info(sbi); -- cgit From 967eaad1fed5f6335ea97a47d45214744dc57925 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 10 Nov 2022 17:15:01 +0800 Subject: f2fs: fix to set flush_merge opt and show noflush_merge Some minor modifications to flush_merge and related parameters: 1. The FLUSH_MERGE option is set by default only in non-ro mode. 2. When ro and flush_merge are set at the same time, an error is reported. 3. Display the noflush_merge mount option.
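Condensed, the resulting option logic looks roughly like this (a hypothetical sketch, not the f2fs code):

  #include <errno.h>

  static int apply_flush_merge(int readonly, int user_set_flush_merge,
                               int *flush_merge)
  {
          /* an explicit flush_merge on a read-only mount is now rejected */
          if (readonly && user_set_flush_merge)
                  return -EINVAL;

          /* the option defaults to on only for read-write mounts */
          *flush_merge = readonly ? 0 : 1;
          return 0;
  }

And since f2fs_show_options() now always prints one of flush_merge/noflush_merge, the effective state is visible in the mount information either way.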
Suggested-by: Chao Yu Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8f4fc3ad6765..75027ff85cd9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1353,6 +1353,12 @@ default_check: return -EINVAL; } + if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) && + test_opt(sbi, FLUSH_MERGE)) { + f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1941,8 +1947,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_dentry"); else seq_puts(seq, ",noinline_dentry"); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + if (test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + else + seq_puts(seq, ",noflush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); else @@ -2073,7 +2081,8 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - set_opt(sbi, FLUSH_MERGE); + if (!f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) + set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { -- cgit From 3b4c7bc01727e3a465759236eeac03d0dd686da3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 4 Nov 2022 13:52:42 +0100 Subject: xattr: use rbtree for simple_xattrs A while ago Vasily reported that it is possible to set a large number of xattrs on inodes of filesystems that make use of the simple xattr infrastructure. This includes all kernfs-based filesystems that support xattrs (e.g., cgroupfs and tmpfs). Both cgroupfs and tmpfs can be mounted by unprivileged users in unprivileged containers, and root in an unprivileged container can set an unrestricted number of security.* xattrs and privileged users can also set unlimited trusted.* xattrs. As there are apparently users that have a fairly large number of xattrs, we should scale a bit better. Other xattrs such as user.* are restricted for kernfs-based instances to a fairly limited number. Using a simple linked list protected by a spinlock used for set, get, and list operations doesn't scale well if users use a lot of xattrs even if it's not a crazy number. There's no need to bring in the big guns like rhashtables or rw semaphores for this. An rbtree with an rwlock, or limited RCU semantics and a seqlock, is enough. It scales within the constraints we are working in. By far the most common operation is getting an xattr. Setting xattrs should be a moderately rare operation. And listxattr() often only happens when copying xattrs between files or together with the contents to a new file. Holding a lock across listxattr() is unproblematic because it doesn't list the values of xattrs. It can only be used to list the names of all xattrs set on a file. And the number of xattr names that can be listed with listxattr() is limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names are found it will return -E2BIG. In short, the maximum amount of memory that can be retrieved via listxattr() is limited. Of course, the API is broken as documented on xattr(7) already.
In the future we might want to address this but for now this is the world we live in and have lived for a long time. But it does indeed mean that once an application goes over XATTR_LIST_MAX limit of xattrs set on an inode it isn't possible to copy the file and include its xattrs in the copy unless the caller knows all xattrs or limits the copy of the xattrs to important ones it knows by name (At least for tmpfs, and kernfs-based filesystems. Other filesystems might provide ways of achieving this.). Bonus of this port to rbtree+rwlock is that we shrink the memory consumption for users of the simple xattr infrastructure. Also add proper kernel documentation to all the functions. A big thanks to Paul for his comments. Cc: Vasily Averin Cc: "Paul E. McKenney" Acked-by: Roman Gushchin Acked-by: Paul E. McKenney Signed-off-by: Christian Brauner (Microsoft) --- fs/xattr.c | 317 ++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 250 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..402d9d43fde0 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -992,8 +992,29 @@ const char *xattr_full_name(const struct xattr_handler *handler, } EXPORT_SYMBOL(xattr_full_name); -/* - * Allocate new xattr and copy in the value; but leave the name to callers. +/** + * free_simple_xattr - free an xattr object + * @xattr: the xattr object + * + * Free the xattr object. Can handle @xattr being NULL. + */ +static inline void free_simple_xattr(struct simple_xattr *xattr) +{ + if (xattr) + kfree(xattr->name); + kvfree(xattr); +} + +/** + * simple_xattr_alloc - allocate new xattr object + * @value: value of the xattr object + * @size: size of @value + * + * Allocate a new xattr object and initialize respective members. The caller is + * responsible for handling the name of the xattr. + * + * Return: On success a new xattr object is returned. On failure NULL is + * returned. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { @@ -1014,20 +1035,69 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) return new_xattr; } -/* - * xattr GET operation for in-memory/pseudo filesystems +/** + * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry + * @key: xattr name + * @node: current node + * + * Compare the xattr name with the xattr name attached to @node in the rbtree. + * + * Return: Negative value if continuing left, positive if continuing right, 0 + * if the xattr attached to @node matches @key. + */ +static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) +{ + const char *xattr_name = key; + const struct simple_xattr *xattr; + + xattr = rb_entry(node, struct simple_xattr, rb_node); + return strcmp(xattr->name, xattr_name); +} + +/** + * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes + * @new_node: new node + * @node: current node + * + * Compare the xattr attached to @new_node with the xattr attached to @node. + * + * Return: Negative value if continuing left, positive if continuing right, 0 + * if the xattr attached to @new_node matches the xattr attached to @node. 
+ */ +static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, + const struct rb_node *node) +{ + struct simple_xattr *xattr; + xattr = rb_entry(new_node, struct simple_xattr, rb_node); + return rbtree_simple_xattr_cmp(xattr->name, node); +} + +/** + * simple_xattr_get - get an xattr object + * @xattrs: the header of the xattr object + * @name: the name of the xattr to retrieve + * @buffer: the buffer to store the value into + * @size: the size of @buffer + * + * Try to find and retrieve the xattr object associated with @name. + * If @buffer is provided store the value of @xattr in @buffer + * otherwise just return the length. The size of @buffer is limited + * to XATTR_SIZE_MAX which currently is 65536. + * + * Return: On success the length of the xattr value is returned. On error a + * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { - struct simple_xattr *xattr; + struct simple_xattr *xattr = NULL; + struct rb_node *rbp; int ret = -ENODATA; - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { - if (strcmp(name, xattr->name)) - continue; - + read_lock(&xattrs->lock); + rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); + if (rbp) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); ret = xattr->size; if (buffer) { if (size < xattr->size) @@ -1035,34 +1105,44 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, else memcpy(buffer, xattr->value, xattr->size); } - break; } - spin_unlock(&xattrs->lock); + read_unlock(&xattrs->lock); return ret; } /** - * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems - * @xattrs: target simple_xattr list - * @name: name of the extended attribute - * @value: value of the xattr. If %NULL, will remove the attribute. - * @size: size of the new xattr - * @flags: %XATTR_{CREATE|REPLACE} - * @removed_size: returns size of the removed xattr, -1 if none removed + * simple_xattr_set - set an xattr object + * @xattrs: the header of the xattr object + * @name: the name of the xattr to retrieve + * @value: the value to store along the xattr + * @size: the size of @value + * @flags: the flags determining how to set the xattr + * @removed_size: the size of the removed xattr + * + * Set a new xattr object. + * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE + * is specified in @flags a matching xattr object for @name must already exist. + * If it does it will be replaced with the new xattr object. If it doesn't we + * fail. If XATTR_CREATE is specified and a matching xattr does already exist + * we fail. If it doesn't we create a new xattr. If @flags is zero we simply + * insert the new xattr replacing any existing one. + * + * If @value is empty and a matching xattr object is found we delete it if + * XATTR_REPLACE is specified in @flags or @flags is zero. * - * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails - * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; - * otherwise, fails with -ENODATA. + * If @value is empty and no matching xattr object for @name is found we do + * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For + * XATTR_REPLACE we fail as mentioned above. * - * Returns 0 on success, -errno on failure. + * Return: On success zero and on error a negative error code is returned. 
*/ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags, ssize_t *removed_size) { - struct simple_xattr *xattr; - struct simple_xattr *new_xattr = NULL; - int err = 0; + struct simple_xattr *xattr = NULL, *new_xattr = NULL; + struct rb_node *parent = NULL, **rbp; + int err = 0, ret; if (removed_size) *removed_size = -1; @@ -1075,42 +1155,68 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { - kvfree(new_xattr); + free_simple_xattr(new_xattr); return -ENOMEM; } } - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { - if (!strcmp(name, xattr->name)) { - if (flags & XATTR_CREATE) { - xattr = new_xattr; - err = -EEXIST; - } else if (new_xattr) { - list_replace(&xattr->list, &new_xattr->list); - if (removed_size) - *removed_size = xattr->size; - } else { - list_del(&xattr->list); - if (removed_size) - *removed_size = xattr->size; - } - goto out; - } - } - if (flags & XATTR_REPLACE) { - xattr = new_xattr; - err = -ENODATA; - } else { - list_add(&new_xattr->list, &xattrs->head); - xattr = NULL; + write_lock(&xattrs->lock); + rbp = &xattrs->rb_root.rb_node; + while (*rbp) { + parent = *rbp; + ret = rbtree_simple_xattr_cmp(name, *rbp); + if (ret < 0) + rbp = &(*rbp)->rb_left; + else if (ret > 0) + rbp = &(*rbp)->rb_right; + else + xattr = rb_entry(*rbp, struct simple_xattr, rb_node); + if (xattr) + break; } -out: - spin_unlock(&xattrs->lock); + if (xattr) { - kfree(xattr->name); - kvfree(xattr); + /* Fail if XATTR_CREATE is requested and the xattr exists. */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + goto out_unlock; + } + + if (new_xattr) + rb_replace_node(&xattr->rb_node, &new_xattr->rb_node, + &xattrs->rb_root); + else + rb_erase(&xattr->rb_node, &xattrs->rb_root); + if (!err && removed_size) + *removed_size = xattr->size; + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) { + err = -ENODATA; + goto out_unlock; + } + + /* + * If XATTR_CREATE or no flags are specified together with a + * new value simply insert it. + */ + if (new_xattr) { + rb_link_node(&new_xattr->rb_node, parent, rbp); + rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); + } + + /* + * If XATTR_CREATE or no flags are specified and neither an + * old or new xattr exist then we don't need to do anything. + */ } + +out_unlock: + write_unlock(&xattrs->lock); + if (err) + free_simple_xattr(new_xattr); + else + free_simple_xattr(xattr); return err; } @@ -1134,14 +1240,31 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size, return 0; } -/* - * xattr LIST operation for in-memory/pseudo filesystems +/** + * simple_xattr_list - list all xattr objects + * @inode: inode from which to get the xattrs + * @xattrs: the header of the xattr object + * @buffer: the buffer to store all xattrs into + * @size: the size of @buffer + * + * List all xattrs associated with @inode. If @buffer is NULL we returned + * the required size of the buffer. If @buffer is provided we store the + * xattrs value into it provided it is big enough. + * + * Note, the number of xattr names that can be listed with listxattr(2) is + * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed + * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names + * are found it will return -E2BIG. + * + * Return: On success the required size or the size of the copied xattrs is + * returned. 
On error a negative error code is returned. */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = capable(CAP_SYS_ADMIN); struct simple_xattr *xattr; + struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; @@ -1162,8 +1285,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, } #endif - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { + read_lock(&xattrs->lock); + for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; @@ -1172,18 +1297,76 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (err) break; } - spin_unlock(&xattrs->lock); + read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } -/* - * Adds an extended attribute to the list +/** + * rbtree_simple_xattr_less - compare two xattr rbtree nodes + * @new_node: new node + * @node: current node + * + * Compare the xattr attached to @new_node with the xattr attached to @node. + * Note that this function technically tolerates duplicate entries. + * + * Return: True if insertion point in the rbtree is found. */ -void simple_xattr_list_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr) +static bool rbtree_simple_xattr_less(struct rb_node *new_node, + const struct rb_node *node) { - spin_lock(&xattrs->lock); - list_add(&new_xattr->list, &xattrs->head); - spin_unlock(&xattrs->lock); + return rbtree_simple_xattr_node_cmp(new_node, node) < 0; +} + +/** + * simple_xattr_add - add xattr objects + * @xattrs: the header of the xattr object + * @new_xattr: the xattr object to add + * + * Add an xattr object to @xattrs. This assumes no replacement or removal + * of matching xattrs is wanted. Should only be called during inode + * initialization when a few distinct initial xattrs are supposed to be set. + */ +void simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr) +{ + write_lock(&xattrs->lock); + rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); + write_unlock(&xattrs->lock); +} + +/** + * simple_xattrs_init - initialize new xattr header + * @xattrs: header to initialize + * + * Initialize relevant fields of a an xattr header. + */ +void simple_xattrs_init(struct simple_xattrs *xattrs) +{ + xattrs->rb_root = RB_ROOT; + rwlock_init(&xattrs->lock); +} + +/** + * simple_xattrs_free - free xattrs + * @xattrs: xattr header whose xattrs to destroy + * + * Destroy all xattrs in @xattr. When this is called no one can hold a + * reference to any of the xattrs anymore. + */ +void simple_xattrs_free(struct simple_xattrs *xattrs) +{ + struct rb_node *rbp; + + rbp = rb_first(&xattrs->rb_root); + while (rbp) { + struct simple_xattr *xattr; + struct rb_node *rbp_next; + + rbp_next = rb_next(rbp); + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + rb_erase(&xattr->rb_node, &xattrs->rb_root); + free_simple_xattr(xattr); + rbp = rbp_next; + } } -- cgit From e40df4281b86d5f7c1615dd9eda597675340a8d3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 12 Nov 2022 22:36:42 +0100 Subject: orangefs: fix mode handling In 4053d2500beb ("orangefs: rework posix acl handling when creating new filesystem objects") we tried to precalculate the correct mode when creating a new inode. 
However, this leads to regressions when creating new filesystem objects. Even if we precalculate the mode we still need to call __orangefs_setattr() to perform additional checks and we also need to update the mode of ACL_TYPE_ACCESS acls set on the inode. The patch referenced above regressed that. Restore that part of the old behavior and remove the mode precalculation as it doesn't get us anything anymore. Fixes: 4053d2500beb ("orangefs: rework posix acl handling when creating new filesystem objects") Reported-by: Mike Marshall Signed-off-by: Christian Brauner (Microsoft) --- fs/orangefs/inode.c | 11 ++++++++++- fs/orangefs/orangefs-kernel.h | 1 - fs/orangefs/orangefs-utils.c | 10 ++-------- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 8974b0fbf00d..3d65accfae17 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -1131,7 +1131,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = __orangefs_inode_getattr(inode, mode, ORANGEFS_GETATTR_NEW); + error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); if (error) goto out_iput; @@ -1158,6 +1158,15 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, gossip_debug(GOSSIP_INODE_DEBUG, "Initializing ACL's for inode %pU\n", get_khandle_from_ino(inode)); + if (mode != inode->i_mode) { + struct iattr iattr = { + .ia_mode = mode, + .ia_valid = ATTR_MODE, + }; + inode->i_mode = mode; + __orangefs_setattr(inode, &iattr); + __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + } posix_acl_release(acl); posix_acl_release(default_acl); return inode; diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 55cd6d50eea1..6e0cc01b3a14 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -423,7 +423,6 @@ int orangefs_inode_setxattr(struct inode *inode, #define ORANGEFS_GETATTR_SIZE 2 int orangefs_inode_getattr(struct inode *, int); -int __orangefs_inode_getattr(struct inode *inode, umode_t mode, int flags); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 334a2fd98c37..46b7dcff18ac 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -233,7 +233,7 @@ static int orangefs_inode_is_stale(struct inode *inode, return 0; } -int __orangefs_inode_getattr(struct inode *inode, umode_t mode, int flags) +int orangefs_inode_getattr(struct inode *inode, int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -369,8 +369,7 @@ again2: /* special case: mark the root inode as sticky */ inode->i_mode = type | (is_root_handle(inode) ? 
S_ISVTX : 0) | - orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes) | - mode; + orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes); orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; @@ -382,11 +381,6 @@ out: return ret; } -int orangefs_inode_getattr(struct inode *inode, int flags) -{ - return __orangefs_inode_getattr(inode, 0, flags); -} - int orangefs_inode_check_changed(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); -- cgit From f1bd37a4735286585751dbd9db330b48525cb193 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 14 Nov 2022 10:35:32 -0800 Subject: iomap: directly use logical block size Don't transform the logical block size to a bit shift only to shift it back to the original block size. Just use the size. Cc: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Darrick J. Wong Reviewed-by: Bart Van Assche Signed-off-by: Keith Busch Signed-off-by: Darrick J. Wong Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- fs/iomap/direct-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4eb559a16c9e..9804714b1751 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -240,7 +240,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; - unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; loff_t length = iomap_length(iter); loff_t pos = iter->pos; @@ -252,7 +251,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; - if ((pos | length) & ((1 << blkbits) - 1) || + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; -- cgit From bdcdd86ca94b5e9faa18d6f4d3dda660ac5c887e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 00:54:40 +0000 Subject: btrfs: fix assertion failure and blocking during nowait buffered write When doing a nowait buffered write we can trigger the following assertion: [11138.437027] assertion failed: !path->nowait, in fs/btrfs/ctree.c:4658 [11138.438251] ------------[ cut here ]------------ [11138.438254] kernel BUG at fs/btrfs/messages.c:259! [11138.438762] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [11138.439450] CPU: 4 PID: 1091021 Comm: fsstress Not tainted 6.1.0-rc4-btrfs-next-128 #1 [11138.440611] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [11138.442553] RIP: 0010:btrfs_assertfail+0x19/0x1b [btrfs] [11138.443583] Code: 5b 41 5a 41 (...) 
[11138.446437] RSP: 0018:ffffbaf0cf05b840 EFLAGS: 00010246 [11138.447235] RAX: 0000000000000039 RBX: ffffbaf0cf05b938 RCX: 0000000000000000 [11138.448303] RDX: 0000000000000000 RSI: ffffffffb2ef59f6 RDI: 00000000ffffffff [11138.449370] RBP: ffff9165f581eb68 R08: 00000000ffffffff R09: 0000000000000001 [11138.450493] R10: ffff9167a88421f8 R11: 0000000000000000 R12: ffff9164981b1000 [11138.451661] R13: 000000008c8f1000 R14: ffff9164991d4000 R15: ffff9164981b1000 [11138.452225] FS: 00007f1438a66440(0000) GS:ffff9167ad600000(0000) knlGS:0000000000000000 [11138.452949] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [11138.453394] CR2: 00007f1438a64000 CR3: 0000000100c36002 CR4: 0000000000370ee0 [11138.454057] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [11138.454879] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [11138.455779] Call Trace: [11138.456211] [11138.456598] btrfs_next_old_leaf.cold+0x18/0x1d [btrfs] [11138.457827] ? kmem_cache_alloc+0x18d/0x2a0 [11138.458516] btrfs_lookup_csums_range+0x149/0x4d0 [btrfs] [11138.459407] csum_exist_in_range+0x56/0x110 [btrfs] [11138.460271] can_nocow_file_extent+0x27c/0x310 [btrfs] [11138.461155] can_nocow_extent+0x1ec/0x2e0 [btrfs] [11138.461672] btrfs_check_nocow_lock+0x114/0x1c0 [btrfs] [11138.462951] btrfs_buffered_write+0x44c/0x8e0 [btrfs] [11138.463482] btrfs_do_write_iter+0x42b/0x5f0 [btrfs] [11138.463982] ? lock_release+0x153/0x4a0 [11138.464347] io_write+0x11b/0x570 [11138.464660] ? lock_release+0x153/0x4a0 [11138.465213] ? lock_is_held_type+0xe8/0x140 [11138.466003] io_issue_sqe+0x63/0x4a0 [11138.466339] io_submit_sqes+0x238/0x770 [11138.466741] __do_sys_io_uring_enter+0x37b/0xb10 [11138.467206] ? lock_is_held_type+0xe8/0x140 [11138.467879] ? syscall_enter_from_user_mode+0x1d/0x50 [11138.468688] do_syscall_64+0x38/0x90 [11138.469265] entry_SYSCALL_64_after_hwframe+0x63/0xcd [11138.470017] RIP: 0033:0x7f1438c539e6 This is because, to check if we can NOCOW, we first check whether we can NOCOW into an extent (it's a prealloc extent or the inode has the NOCOW attribute), and then check if there are csums for the extent's range in the csum tree. The search may leave us beyond the last slot of a leaf, and then when we call btrfs_next_leaf() we end up at btrfs_next_old_leaf() with a time_seq of 0. This triggers a failure of the first assertion at btrfs_next_old_leaf(), since we have a nowait path. With assertions disabled, we simply don't respect the NOWAIT semantics, allowing the write to block on locks or on IO for reading an extent buffer from disk. Fix this by: 1) Triggering the assertion only if time_seq is not 0, which means that the search is being done by a tree mod log user, and in the buffered and direct IO write paths we don't use the tree mod log; 2) Implementing NOWAIT semantics at btrfs_next_old_leaf(): any failure to lock an extent buffer should return immediately rather than retry the search, and the same applies if we need to do IO to read an extent buffer from disk.
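The NOWAIT convention applied here is the usual trylock-or--EAGAIN pattern; a hypothetical user-space sketch:

  #include <errno.h>
  #include <pthread.h>

  static int tree_read_lock(pthread_rwlock_t *lock, int nowait)
  {
          if (nowait) {
                  /* never block: report -EAGAIN so the caller can retry
                   * from a context that is allowed to sleep */
                  if (pthread_rwlock_tryrdlock(lock) != 0)
                          return -EAGAIN;
                  return 0;
          }
          pthread_rwlock_rdlock(lock);
          return 0;
  }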
Fixes: c922b016f353 ("btrfs: assert nowait mode is not used for some btree search functions") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a9543f01184c..dcb510f38dda 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4663,7 +4663,12 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int ret; int i; - ASSERT(!path->nowait); + /* + * The nowait semantics are used only for write paths, where we don't + * use the tree mod log and sequence numbers. + */ + if (time_seq) + ASSERT(!path->nowait); nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) @@ -4683,7 +4688,14 @@ again: if (path->need_commit_sem) { path->need_commit_sem = 0; need_commit_sem = true; - down_read(&fs_info->commit_root_sem); + if (path->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) { + ret = -EAGAIN; + goto done; + } + } else { + down_read(&fs_info->commit_root_sem); + } } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } @@ -4759,7 +4771,7 @@ again: next = c; ret = read_block_for_search(root, path, &next, level, slot, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4769,6 +4781,10 @@ again: if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); + if (!ret && path->nowait) { + ret = -EAGAIN; + goto done; + } if (!ret && time_seq) { /* * If we don't get the lock, we may be racing @@ -4799,7 +4815,7 @@ again: ret = read_block_for_search(root, path, &next, level, 0, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4807,8 +4823,16 @@ again: goto done; } - if (!path->skip_locking) - btrfs_tree_read_lock(next); + if (!path->skip_locking) { + if (path->nowait) { + if (!btrfs_try_tree_read_lock(next)) { + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(next); + } + } } ret = 0; done: -- cgit From b740d806166979488e798e41743aaec051f2443f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Nov 2022 11:44:51 -0500 Subject: btrfs: free btrfs_path before copying root refs to userspace Syzbot reported the following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc7-syzkaller-18095-gbbed346d5a96 #0 Not tainted ------------------------------------------------------ syz-executor307/3029 is trying to acquire lock: ffff0000c02525d8 (&mm->mmap_lock){++++}-{3:3}, at: __might_fault+0x54/0xb4 mm/memory.c:5576 but task is already holding lock: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (btrfs-root-00){++++}-{3:3}: down_read_nested+0x64/0x84 kernel/locking/rwsem.c:1624 __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 btrfs_search_slot_get_root+0x74/0x338 fs/btrfs/ctree.c:1637 btrfs_search_slot+0x1b0/0xfd8 fs/btrfs/ctree.c:1944 btrfs_update_root+0x6c/0x5a0 fs/btrfs/root-tree.c:132 commit_fs_roots+0x1f0/0x33c fs/btrfs/transaction.c:1459 btrfs_commit_transaction+0x89c/0x12d8 fs/btrfs/transaction.c:2343 flush_space+0x66c/0x738 fs/btrfs/space-info.c:786 btrfs_async_reclaim_metadata_space+0x43c/0x4e0 fs/btrfs/space-info.c:1059 process_one_work+0x2d8/0x504 kernel/workqueue.c:2289 worker_thread+0x340/0x610 kernel/workqueue.c:2436 kthread+0x12c/0x158 kernel/kthread.c:376 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:860 -> #2 (&fs_info->reloc_mutex){+.+.}-{3:3}: __mutex_lock_common+0xd4/0xca8 kernel/locking/mutex.c:603 __mutex_lock kernel/locking/mutex.c:747 [inline] mutex_lock_nested+0x38/0x44 kernel/locking/mutex.c:799 btrfs_record_root_in_trans fs/btrfs/transaction.c:516 [inline] start_transaction+0x248/0x944 fs/btrfs/transaction.c:752 btrfs_start_transaction+0x34/0x44 fs/btrfs/transaction.c:781 btrfs_create_common+0xf0/0x1b4 fs/btrfs/inode.c:6651 btrfs_create+0x8c/0xb0 fs/btrfs/inode.c:6697 lookup_open fs/namei.c:3413 [inline] open_last_lookups fs/namei.c:3481 [inline] path_openat+0x804/0x11c4 fs/namei.c:3688 do_filp_open+0xdc/0x1b8 fs/namei.c:3718 do_sys_openat2+0xb8/0x22c fs/open.c:1313 do_sys_open fs/open.c:1329 [inline] __do_sys_openat fs/open.c:1345 [inline] __se_sys_openat fs/open.c:1340 [inline] __arm64_sys_openat+0xb0/0xe0 fs/open.c:1340 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 -> #1 (sb_internal#2){.+.+}-{0:0}: percpu_down_read include/linux/percpu-rwsem.h:51 [inline] __sb_start_write include/linux/fs.h:1826 [inline] sb_start_intwrite include/linux/fs.h:1948 [inline] start_transaction+0x360/0x944 fs/btrfs/transaction.c:683 btrfs_join_transaction+0x30/0x40 fs/btrfs/transaction.c:795 btrfs_dirty_inode+0x50/0x140 fs/btrfs/inode.c:6103 btrfs_update_time+0x1c0/0x1e8 fs/btrfs/inode.c:6145 inode_update_time fs/inode.c:1872 [inline] touch_atime+0x1f0/0x4a8 fs/inode.c:1945 file_accessed include/linux/fs.h:2516 [inline] btrfs_file_mmap+0x50/0x88 fs/btrfs/file.c:2407 call_mmap include/linux/fs.h:2192 [inline] mmap_region+0x7fc/0xc14 mm/mmap.c:1752 do_mmap+0x644/0x97c mm/mmap.c:1540 vm_mmap_pgoff+0xe8/0x1d0 mm/util.c:552 ksys_mmap_pgoff+0x1cc/0x278 mm/mmap.c:1586 __do_sys_mmap arch/arm64/kernel/sys.c:28 [inline] __se_sys_mmap arch/arm64/kernel/sys.c:21 [inline] __arm64_sys_mmap+0x58/0x6c arch/arm64/kernel/sys.c:21 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 -> #0 
(&mm->mmap_lock){++++}-{3:3}: check_prev_add kernel/locking/lockdep.c:3095 [inline] check_prevs_add kernel/locking/lockdep.c:3214 [inline] validate_chain kernel/locking/lockdep.c:3829 [inline] __lock_acquire+0x1530/0x30a4 kernel/locking/lockdep.c:5053 lock_acquire+0x100/0x1f8 kernel/locking/lockdep.c:5666 __might_fault+0x7c/0xb4 mm/memory.c:5577 _copy_to_user include/linux/uaccess.h:134 [inline] copy_to_user include/linux/uaccess.h:160 [inline] btrfs_ioctl_get_subvol_rootref+0x3a8/0x4bc fs/btrfs/ioctl.c:3203 btrfs_ioctl+0xa08/0xa64 fs/btrfs/ioctl.c:5556 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __arm64_sys_ioctl+0xd0/0x140 fs/ioctl.c:856 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 other info that might help us debug this: Chain exists of: &mm->mmap_lock --> &fs_info->reloc_mutex --> btrfs-root-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-root-00); lock(&fs_info->reloc_mutex); lock(btrfs-root-00); lock(&mm->mmap_lock); *** DEADLOCK *** 1 lock held by syz-executor307/3029: #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 stack backtrace: CPU: 0 PID: 3029 Comm: syz-executor307 Not tainted 6.0.0-rc7-syzkaller-18095-gbbed346d5a96 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/30/2022 Call trace: dump_backtrace+0x1c4/0x1f0 arch/arm64/kernel/stacktrace.c:156 show_stack+0x2c/0x54 arch/arm64/kernel/stacktrace.c:163 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x104/0x16c lib/dump_stack.c:106 dump_stack+0x1c/0x58 lib/dump_stack.c:113 print_circular_bug+0x2c4/0x2c8 kernel/locking/lockdep.c:2053 check_noncircular+0x14c/0x154 kernel/locking/lockdep.c:2175 check_prev_add kernel/locking/lockdep.c:3095 [inline] check_prevs_add kernel/locking/lockdep.c:3214 [inline] validate_chain kernel/locking/lockdep.c:3829 [inline] __lock_acquire+0x1530/0x30a4 kernel/locking/lockdep.c:5053 lock_acquire+0x100/0x1f8 kernel/locking/lockdep.c:5666 __might_fault+0x7c/0xb4 mm/memory.c:5577 _copy_to_user include/linux/uaccess.h:134 [inline] copy_to_user include/linux/uaccess.h:160 [inline] btrfs_ioctl_get_subvol_rootref+0x3a8/0x4bc fs/btrfs/ioctl.c:3203 btrfs_ioctl+0xa08/0xa64 fs/btrfs/ioctl.c:5556 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __arm64_sys_ioctl+0xd0/0x140 fs/ioctl.c:856 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 We do generally the right thing here, copying the references into a temporary buffer, however we are still 
holding the path when we do copy_to_user from the temporary buffer. Fix this by freeing the path before we copy to user space. Reported-by: syzbot+4ef9e52e464c6ff47d9d@syzkaller.appspotmail.com CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d5dd8bed1488..89b8d14cb68c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3194,6 +3194,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, } out: + btrfs_free_path(path); + if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ @@ -3205,7 +3207,6 @@ out: } kfree(rootrefs); - btrfs_free_path(path); return ret; } -- cgit From 418ffb9e3cf6c4e2574d3a732b724916684bd133 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:28 +0530 Subject: btrfs: free btrfs_path before copying inodes to userspace btrfs_ioctl_logical_to_ino() frees the search path after the userspace copy from the temp buffer @inodes, which can potentially lead to a lock splat. Fix this by freeing the path before we copy @inodes to userspace. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 89b8d14cb68c..b595f2c6dfc9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4282,21 +4282,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, size = min_t(u32, loi->size, SZ_16M); } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); - inodes = NULL; - goto out; + goto out_loi; } + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, inodes, ignore_offset); + btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4308,7 +4307,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, ret = -EFAULT; out: - btrfs_free_path(path); kvfree(inodes); out_loi: kfree(loi); -- cgit From 8cf96b409d9b3946ece58ced13f92d0f775b0442 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:29 +0530 Subject: btrfs: free btrfs_path before copying fspath to userspace btrfs_ioctl_ino_to_path() frees the search path after the userspace copy from the temp buffer @ipath->fspath, which can potentially lead to a lock splat warning. Fix this by freeing the path before we copy it to userspace.
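All three of these btrfs fixes apply the same ordering rule: release the btrfs_path, and with it the extent buffer locks it pins, before any copy_to_user() that can fault in user pages and take mmap_lock. Below is a minimal sketch of the pattern, not taken from any of the patches themselves; example_ioctl() and example_search() are hypothetical stand-ins for the ioctl-specific details:

	#include <linux/slab.h>
	#include <linux/uaccess.h>
	#include "ctree.h"

	/* Hypothetical helper: walks the tree with @path and fills @tmp. */
	static int example_search(struct btrfs_root *root,
				  struct btrfs_path *path, char *tmp)
	{
		return 0;
	}

	static long example_ioctl(struct btrfs_root *root, void __user *ubuf)
	{
		struct btrfs_path *path;
		char *tmp;
		long ret;

		path = btrfs_alloc_path();
		tmp = kzalloc(PAGE_SIZE, GFP_KERNEL);
		if (!path || !tmp) {
			ret = -ENOMEM;
			goto out;
		}

		/* Fill the temporary buffer while the tree locks are held. */
		ret = example_search(root, path, tmp);

		/*
		 * Drop the path, and hence the tree locks, before copying
		 * out; the copy may fault and take mmap_lock.
		 */
		btrfs_free_path(path);
		path = NULL;

		if (!ret && copy_to_user(ubuf, tmp, PAGE_SIZE))
			ret = -EFAULT;
	out:
		btrfs_free_path(path);	/* tolerates NULL */
		kfree(tmp);
		return ret;
	}

Nulling the path after the early free keeps the shared cleanup label idempotent, which is the same trick the ino_to_path and get_subvol_info hunks use below.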
CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b595f2c6dfc9..df5b893494fa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4232,6 +4232,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } + btrfs_free_path(path); + path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { -- cgit From 013c1c5585ebcfb19c88efe79063d0463b1b6159 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:31 +0530 Subject: btrfs: free btrfs_path before copying subvol info to userspace btrfs_ioctl_get_subvol_info() frees the search path after the userspace copy from the temp buffer @subvol_info. This can lead to a lock splat warning. Fix this by freeing the path before we copy it to userspace. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index df5b893494fa..5ba2e810dc6e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3105,6 +3105,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) } } + btrfs_free_path(path); + path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; -- cgit From 02aef4225258fa6d022ce9040716aeecc3afc521 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 10 Nov 2022 00:29:42 -0800 Subject: fscrypt: pass super_block to fscrypt_put_master_key_activeref() As this code confused Linus [1], pass the super_block as an argument to fscrypt_put_master_key_activeref(). This removes the need to have the back-pointer ->mk_sb, so remove that. [1] https://lore.kernel.org/linux-fscrypt/CAHk-=wgud4Bc_um+htgfagYpZAnOoCb3NUoW67hc9LhOKsMtJg@mail.gmail.com Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221110082942.351615-1-ebiggers@kernel.org --- fs/crypto/fscrypt_private.h | 13 ++++--------- fs/crypto/keyring.c | 14 ++++++-------- fs/crypto/keysetup.c | 2 +- 3 files changed, 11 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d5f68a0c5d15..316a778cec0f 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -439,13 +439,7 @@ struct fscrypt_master_key_secret { struct fscrypt_master_key { /* - * Back-pointer to the super_block of the filesystem to which this - * master key has been added. Only valid if ->mk_active_refs > 0. - */ - struct super_block *mk_sb; - - /* - * Link in ->mk_sb->s_master_keys->key_hashtable. + * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; @@ -456,7 +450,7 @@ struct fscrypt_master_key { /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring - * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g. + * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. 
* @@ -569,7 +563,8 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) void fscrypt_put_master_key(struct fscrypt_master_key *mk); -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk); +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 2a24b1f0ae68..78dd2ff306bd 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -79,10 +79,9 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk) call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk) { - struct super_block *sb = mk->mk_sb; - struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; if (!refcount_dec_and_test(&mk->mk_active_refs)) @@ -93,9 +92,9 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) * destroying any subkeys embedded in it. */ - spin_lock(&keyring->lock); + spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); - spin_unlock(&keyring->lock); + spin_unlock(&sb->s_master_keys->lock); /* * ->mk_active_refs == 0 implies that ->mk_secret is not present and @@ -243,7 +242,7 @@ void fscrypt_destroy_keyring(struct super_block *sb) WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); } } kfree_sensitive(keyring); @@ -424,7 +423,6 @@ static int add_new_master_key(struct super_block *sb, if (!mk) return -ENOMEM; - mk->mk_sb = sb; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; @@ -1068,7 +1066,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) err = -ENOKEY; if (is_master_key_secret_present(&mk->mk_secret)) { wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index f7407071a952..9e44dc078a81 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -509,7 +509,7 @@ static void put_crypt_info(struct fscrypt_info *ci) spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); -- cgit From 9a48b4a6fd512bdaed7e38ba844be743163d49c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sun, 6 Nov 2022 17:03:13 -0800 Subject: xfs: fully initialize xfs_da_args in xchk_directory_blocks While running the online fsck test suite, I noticed the following assertion in the kernel log (edited for brevity): XFS: Assertion failed: 0, file: fs/xfs/xfs_health.c, line: 571 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 11667 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs] CPU: 3 PID: 11667 Comm: xfs_scrub Tainted: G W 5.19.0-rc7-xfsx #rc7 6e6475eb29fd9dda3181f81b7ca7ff961d277a40 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:assfail+0x46/0x4a [xfs] Call Trace: xfs_dir2_isblock+0xcc/0xe0 xchk_directory_blocks+0xc7/0x420 xchk_directory+0x53/0xb0 xfs_scrub_metadata+0x2b6/0x6b0 xfs_scrubv_metadata+0x35e/0x4d0 xfs_ioc_scrubv_metadata+0x111/0x160 xfs_file_ioctl+0x4ec/0xef0 __x64_sys_ioctl+0x82/0xa0 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This assertion triggers in xfs_dirattr_mark_sick when the caller passes in a whichfork value that is neither of XFS_{DATA,ATTR}_FORK. The cause of this is that xchk_directory_blocks only partially initializes the xfs_da_args structure that is passed to xfs_dir2_isblock. If the data fork is not correct, the XFS_IS_CORRUPT clause will trigger. My development branch reports this failure to the health monitoring subsystem, which accesses the uninitialized args->whichfork field, leading the the assertion tripping. We really shouldn't be passing random stack contents around, so the solution here is to force the compiler to zero-initialize the struct. Found by fuzzing u3.bmx[0].blockcount = middlebit on xfs/1554. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/dir.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 5c87800ab223..d1b0f23c2c59 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -666,7 +666,12 @@ xchk_directory_blocks( struct xfs_scrub *sc) { struct xfs_bmbt_irec got; - struct xfs_da_args args; + struct xfs_da_args args = { + .dp = sc ->ip, + .whichfork = XFS_DATA_FORK, + .geo = sc->mp->m_dir_geo, + .trans = sc->tp, + }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; @@ -689,9 +694,6 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - args.dp = sc->ip; - args.geo = mp->m_dir_geo; - args.trans = sc->tp; error = xfs_dir2_isblock(&args, &is_block); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; -- cgit From be1317fdb8d4e3ccbac43e199b360c248c600d99 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: xfs: don't track the AGFL buffer in the scrub AG context While scrubbing an allocation group, we don't need to hold the AGFL buffer as part of the scrub context. All that is necessary to lock an AG is to hold the AGI and AGF buffers, so fix all the existing users of the AGFL buffer to grab them only when necessary. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 47 ++++++++++++++++++++++++++---------------- fs/xfs/scrub/agheader_repair.c | 1 - fs/xfs/scrub/common.c | 8 ------- fs/xfs/scrub/repair.c | 11 +++++----- fs/xfs/scrub/scrub.h | 1 - 5 files changed, 35 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index b7b838bd4ba4..af284baa6f4c 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -609,9 +609,16 @@ out: /* AGFL */ struct xchk_agfl_info { - unsigned int sz_entries; + /* Number of AGFL entries that the AGF claims are in use. */ + unsigned int agflcount; + + /* Number of AGFL entries that we found. */ unsigned int nr_entries; + + /* Buffer to hold AGFL entries for extent checking. */ xfs_agblock_t *entries; + + struct xfs_buf *agfl_bp; struct xfs_scrub *sc; }; @@ -641,10 +648,10 @@ xchk_agfl_block( struct xfs_scrub *sc = sai->sc; if (xfs_verify_agbno(sc->sa.pag, agbno) && - sai->nr_entries < sai->sz_entries) + sai->nr_entries < sai->agflcount) sai->entries[sai->nr_entries++] = agbno; else - xchk_block_set_corrupt(sc, sc->sa.agfl_bp); + xchk_block_set_corrupt(sc, sai->agfl_bp); xchk_agfl_block_xref(sc, agbno); @@ -696,19 +703,26 @@ int xchk_agfl( struct xfs_scrub *sc) { - struct xchk_agfl_info sai; + struct xchk_agfl_info sai = { + .sc = sc, + }; struct xfs_agf *agf; xfs_agnumber_t agno = sc->sm->sm_agno; - unsigned int agflcount; unsigned int i; int error; + /* Lock the AGF and AGI so that nobody can touch this AG. */ error = xchk_ag_read_headers(sc, agno, &sc->sa); if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) - goto out; + return error; if (!sc->sa.agf_bp) return -EFSCORRUPTED; - xchk_buffer_recheck(sc, sc->sa.agfl_bp); + + /* Try to read the AGFL, and verify its structure if we get it. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp); + if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + return error; + xchk_buffer_recheck(sc, sai.agfl_bp); xchk_agfl_xref(sc); @@ -717,24 +731,21 @@ xchk_agfl( /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = sc->sa.agf_bp->b_addr; - agflcount = be32_to_cpu(agf->agf_flcount); - if (agflcount > xfs_agfl_size(sc->mp)) { + sai.agflcount = be32_to_cpu(agf->agf_flcount); + if (sai.agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } - memset(&sai, 0, sizeof(sai)); - sai.sc = sc; - sai.sz_entries = agflcount; - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, - KM_MAYFAIL); + sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!sai.entries) { error = -ENOMEM; goto out; } /* Check the blocks in the AGFL. 
*/ - error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, - sc->sa.agfl_bp, xchk_agfl_block, &sai); + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp, + xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; goto out_free; @@ -742,7 +753,7 @@ xchk_agfl( if (error) goto out_free; - if (agflcount != sai.nr_entries) { + if (sai.agflcount != sai.nr_entries) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out_free; } @@ -758,7 +769,7 @@ xchk_agfl( } out_free: - kmem_free(sai.entries); + kvfree(sai.entries); out: return error; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 1b0b4e243f77..2e75ff9b5b2e 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -697,7 +697,6 @@ xrep_agfl( * freespace overflow to the freespace btrees. */ sc->sa.agf_bp = agf_bp; - sc->sa.agfl_bp = agfl_bp; error = xrep_roll_ag_trans(sc); if (error) goto err; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 9bbbf20f401b..ad70f29233c3 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -424,10 +424,6 @@ xchk_ag_read_headers( if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; - error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); - if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) - return error; - return 0; } @@ -515,10 +511,6 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); - if (sa->agfl_bp) { - xfs_trans_brelse(sc->tp, sa->agfl_bp); - sa->agfl_bp = NULL; - } if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c18bd039fce9..2ada7fc1c398 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -126,8 +126,6 @@ xrep_roll_ag_trans( xfs_trans_bhold(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) xfs_trans_bhold(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); /* * Roll the transaction. We still own the buffer and the buffer lock @@ -145,8 +143,6 @@ xrep_roll_ag_trans( xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); return 0; } @@ -498,6 +494,7 @@ xrep_put_freelist( struct xfs_scrub *sc, xfs_agblock_t agbno) { + struct xfs_buf *agfl_bp; int error; /* Make sure there's space on the freelist. */ @@ -516,8 +513,12 @@ xrep_put_freelist( return error; /* Put the block on the AGFL. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, - sc->sa.agfl_bp, agbno, 0); + agfl_bp, agbno, 0); if (error) return error; xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 3de5287e98d8..151567f88366 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -39,7 +39,6 @@ struct xchk_ag { /* AG btree roots */ struct xfs_buf *agf_bp; - struct xfs_buf *agfl_bp; struct xfs_buf *agi_bp; /* AG btrees */ -- cgit From 3e59c0103e66d6e687a8b47fd70169542aba938e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: xfs: log the AGI/AGF buffers when rolling transactions during an AG repair Currently, the only way to lock an allocation group is to hold the AGI and AGF buffers. 
If a repair needs to roll the transaction while repairing some AG metadata, it maintains that lock by holding the two buffers across the transaction roll and joins them afterwards. However, repair is not like other parts of XFS that employ the bhold - roll - bjoin sequence because it's possible that the AGI or AGF buffers are not actually dirty before the roll. This presents two problems -- First, we need to redirty those buffers to keep them moving along in the log to avoid pinning the log tail. Second, a clean buffer log item can detach from the buffer. If this happens, the buffer type state is discarded along with the bli and must be reattached before the next time the buffer is logged. If it is not, the logging code will complain and log recovery will not work properly. An earlier version of this patch tried to fix the second problem by re-setting the buffer type in the bli after joining the buffer to the new transaction, but that looked weird and didn't solve the first problem. Instead, solve both problems by logging the buffer before rolling the transaction. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 2ada7fc1c398..22335619c84e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -121,24 +121,36 @@ xrep_roll_ag_trans( { int error; - /* Keep the AG header buffers locked so we can keep going. */ - if (sc->sa.agi_bp) + /* + * Keep the AG header buffers locked while we roll the transaction. + * Ensure that both AG buffers are dirty and held when we roll the + * transaction so that they move forward in the log without losing the + * bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agi_bp); - if (sc->sa.agf_bp) + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + } /* - * Roll the transaction. We still own the buffer and the buffer lock - * regardless of whether or not the roll succeeds. If the roll fails, - * the buffers will be released during teardown on our way out of the - * kernel. If it succeeds, we join them to the new transaction and - * move on. + * Roll the transaction. We still hold the AG header buffers locked + * regardless of whether or not that succeeds. On failure, the buffers + * will be released during teardown on our way out of the kernel. If + * successful, join the buffers to the new transaction and move on. */ error = xfs_trans_roll(&sc->tp); if (error) return error; - /* Join AG headers to the new transaction. */ + /* Join the AG headers to the new transaction. */ if (sc->sa.agi_bp) xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) -- cgit From b255fab0f80cc65a334fcd90cd278673cddbc988 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: xfs: make AGFL repair function avoid crosslinked blocks Teach the AGFL repair function to check each block of the proposed AGFL against the rmap btree. If the rmapbt finds any mappings that are not OWN_AG, strike that block from the list. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader_repair.c | 78 +++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 2e75ff9b5b2e..82ceb60ea5fc 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -442,12 +442,18 @@ out_revert: /* AGFL */ struct xrep_agfl { + /* Bitmap of alleged AGFL blocks that we're not going to add. */ + struct xbitmap crossed; + /* Bitmap of other OWN_AG metadata blocks. */ struct xbitmap agmetablocks; /* Bitmap of free space. */ struct xbitmap *freesp; + /* rmapbt cursor for finding crosslinked blocks */ + struct xfs_btree_cur *rmap_cur; + struct xfs_scrub *sc; }; @@ -477,6 +483,41 @@ xrep_agfl_walk_rmap( return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } +/* Strike out the blocks that are cross-linked according to the rmapbt. */ +STATIC int +xrep_agfl_check_extent( + struct xrep_agfl *ra, + uint64_t start, + uint64_t len) +{ + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); + xfs_agblock_t last_agbno = agbno + len - 1; + int error; + + ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); + + while (agbno <= last_agbno) { + bool other_owners; + + error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1, + &XFS_RMAP_OINFO_AG, &other_owners); + if (error) + return error; + + if (other_owners) { + error = xbitmap_set(&ra->crossed, agbno, 1); + if (error) + return error; + } + + if (xchk_should_terminate(ra->sc, &error)) + return error; + agbno++; + } + + return 0; +} + /* * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce * which blocks belong to the AGFL. @@ -496,44 +537,58 @@ xrep_agfl_collect_blocks( struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; + struct xbitmap_range *br, *n; int error; ra.sc = sc; ra.freesp = agfl_extents; xbitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.crossed); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; - xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); - xbitmap_destroy(&ra.agmetablocks); if (error) - return error; + goto out_bmp; + + /* Strike out the blocks that are cross-linked. */ + ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + for_each_xbitmap_extent(br, n, agfl_extents) { + error = xrep_agfl_check_extent(&ra, br->start, br->len); + if (error) + break; + } + xfs_btree_del_cursor(ra.rmap_cur, error); + if (error) + goto out_bmp; + error = xbitmap_disunion(agfl_extents, &ra.crossed); + if (error) + goto out_bmp; /* * Calculate the new AGFL size. 
If we found more blocks than fit in @@ -541,11 +596,10 @@ xrep_agfl_collect_blocks( */ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), xfs_agfl_size(mp)); - return 0; -err: +out_bmp: + xbitmap_destroy(&ra.crossed); xbitmap_destroy(&ra.agmetablocks); - xfs_btree_del_cursor(cur, error); return error; } -- cgit From 48ff40458f871fb19e7b1b40e0e5084b8751d9cb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:15 -0800 Subject: xfs: standardize GFP flags usage in online scrub Memory allocation usage is the same throughout online fsck -- we want kernel memory, we have to be able to back out if we can't allocate memory, and we don't want to spray dmesg with memory allocation failure reports. Standardize the GFP flag usage and document these requirements. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 2 +- fs/xfs/scrub/attr.c | 9 ++++----- fs/xfs/scrub/scrub.h | 9 +++++++++ fs/xfs/scrub/symlink.c | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index af284baa6f4c..4dd52b15f09c 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -737,7 +737,7 @@ xchk_agfl( goto out; } sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + XCHK_GFP_FLAGS); if (!sai.entries) { error = -ENOMEM; goto out; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index b6f0c9f3f124..11b2593a2be7 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -79,7 +79,8 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, + XCHK_GFP_FLAGS); if (error) return error; } @@ -138,8 +139,7 @@ xchk_xattr_listent( * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -324,8 +324,7 @@ xchk_xattr_block( return 0; /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); if (error == -ENOMEM) return -EDEADLOCK; if (error) diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 151567f88366..a0f097b8acb0 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,15 @@ struct xfs_scrub; +/* + * Standard flags for allocating memory within scrub. NOFS context is + * configured by the process allocation scope. Scrub and repair must be able + * to back out gracefully if there isn't enough memory. Force-cast to avoid + * complaints from static checkers. + */ +#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ + __GFP_RETRY_MAYFAIL)) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 75311f8daeeb..c1c99ffe7408 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -21,7 +21,7 @@ xchk_setup_symlink( struct xfs_scrub *sc) { /* Allocate the buffer without the inode lock held. 
*/ - sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; -- cgit From fcd2a43488d5a211aec94e28369b2a72c28258a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:15 -0800 Subject: xfs: initialize the check_owner object fully Initialize the check_owner list head so that we don't corrupt the list. Reduce the scope of the object pointer. Fixes: 858333dcf021 ("xfs: check btree block ownership with bnobt/rmapbt when scrubbing btree") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/btree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 2f4519590dc1..075ff3071122 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -408,7 +408,6 @@ xchk_btree_check_owner( struct xfs_buf *bp) { struct xfs_btree_cur *cur = bs->cur; - struct check_owner *co; /* * In theory, xfs_btree_get_block should only give us a null buffer @@ -431,10 +430,14 @@ xchk_btree_check_owner( * later scanning. */ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + struct check_owner *co; + co = kmem_alloc(sizeof(struct check_owner), KM_MAYFAIL); if (!co) return -ENOMEM; + + INIT_LIST_HEAD(&co->list); co->level = level; co->daddr = xfs_buf_daddr(bp); list_add_tail(&co->list, &bs->to_check); -- cgit From 306195f355bbdcc3eff6cffac05bcd93a5e419ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:16 -0800 Subject: xfs: pivot online scrub away from kmem.[ch] Convert all the online scrub code to use the Linux slab allocator functions directly instead of going through the kmem wrappers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader_repair.c | 2 +- fs/xfs/scrub/attr.c | 2 +- fs/xfs/scrub/bitmap.c | 11 ++++++----- fs/xfs/scrub/btree.c | 9 ++++----- fs/xfs/scrub/dabtree.c | 4 ++-- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/refcount.c | 12 ++++++------ fs/xfs/scrub/scrub.c | 6 +++--- 8 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 82ceb60ea5fc..d75d82151eeb 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -685,7 +685,7 @@ xrep_agfl_init_header( if (br->len) break; list_del(&br->list); - kmem_free(br); + kfree(br); } /* Write new AGFL to disk. */ diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 11b2593a2be7..31529b9bf389 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -49,7 +49,7 @@ xchk_setup_xattr_buf( if (ab) { if (sz <= ab->sz) return 0; - kmem_free(ab); + kvfree(ab); sc->buf = NULL; } diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index b89bf9de9b1c..a255f09e9f0a 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "scrub/scrub.h" #include "scrub/bitmap.h" /* @@ -25,7 +26,7 @@ xbitmap_set( { struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); + bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); if (!bmr) return -ENOMEM; @@ -47,7 +48,7 @@ xbitmap_destroy( for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); - kmem_free(bmr); + kfree(bmr); } } @@ -174,15 +175,15 @@ xbitmap_disunion( /* Total overlap, just delete ex. 
*/ lp = lp->next; list_del(&br->list); - kmem_free(br); + kfree(br); break; case 0: /* * Deleting from the middle: add the new right extent * and then shrink the left extent. */ - new_br = kmem_alloc(sizeof(struct xbitmap_range), - KM_MAYFAIL); + new_br = kmalloc(sizeof(struct xbitmap_range), + XCHK_GFP_FLAGS); if (!new_br) { error = -ENOMEM; goto out; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 075ff3071122..0fd36d5b4646 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -432,8 +432,7 @@ xchk_btree_check_owner( if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { struct check_owner *co; - co = kmem_alloc(sizeof(struct check_owner), - KM_MAYFAIL); + co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); if (!co) return -ENOMEM; @@ -652,7 +651,7 @@ xchk_btree( xchk_btree_set_corrupt(sc, cur, 0); return 0; } - bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL); + bs = kzalloc(cur_sz, XCHK_GFP_FLAGS); if (!bs) return -ENOMEM; bs->cur = cur; @@ -743,9 +742,9 @@ out: error = xchk_btree_check_block_owner(bs, co->level, co->daddr); list_del(&co->list); - kmem_free(co); + kfree(co); } - kmem_free(bs); + kfree(bs); return error; } diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 84fe3d33d699..d17cee177085 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -486,7 +486,7 @@ xchk_da_btree( return 0; /* Set up initial da state. */ - ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL); + ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS); if (!ds) return -ENOMEM; ds->dargs.dp = sc->ip; @@ -591,6 +591,6 @@ out: out_state: xfs_da_state_free(ds->state); - kmem_free(ds); + kfree(ds); return error; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 6a6f8fe7f87c..3c56f5890da4 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -116,7 +116,7 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; fsc = sc->buf; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index a26ee0f24ef2..d9c1b3cea4a5 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -127,8 +127,8 @@ xchk_refcountbt_rmap_check( * is healthy each rmap_irec we see will be in agbno order * so we don't need insertion sort here. */ - frag = kmem_alloc(sizeof(struct xchk_refcnt_frag), - KM_MAYFAIL); + frag = kmalloc(sizeof(struct xchk_refcnt_frag), + XCHK_GFP_FLAGS); if (!frag) return -ENOMEM; memcpy(&frag->rm, rec, sizeof(frag->rm)); @@ -215,7 +215,7 @@ xchk_refcountbt_process_rmap_fragments( continue; } list_del(&frag->list); - kmem_free(frag); + kfree(frag); nr++; } @@ -257,11 +257,11 @@ done: /* Delete fragments and work list. 
*/ list_for_each_entry_safe(frag, n, &worklist, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } list_for_each_entry_safe(frag, n, &refchk->fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } @@ -306,7 +306,7 @@ xchk_refcountbt_xref_rmap( out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 2e8e400f10a9..07a7a75f987f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -174,7 +174,7 @@ xchk_teardown( if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->buf) { - kmem_free(sc->buf); + kvfree(sc->buf); sc->buf = NULL; } return error; @@ -467,7 +467,7 @@ xfs_scrub_metadata( xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); - sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); + sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; goto out; @@ -557,7 +557,7 @@ out_nofix: out_teardown: error = xchk_teardown(sc, error); out_sc: - kmem_free(sc); + kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); if (error == -EFSCORRUPTED || error == -EFSBADCRC) { -- cgit From a7a0f9a5503f4da3b6489583ce4ef9abc0ab2475 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:16 -0800 Subject: xfs: return EINTR when a fatal signal terminates scrub If the program calling online fsck is terminated with a fatal signal, bail out to userspace by returning EINTR, not EAGAIN. EAGAIN is used by scrubbers to indicate that we should try again with more resources locked, and not to indicate that the operation was cancelled. The miswiring is mostly harmless, but it shows up in the trace data. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 454145db10e7..b73648d81d23 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -25,7 +25,7 @@ xchk_should_terminate( if (fatal_signal_pending(current)) { if (*error == 0) - *error = -EAGAIN; + *error = -EINTR; return true; } return false; -- cgit From 0a713bd41ea2b19904232b9c5278012c4361bc04 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: xfs: fix return code when fatal signal encountered during dquot scrub If the scrub process is sent a fatal signal while we're checking dquots, the predicate for this will set the error code to -EINTR. Don't then squash that into -ECANCELED, because the wrong errno turns up in the trace output. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 21b4c9006859..0b643ff32b22 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -84,7 +84,7 @@ xchk_quota_item( int error = 0; if (xchk_should_terminate(sc, &error)) - return -ECANCELED; + return error; /* * Except for the root dquot, the actual dquot we got must either have -- cgit From 6bf2f87915970160ded16c310e2e8887deff97a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: xfs: don't retry repairs harder when EAGAIN is returned Repair functions will not return EAGAIN -- if they were not able to obtain resources, they should return EDEADLOCK (like the rest of online fsck) to signal that we need to grab all the resources and try again. Hence we don't need to deal with this case except as a debugging assertion. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 22335619c84e..7323bd9fddfb 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -61,7 +61,6 @@ xrep_attempt( sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; case -EDEADLOCK: - case -EAGAIN: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { sc->flags |= XCHK_TRY_HARDER; @@ -74,6 +73,11 @@ xrep_attempt( */ return -EFSCORRUPTED; default: + /* + * EAGAIN tells the caller to re-scrub, so we cannot return + * that here. + */ + ASSERT(error != -EAGAIN); return error; } } -- cgit From 93b0c58ed04b6cbe45354f23bb5628fff31f9084 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: xfs: don't return -EFSCORRUPTED from repair when resources cannot be grabbed If we tried to repair something but the repair failed with -EDEADLOCK, that means that the repair function couldn't grab some resource it needed and wants us to try again. If we try again (with TRY_HARDER) but still can't get all the resources we need, the repair fails and errors remain on the filesystem. Right now, repair returns the -EDEADLOCK to the caller as -EFSCORRUPTED, which results in XFS_SCRUB_OFLAG_CORRUPT being passed out to userspace. This is not correct because repair has not determined that anything is corrupt. If the repair had been invoked on an object that could be optimized but wasn't corrupt (OFLAG_PREEN), the inability to grab resources will be reported to userspace as corrupt metadata, and users will be unnecessarily alarmed that their suboptimal metadata turned into a corruption. Fix this by returning zero so that the results of the actual scrub will be copied back out to userspace. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 7323bd9fddfb..4b92f9253ccd 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -69,9 +69,9 @@ xrep_attempt( /* * We tried harder but still couldn't grab all the resources * we needed to fix it. The corruption has not been fixed, - * so report back to userspace. + * so exit to userspace with the scan's output flags unchanged. */ - return -EFSCORRUPTED; + return 0; default: /* * EAGAIN tells the caller to re-scrub, so we cannot return -- cgit From 9e13975bb0620c2bfa1a4d2943e7eb8514f7708e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:18 -0800 Subject: xfs: load rtbitmap and rtsummary extent mapping btrees at mount time It turns out that GETFSMAP and online fsck have had a bug for years due to their use of ILOCK_SHARED to coordinate their linear scans of the realtime bitmap. 
If the bitmap file's data fork happens to be in BTREE format and the scan occurs immediately after mounting, the incore bmbt will not be populated, leading to ASSERTs tripping over the incorrect inode state. Because the bitmap scans always lock bitmap buffers in increasing order of file offset, it is appropriate for these two callers to take a shared ILOCK to improve scalability. To fix this problem, load both data and attr fork state into memory when mounting the realtime inodes. Realtime metadata files aren't supposed to have an attr fork so the second step is likely a nop. On most filesystems this is unlikely since the rtbitmap data fork is usually in extents format, but it's possible to craft a filesystem that will by fragmenting the free space in the data section and growfsing the rt section. Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap") Also-Fixes: 46d9bfb5e706 ("xfs: cross-reference the realtime bitmap") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_rtalloc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 292d5e54a92c..b0846204c436 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1325,6 +1325,41 @@ xfs_rtalloc_reinit_frextents( return 0; } +/* + * Read in the bmbt of an rt metadata inode so that we never have to load them + * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use + * an empty transaction to avoid deadlocking on loops in the bmbt. + */ +static inline int +xfs_rtmount_iread_extents( + struct xfs_inode *ip, + unsigned int lock_class) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class); + + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + if (xfs_inode_has_attr_fork(ip)) { + error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); + if (error) + goto out_unlock; + } + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class); + xfs_trans_cancel(tp); + return error; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. @@ -1342,14 +1377,27 @@ xfs_rtmount_inodes( return error; ASSERT(mp->m_rbmip != NULL); + error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP); + if (error) + goto out_rele_bitmap; + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); - if (error) { - xfs_irele(mp->m_rbmip); - return error; - } + if (error) + goto out_rele_bitmap; ASSERT(mp->m_rsumip != NULL); + + error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM); + if (error) + goto out_rele_summary; + xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); return 0; + +out_rele_summary: + xfs_irele(mp->m_rsumip); +out_rele_bitmap: + xfs_irele(mp->m_rbmip); + return error; } void -- cgit From 5f369dc5b4eb2becbdfd08924dcaf00e391f4ea1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:18 -0800 Subject: xfs: make rtbitmap ILOCKing consistent when scanning the rt bitmap file xfs_rtalloc_query_range scans the realtime bitmap file in order of increasing file offset, so this caller can take ILOCK_SHARED on the rt bitmap inode instead of ILOCK_EXCL. This isn't going to yield any practical benefits at mount time, but we'd like to make the locking usage consistent around xfs_rtalloc_query_all calls. 
Make all the places we do this use the same xfs_ilock lockflags for consistency. Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsmap.c | 4 ++-- fs/xfs/xfs_rtalloc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index d8337274c74d..88a88506ffff 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -524,7 +524,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_mount *mp = tp->t_mountp; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); /* * Set up query parameters to return free rtextents covering the range @@ -551,7 +551,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( if (error) goto err; err: - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b0846204c436..16534e9873f6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1311,10 +1311,10 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, &val); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); if (error) return error; -- cgit From 11f97e684583469fc342a561387cc44fac4f9b1f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: xfs: skip fscounters comparisons when the scan is incomplete If any part of the per-AG summary counter scan loop aborts without collecting all of the data we need, the scrubber's observation data will be invalid. Set the incomplete flag so that we abort the scrub without reporting false corruptions. Document the data dependency here too. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/fscounters.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 3c56f5890da4..eeb36ac2c6d2 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -138,6 +138,18 @@ xchk_setup_fscounters( return xchk_trans_alloc(sc, 0); } +/* + * Part 1: Collecting filesystem summary counts. For each AG, we add its + * summary counts (total inodes, free inodes, free data blocks) to an incore + * copy of the overall filesystem summary counts. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. + */ + /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ static int xchk_fscount_btreeblks( @@ -225,8 +237,10 @@ retry: } if (pag) xfs_perag_put(pag); - if (error) + if (error) { + xchk_set_incomplete(sc); return error; + } /* * The global incore space reservation is taken from the incore @@ -267,6 +281,11 @@ retry: return 0; } +/* + * Part 2: Comparing filesystem summary counters. All we have to do here is + * sum the percpu counters and compare them to what we've observed. 
+ */ + /* * Is the @counter reasonably close to the @expected value? * -- cgit From e74331d6fa2c21a8ecccfe0648dad5193b83defe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: xfs: online checking of the free rt extent count Teach the summary count checker to count the number of free realtime extents and compare that to the superblock copy. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/fscounters.c | 86 +++++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/scrub.h | 8 ----- 2 files changed, 84 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index eeb36ac2c6d2..4777e7b89fdc 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -14,6 +14,8 @@ #include "xfs_health.h" #include "xfs_btree.h" #include "xfs_ag.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -43,6 +45,16 @@ * our tolerance for mismatch between expected and actual counter values. */ +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; +}; + /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. However, @@ -120,6 +132,7 @@ xchk_setup_fscounters( if (!sc->buf) return -ENOMEM; fsc = sc->buf; + fsc->sc = sc; xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); @@ -281,6 +294,59 @@ retry: return 0; } +#ifdef CONFIG_XFS_RT +STATIC int +xchk_fscount_add_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xchk_fscounters *fsc = priv; + int error = 0; + + fsc->frextents += rec->ar_extcount; + + xchk_should_terminate(fsc->sc, &error); + return error; +} + +/* Calculate the number of free realtime extents from the realtime bitmap. */ +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + int error; + + fsc->frextents = 0; + if (!xfs_has_realtime(mp)) + return 0; + + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, + xchk_fscount_add_frextent, fsc); + if (error) { + xchk_set_incomplete(sc); + goto out_unlock; + } + +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} +#else +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + fsc->frextents = 0; + return 0; +} +#endif /* CONFIG_XFS_RT */ + /* * Part 2: Comparing filesystem summary counters. All we have to do here is * sum the percpu counters and compare them to what we've observed. @@ -352,16 +418,17 @@ xchk_fscounters( { struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; - int64_t icount, ifree, fdblocks; + int64_t icount, ifree, fdblocks, frextents; int error; /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); fdblocks = percpu_counter_sum(&mp->m_fdblocks); + frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0) + if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) xchk_set_corrupt(sc); /* See if icount is obviously wrong. 
*/ @@ -372,6 +439,10 @@ xchk_fscounters( if (fdblocks > mp->m_sb.sb_dblocks) xchk_set_corrupt(sc); + /* See if frextents is obviously wrong. */ + if (frextents > mp->m_sb.sb_rextents) + xchk_set_corrupt(sc); + /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. @@ -386,6 +457,13 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; + /* Count the free extents counter for rt volumes. */ + error = xchk_fscount_count_frextents(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + /* Compare the in-core counters with whatever we counted. */ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) xchk_set_corrupt(sc); @@ -397,5 +475,9 @@ xchk_fscounters( fsc->fdblocks)) xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + fsc->frextents)) + xchk_set_corrupt(sc); + return 0; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a0f097b8acb0..b4d391b4c938 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -169,12 +169,4 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) #endif -struct xchk_fscounters { - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - unsigned long long icount_min; - unsigned long long icount_max; -}; - #endif /* __XFS_SCRUB_SCRUB_H__ */ -- cgit From 033985b6fe875a7a971cf4e3941e1f3085ba037c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: xfs: fix perag loop in xchk_bmap_check_rmaps sparse complains that we can return an uninitialized error from this function and that pag could be uninitialized. We know that there are no zero-AG filesystems and hence we had to call xchk_bmap_check_ag_rmaps at least once, so this is not actually possible, but I'm too worn out from automated complaints from unsophisticated AIs so let's just fix this and move on to more interesting problems, eh? Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f0b9cb6506fd..cb203e083a4c 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -599,14 +599,14 @@ xchk_bmap_check_rmaps( for_each_perag(sc->mp, agno, pag) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); - if (error) - break; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - break; + if (error || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xfs_perag_put(pag); + return error; + } } - if (pag) - xfs_perag_put(pag); - return error; + + return 0; } /* -- cgit From 6a5777865eebee1b53d7ae0fd2fa9ec2c6596df6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:20 -0800 Subject: xfs: teach scrub to check for adjacent bmaps when rmap larger than bmap When scrub is checking file fork mappings against rmap records and the rmap record starts before or ends after the bmap record, check the adjacent bmap records to make sure that they're adjacent to the one we're checking. This helps us to detect cases where the rmaps cover territory that the bmaps do not. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index cb203e083a4c..a4a156dcaa8a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -90,6 +90,7 @@ out: struct xchk_bmap_info { struct xfs_scrub *sc; + struct xfs_iext_cursor icur; xfs_fileoff_t lastoff; bool is_rt; bool is_shared; @@ -146,6 +147,48 @@ xchk_bmap_get_rmap( return has_rmap; } +static inline bool +xchk_bmap_has_prev( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) + return false; + if (got.br_startoff + got.br_blockcount != irec->br_startoff) + return false; + if (got.br_startblock + got.br_blockcount != irec->br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + +static inline bool +xchk_bmap_has_next( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) + return false; + if (irec->br_startoff + irec->br_blockcount != got.br_startoff) + return false; + if (irec->br_startblock + irec->br_blockcount != got.br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + /* Make sure that we have rmapbt records for this extent. */ STATIC void xchk_bmap_xref_rmap( @@ -214,6 +257,34 @@ xchk_bmap_xref_rmap( if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + + /* + * If the rmap starts before this bmbt record, make sure there's a bmbt + * record for the previous offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && + !xchk_bmap_has_prev(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } + + /* + * If the rmap ends after this bmbt record, make sure there's a bmbt + * record for the next offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (info->whichfork != XFS_COW_FORK && + rmap_end > agbno + irec->br_blockcount && + !xchk_bmap_has_next(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } } /* Cross-reference a single rtdev extent record. */ @@ -626,7 +697,6 @@ xchk_bmap( struct xfs_inode *ip = sc->ip; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t endoff; - struct xfs_iext_cursor icur; int error = 0; /* Non-existent forks can be ignored. */ @@ -690,7 +760,7 @@ xchk_bmap( /* Scrub extent records. 
*/ info.lastoff = 0; ifp = xfs_ifork_ptr(ip, whichfork); - for_each_xfs_iext(ifp, &icur, &irec) { + for_each_xfs_iext(ifp, &info.icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; -- cgit From 830ffa09fb130d31f111848a75b65506fdac623d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:20 -0800 Subject: xfs: block map scrub should handle incore delalloc reservations Enhance the block map scrubber to check delayed allocation reservations. Though there are no physical space allocations to check, we do need to make sure that the range of file offsets being mapped are correct, and to bump the lastoff cursor so that key order checking works correctly. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 55 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index a4a156dcaa8a..fa8f22ed057a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -368,14 +368,13 @@ xchk_bmap_dirattr_extent( } /* Scrub a single extent record. */ -STATIC int +STATIC void xchk_bmap_iextent( struct xfs_inode *ip, struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; - int error = 0; /* * Check for out-of-order extents. This record could have come @@ -396,14 +395,6 @@ xchk_bmap_iextent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - /* - * Check for delalloc extents. We never iterate the ones in the - * in-core extent scan, and we should never see these in the bmbt. - */ - if (isnullstartblock(irec->br_startblock)) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); - /* Make sure the extent points to a valid place. */ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, @@ -424,15 +415,12 @@ xchk_bmap_iextent( irec->br_startoff); if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return 0; + return; if (info->is_rt) xchk_bmap_rt_iextent_xref(ip, info, irec); else xchk_bmap_iextent_xref(ip, info, irec); - - info->lastoff = irec->br_startoff + irec->br_blockcount; - return error; } /* Scrub a bmbt record. */ @@ -680,6 +668,33 @@ xchk_bmap_check_rmaps( return 0; } +/* Scrub a delalloc reservation from the incore extent map tree. */ +STATIC void +xchk_bmap_iextent_delalloc( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. + */ + if (irec->br_startoff < info->lastoff) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Make sure the extent points to a valid place. */ + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); +} + /* * Scrub an inode fork's block mappings. 
* @@ -764,16 +779,18 @@ xchk_bmap( if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; - if (isnullstartblock(irec.br_startblock)) - continue; + if (irec.br_startoff >= endoff) { xchk_fblock_set_corrupt(sc, whichfork, irec.br_startoff); goto out; } - error = xchk_bmap_iextent(ip, &info, &irec); - if (error) - goto out; + + if (isnullstartblock(irec.br_startblock)) + xchk_bmap_iextent_delalloc(ip, &info, &irec); + else + xchk_bmap_iextent(ip, &info, &irec); + info.lastoff = irec.br_startoff + irec.br_blockcount; } error = xchk_bmap_check_rmaps(sc, whichfork); -- cgit From f23c40443d1c2af87c99c1424f9e43bbd7307f92 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:21 -0800 Subject: xfs: check quota files for unwritten extents Teach scrub to flag quota files containing unwritten extents. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/quota.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 0b643ff32b22..9eeac8565394 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -14,6 +14,7 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -189,11 +190,12 @@ xchk_quota_data_fork( for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) break; + /* - * delalloc extents or blocks mapped above the highest + * delalloc/unwritten extents or blocks mapped above the highest * quota id shouldn't happen. */ - if (isnullstartblock(irec.br_startblock) || + if (!xfs_bmap_is_written_extent(&irec) || irec.br_startoff > max_dqid_off || irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, -- cgit From 31785537010a91a0d1dc403e5d049a38a3d4a30b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:21 -0800 Subject: xfs: check that CoW fork extents are not shared Ensure that extents in an inode's CoW fork are not marked as shared in the refcount btree. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fa8f22ed057a..4bb6672b02ba 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -335,6 +335,8 @@ xchk_bmap_iextent_xref( case XFS_COW_FORK: xchk_xref_is_cow_staging(info->sc, agbno, irec->br_blockcount); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); break; } -- cgit From 5eef46358fae1a6018d9f886a3ecd30e843728dd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:22 -0800 Subject: xfs: teach scrub to flag non-extents format cow forks CoW forks only exist in memory, which means that they can only ever have an incore extent tree. Hence they must always be FMT_EXTENTS, so check this when we're scrubbing them. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 4bb6672b02ba..d50d0eab196a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -748,6 +748,8 @@ xchk_bmap( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: /* No mappings to check. 
*/ + if (whichfork == XFS_COW_FORK) + xchk_fblock_set_corrupt(sc, whichfork, 0); goto out; case XFS_DINODE_FMT_EXTENTS: break; -- cgit From bd5ab5f9874109586cbae5bc98e1f9ff574627e2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 16:08:03 -0800 Subject: xfs: don't warn about files that are exactly s_maxbytes long We can handle files that are exactly s_maxbytes bytes long; we just can't handle anything larger than that. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 51820b40ab1c..7a2f38e5202c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -365,7 +365,7 @@ xchk_dinode( * pagecache can't cache all the blocks in this file due to * overly large offsets, flag the inode for admin review. */ - if (isize >= mp->m_super->s_maxbytes) + if (isize > mp->m_super->s_maxbytes) xchk_ino_set_warning(sc, ino); /* di_nblocks */ -- cgit From f36b954a1f1bf06b5746fea7ecf0fa639ac65324 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 16:08:03 -0800 Subject: xfs: check inode core when scrubbing metadata files Metadata files (e.g. realtime bitmaps and quota files) do not show up in the bulkstat output, which means that scrub-by-handle does not work; they can only be checked through a specific scrub type. Therefore, each scrub type calls xchk_metadata_inode_forks to check the metadata for whatever's in the file. Unfortunately, that function doesn't actually check the inode record itself. Refactor the function a bit to make that happen. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/common.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index ad70f29233c3..613260b04a3d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -781,6 +781,33 @@ xchk_buffer_recheck( trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); } +static inline int +xchk_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + int error; + + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + break; + } + + sc->sm->sm_type = smtype; + return error; +} + /* * Scrub the attr/data forks of a metadata inode. The metadata inode must be * pointed to by sc->ip and the ILOCK must be held. @@ -789,13 +816,17 @@ int xchk_metadata_inode_forks( struct xfs_scrub *sc) { - __u32 smtype; bool shared; int error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + /* Check the inode record. */ + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + /* Metadata inodes don't live on the rt device. */ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -815,10 +846,7 @@ xchk_metadata_inode_forks( } /* Invoke the data fork scrubber. 
*/ - smtype = sc->sm->sm_type; - sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; - error = xchk_bmap_data(sc); - sc->sm->sm_type = smtype; + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; @@ -833,7 +861,7 @@ xchk_metadata_inode_forks( xchk_ino_set_corrupt(sc, sc->ip->i_ino); } - return error; + return 0; } -- cgit From 2653d53345bda90604f673bb211dd060a5a5c232 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 19:20:20 -0800 Subject: xfs: fix incorrect error-out in xfs_remove Clean up resources if resetting the dotdot entry doesn't succeed. Observed through code inspection. Fixes: 5838d0356bb3 ("xfs: reset child dir '..' entry when unlinking child") Signed-off-by: Darrick J. Wong Reviewed-by: Andrey Albershteyn --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index aa303be11576..d354ea2b74f9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2479,7 +2479,7 @@ xfs_remove( error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, tp->t_mountp->m_sb.sb_rootino, 0); if (error) - return error; + goto out_trans_cancel; } } else { /* -- cgit From 59f6ab40fd8735c9a1a15401610a31cc06a0bbd6 Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 16 Nov 2022 19:20:20 -0800 Subject: xfs: fix sb write verify for lazysbcount When lazysbcount is enabled, fsstress and loop mount/unmount tests report the following problems: XFS (loop0): SB summary counter sanity check failed XFS (loop0): Metadata corruption detected at xfs_sb_write_verify+0x13b/0x460, xfs_sb block 0x0 XFS (loop0): Unmount and run xfs_repair XFS (loop0): First 128 bytes of corrupted metadata buffer: 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00 XFSB.........(.. 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a i.|._.D..t....4Z 00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80 ..... .......... 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................ 00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00 ................ 00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00 ................ 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19 ................ XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply +0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580). Shutting down filesystem. XFS (loop0): Please unmount the filesystem and rectify the problem(s) XFS (loop0): log mount/recovery failed: error -117 XFS (loop0): log mount failed This corruption will shut down the file system, and the file system will no longer be mountable. The following script can reproduce the problem, but it may take a long time. #!/bin/bash device=/dev/sda testdir=/mnt/test round=0 function fail() { echo "$*" exit 1 } mkdir -p $testdir while [ $round -lt 10000 ] do echo "******* round $round ********" mkfs.xfs -f $device mount $device $testdir || fail "mount failed!" fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null & sleep 4 killall -w fsstress umount $testdir xfs_repair -e $device > /dev/null if [ $? -eq 2 ];then echo "ERR CODE 2: Dirty log exception during repair." exit 1 fi round=$(($round+1)) done With lazysbcount enabled, there is no additional lock protection for reading m_ifree and m_icount in xfs_log_sb(); if another CPU modifies m_ifree, this can make m_ifree greater than m_icount.
For example, consider the following sequence, where ifreedelta is positive: CPU0 CPU1 xfs_log_sb xfs_trans_unreserve_and_mod_sb ---------- ------------------------------ percpu_counter_sum(&mp->m_icount) percpu_counter_add_batch(&mp->m_icount, idelta, XFS_ICOUNT_BATCH) percpu_counter_add(&mp->m_ifree, ifreedelta); percpu_counter_sum(&mp->m_ifree) After this, an incorrect inode count (sb_ifree > sb_icount) will be written to the log. In the subsequent writing of the sb, the incorrect inode count (sb_ifree > sb_icount) will fail to pass the boundary check in xfs_validate_sb_write(), which causes the file system to shut down. When lazysbcount is enabled, we don't need to guarantee that lazy sb counters are completely correct, but we do need to guarantee that sb_ifree <= sb_icount. On the other hand, the constraint that m_ifree <= m_icount must be satisfied any time that there /cannot/ be other threads allocating or freeing inode chunks. If the constraint is violated under these circumstances, sb_i{count,free} (the ondisk superblock inode counters) may be incorrect and need to be marked sick at unmount; the counts will be rebuilt on the next mount. Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks") Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_sb.c | 4 +++- fs/xfs/xfs_mount.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a20cade590e9..1eeecf2eb2a7 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -972,7 +972,9 @@ xfs_log_sb( */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_ifree = min_t(uint64_t, + percpu_counter_sum(&mp->m_ifree), + mp->m_sb.sb_icount); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e8bb3c2e847e..fb87ffb48f7f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -538,6 +538,20 @@ xfs_check_summary_counts( return 0; } +static void +xfs_unmount_check( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return; + + if (percpu_counter_sum(&mp->m_ifree) > + percpu_counter_sum(&mp->m_icount)) { + xfs_alert(mp, "ifree/icount mismatch at unmount"); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); + } +} + /* * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, @@ -1077,6 +1091,7 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); + xfs_unmount_check(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); -- cgit From 64c80dfd04d1dd2ecf550542c8f3f41b54b20207 Mon Sep 17 00:00:00 2001 From: Lukas Herbolt Date: Wed, 16 Nov 2022 19:20:21 -0800 Subject: xfs: Print XFS UUID on mount and umount events. As of now only device names are printed out over __xfs_printk(). The device names are not persistent across reboots, which, when searching for the origin of a corruption, adds the extra task of properly identifying the devices. This patch adds the XFS UUID to every mount/umount event, which will make the identification much easier. Signed-off-by: Lukas Herbolt [sandeen: rebase onto current upstream kernel] Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J.
Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_log.c | 10 ++++++---- fs/xfs/xfs_super.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f02a0dd522b3..0141d9907d31 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -644,12 +644,14 @@ xfs_log_mount( int min_logfsbs; if (!xfs_has_norecovery(mp)) { - xfs_notice(mp, "Mounting V%d Filesystem", - XFS_SB_VERSION_NUM(&mp->m_sb)); + xfs_notice(mp, "Mounting V%d Filesystem %pU", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); } else { xfs_notice(mp, -"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", - XFS_SB_VERSION_NUM(&mp->m_sb)); +"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); ASSERT(xfs_is_readonly(mp)); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ee4b429a2f2c..0c4b73e9b29d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1110,7 +1110,7 @@ xfs_fs_put_super( if (!sb->s_fs_info) return; - xfs_notice(mp, "Unmounting Filesystem"); + xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); -- cgit From 7e8e5cc818bd93ee7f2699676f2e5b30d26d83f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 06:46:52 -0500 Subject: filelock: WARN_ON_ONCE when ->fl_file and filp don't match vfs_lock_file, vfs_test_lock and vfs_cancel_lock all take both a struct file argument and a file_lock. The file_lock has a fl_file field in it, however, and it _must_ match the file passed in. While most of the locks.c routines use the separately-passed file argument, some filesystems rely on fl_file being filled out correctly. I'm working on a patch series to remove the redundant argument from these routines, but for now, let's ensure that the callers always set this properly by issuing a WARN_ON_ONCE if they ever don't match. Cc: Chuck Lever Cc: Trond Myklebust Signed-off-by: Jeff Layton --- fs/locks.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 607f94a0e789..5876c8ff0edc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2146,6 +2146,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); @@ -2295,6 +2296,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else @@ -2663,6 +2665,7 @@ void locks_remove_file(struct file *filp) */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; -- cgit From dc64cc12bcd14219afb91b55d23192c3eb45aa43 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Mon, 14 Nov 2022 22:17:57 -0500 Subject: binfmt_elf: replace IS_ERR() with IS_ERR_VALUE() Avoid typecasts that are needed for IS_ERR() and use IS_ERR_VALUE() instead.
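For illustration, a minimal sketch of the pattern being replaced (the variable and its source are hypothetical, not lifted from fs/binfmt_elf.c):

```c
/* Hypothetical: "addr" holds either a mapped address or a negated
 * errno packed into an unsigned long, as elf_map() returns. */
unsigned long addr;
int retval;

/* Old style: cast to a pointer only so IS_ERR() can test it. */
if (IS_ERR((void *)addr))
	retval = PTR_ERR((void *)addr);

/* New style: IS_ERR_VALUE() tests the unsigned long directly. */
if (IS_ERR_VALUE(addr))
	retval = (int)addr;
```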
Signed-off-by: Bo Liu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221115031757.2426-1-liubo03@inspur.com --- fs/binfmt_elf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 528e2ac8931f..d9af34f816a9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1160,7 +1160,7 @@ out_free_interp: error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { - retval = IS_ERR((void *)error) ? + retval = IS_ERR_VALUE(error) ? PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } @@ -1245,7 +1245,7 @@ out_free_interp: interpreter, load_bias, interp_elf_phdata, &arch_state); - if (!IS_ERR((void *)elf_entry)) { + if (!IS_ERR_VALUE(elf_entry)) { /* * load_elf_interp() returns relocation * adjustment @@ -1254,7 +1254,7 @@ out_free_interp: elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { - retval = IS_ERR((void *)elf_entry) ? + retval = IS_ERR_VALUE(elf_entry) ? (int)elf_entry : -EINVAL; goto out_free_dentry; } -- cgit From cd57e443831d8eeb083c7165bce195d886e216d4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 16:31:55 -0800 Subject: exec: Remove FOLL_FORCE for stack setup It does not appear that FOLL_FORCE should be needed for setting up the stack pages. They are allocated using the nascent bprm->vma, which was newly created with VM_STACK_FLAGS, which an arch can override, but they all appear to include VM_WRITE | VM_MAYWRITE. Remove FOLL_FORCE. Cc: Eric Biederman Cc: David Hildenbrand Cc: Linus Torvalds Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Link: https://lore.kernel.org/lkml/202211171439.CDE720EAD@keescook/ Signed-off-by: Kees Cook --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 9585bc1bc970..870a707b5d3b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -200,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; - unsigned int gup_flags = FOLL_FORCE; + unsigned int gup_flags = 0; #ifdef CONFIG_STACK_GROWSUP if (write) { -- cgit From 8032bf1233a74627ce69b803608e650f3f35971c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 9 Oct 2022 20:44:02 -0600 Subject: treewide: use get_random_u32_below() instead of deprecated function This is a simple mechanical transformation done by: @@ expression E; @@ - prandom_u32_max + get_random_u32_below (E) Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Acked-by: Darrick J. Wong # for xfs Reviewed-by: SeongJae Park # for damon Reviewed-by: Jason Gunthorpe # for infiniband Reviewed-by: Russell King (Oracle) # for arm Acked-by: Ulf Hansson # for mmc Signed-off-by: Jason A.
Donenfeld --- fs/ceph/inode.c | 2 +- fs/ceph/mdsmap.c | 2 +- fs/ext2/ialloc.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/super.c | 5 ++--- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 8 ++++---- fs/ubifs/debug.c | 8 ++++---- fs/ubifs/lpt_commit.c | 14 +++++++------- fs/ubifs/tnc_commit.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_error.c | 2 +- 13 files changed, 26 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4af5e55abc15..fb255988dee8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32_max(nsplits); + i = get_random_u32_below(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 3fbabc98e1f7..7dac21ee6ce7 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) return -1; /* pick */ - n = prandom_u32_max(n); + n = get_random_u32_below(n); for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index f4944c4dee60..78b8686d9a4a 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -277,7 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - parent_group = prandom_u32_max(ngroups); + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext2_get_group_desc (sb, group, NULL); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e9bc46684106..9fc1af8e19a3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -465,7 +465,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); parent_group = hinfo.hash % ngroups; } else - parent_group = prandom_u32_max(ngroups); + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7cdd2138c897..63ef74eb8091 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3778,7 +3778,7 @@ cont_thread: } if (!progress) { elr->lr_next_sched = jiffies + - prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ); + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; @@ -3925,8 +3925,7 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. 
*/ - elr->lr_next_sched = jiffies + prandom_u32_max( - EXT4_DEF_LI_MAX_START_DELAY * HZ); + elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4546e01b2ee0..536d332d9e2e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -282,7 +282,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* let's select beginning hot/small space first in no_heap mode*/ if (f2fs_need_rand_seg(sbi)) - p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf3d3fa4363..334415d946f8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2534,7 +2534,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) @@ -2588,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - prandom_u32_max(sbi->max_fragment_chunk) + 1; + get_random_u32_below(sbi->max_fragment_chunk) + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2625,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - prandom_u32_max(sbi->max_fragment_chunk) + 1; + get_random_u32_below(sbi->max_fragment_chunk) + 1; seg->next_blkoff += - prandom_u32_max(sbi->max_fragment_hole) + 1; + get_random_u32_below(sbi->max_fragment_hole) + 1; } } } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 3f128b9fdfbb..9c9d3f0e36a4 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2467,7 +2467,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!(prandom_u32_max(out_of) + 1 <= n); + return !!(get_random_u32_below(out_of) + 1 <= n); } @@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail within 1 minute */ - delay = prandom_u32_max(60000); + delay = get_random_u32_below(60000); d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn(c, "failing after %lums", delay); } else { d->pc_delay = 2; - delay = prandom_u32_max(10000); + delay = get_random_u32_below(10000); /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn(c, "failing after %lu calls", delay); @@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32_max(len); + from = get_random_u32_below(len); /* Corruption span max to end of write unit */ to = min(len, ALIGN(from + 1, c->max_write_size)); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index cfbc31f709f4..c4d079328b92 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) 
return 0; - if (prandom_u32_max(4)) + if (get_random_u32_below(4)) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 01362ad5f804..a55e04822d16 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !prandom_u32_max(8)) + if (dbg_is_chk_index(c) && !get_random_u32_below(8)) return -ENOSPC; return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index de79f5d07f65..989cf341779b 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1516,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm. */ - if (prandom_u32_max(2)) + if (get_random_u32_below(2)) return 0; #endif diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 94db50eb706a..5118dedf9267 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc( /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32_max(2); + do_sparse = get_random_u32_below(2); #endif /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c6b2aabd6f18..822e6a0e9d1a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -279,7 +279,7 @@ xfs_errortag_test( ASSERT(error_tag < XFS_ERRTAG_MAX); randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32_max(randfactor)) + if (!randfactor || get_random_u32_below(randfactor)) return false; xfs_warn_ratelimited(mp, -- cgit From d247aabd391c3b2fa4f26874ed9136a7a142fcfd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 9 Oct 2022 20:44:02 -0600 Subject: treewide: use get_random_u32_{above,below}() instead of manual loop These cases were done with this Coccinelle: @@ expression E; identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (I > E); + I = get_random_u32_below(E + 1); @@ expression E; identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (I >= E); + I = get_random_u32_below(E); @@ expression E; identifier I; @@ - do { ... when != I - I = get_random_u32(); ... 
when != I - } while (I < E); + I = get_random_u32_above(E - 1); @@ expression E; identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (I <= E); + I = get_random_u32_above(E); @@ identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (!I); + I = get_random_u32_above(0); @@ identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (I == 0); + I = get_random_u32_above(0); @@ expression E; @@ - E + 1 + get_random_u32_below(U32_MAX - E) + get_random_u32_above(E) Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jason A. Donenfeld --- fs/ext4/mmp.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 588cb09c5291..4681fff6665f 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -262,13 +262,7 @@ void ext4_stop_mmpd(struct ext4_sb_info *sbi) */ static unsigned int mmp_new_seq(void) { - u32 new_seq; - - do { - new_seq = get_random_u32(); - } while (new_seq > EXT4_MMP_SEQ_MAX); - - return new_seq; + return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1); } /* -- cgit From e8a533cbeb79809206f8724e89961e0079508c3c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 9 Oct 2022 20:44:02 -0600 Subject: treewide: use get_random_u32_inclusive() when possible These cases were done with this Coccinelle: @@ expression H; expression L; @@ - (get_random_u32_below(H) + L) + get_random_u32_inclusive(L, H + L - 1) @@ expression H; expression L; expression E; @@ get_random_u32_inclusive(L, H - + E - - E ) @@ expression H; expression L; expression E; @@ get_random_u32_inclusive(L, H - - E - + E ) @@ expression H; expression L; expression E; expression F; @@ get_random_u32_inclusive(L, H - - E + F - + E ) @@ expression H; expression L; expression E; expression F; @@ get_random_u32_inclusive(L, H - + E + F - - E ) And then subsequently cleaned up by hand, with several automatic cases rejected if it didn't make sense contextually. Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe # for infiniband Signed-off-by: Jason A. 
Donenfeld --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 334415d946f8..b304692c0cf5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2588,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - get_random_u32_below(sbi->max_fragment_chunk) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2625,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - get_random_u32_below(sbi->max_fragment_chunk) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += - get_random_u32_below(sbi->max_fragment_hole) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_hole); } } } -- cgit From 8603b6f58637ce196d68f7749633ea81af196d66 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Sat, 3 Sep 2022 08:43:30 +0200 Subject: core_pattern: add CPU specifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Statistically, in a large deployment regular segfaults may indicate a CPU issue. Currently, it is not possible to find out what CPU the segfault happened on. There are at least two attempts to improve segfault logging in this regard, but they do not help in case the logs rotate. Hence, let's make sure it is possible to permanently record the CPU the task ran on using a new core_pattern specifier. Link: https://lkml.kernel.org/r/20220903064330.20772-1-oleksandr@redhat.com Signed-off-by: Oleksandr Natalenko Suggested-by: Renaud Métrich Reviewed-by: Oleg Nesterov Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Grzegorz Halat Cc: "Guilherme G. Piccoli" Cc: "Huang, Ying" Cc: Jason A. Donenfeld Cc: Joel Savitz Cc: Jonathan Corbet Cc: Kees Cook Cc: Laurent Dufour Cc: Luis Chamberlain Cc: Rob Herring Cc: Stephen Kitt Cc: Will Deacon Cc: Xiaoming Ni Signed-off-by: Andrew Morton --- fs/coredump.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..3e8630c8d627 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -325,6 +325,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; + /* CPU the task ran on */ + case 'C': + err = cn_printf(cn, "%d", cprm->cpu); + break; default: break; } @@ -534,6 +538,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) */ .mm_flags = mm->flags, .vma_meta = NULL, + .cpu = raw_smp_processor_id(), }; audit_core_dumps(siginfo->si_signo); -- cgit From 71dd5d651be7c99c401ad642e9f3cba88b956bce Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:44 +0800 Subject: ocfs2/cluster: use bitmap API instead of hand-writing it Use bitmap_zero/bitmap_copy/bitmap_equal directly for bitmap operations.
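For context, a hedged sketch of the before/after pattern (illustrative only; the real call sites operate on the o2hb/o2nm node maps shown in the diff below):

```c
/* A node map is an array of unsigned longs holding O2NM_MAX_NODES bits. */
unsigned long a[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long b[BITS_TO_LONGS(O2NM_MAX_NODES)];
bool same;

/* Hand-written, byte-based operations... */
memset(a, 0, sizeof(a));
same = !memcmp(a, b, sizeof(a));

/* ...and their bit-based bitmap API equivalents: */
bitmap_zero(a, O2NM_MAX_NODES);
same = bitmap_equal(a, b, O2NM_MAX_NODES);
```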
Link: https://lkml.kernel.org/r/20221007124846.186453-1-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 20 ++++++++++---------- fs/ocfs2/cluster/nodemanager.c | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index b13d344d40b6..8fe6031f60e3 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg) /* negotiate timeout must be less than write timeout. */ schedule_delayed_work(®->hr_nego_timeout_work, msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); - memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap)); + bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES); } static void o2hb_disarm_timeout(struct o2hb_region *reg) @@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work) config_item_name(®->hr_item), reg->hr_bdev); set_bit(master_node, reg->hr_nego_node_bitmap); } - if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, - sizeof(reg->hr_nego_node_bitmap))) { + if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, + O2NM_MAX_NODES)) { /* check negotiate bitmap every second to do timeout * approve decision. */ @@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * live nodes heartbeat on it. In other words, the region has been * added to all nodes. */ - if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, - sizeof(o2hb_live_node_bitmap))) + if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + O2NM_MAX_NODES)) goto unlock; printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", @@ -1437,11 +1437,11 @@ void o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); - memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); - memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); - memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES); + bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); o2hb_dependent_users = 0; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 27fee68f860a..2f61d39e4e50 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes) return -EINVAL; read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES); read_unlock(&cluster->cl_nodes_lock); return 0; -- cgit From 6d4a93b6809270e5c7d7216b20f4ef2e88213eb3 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:45 +0800 Subject: ocfs2: use bitmap API in fill_node_map Pass bits directly into fill_node_map helper and use bitmap API directly to simplify code. 
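A short note on the interface change, with a hedged usage sketch (helper and constant names as in the diff; surrounding code elided): the helper used to take a byte count that fed memcpy() sizes, and now takes a bit count that feeds bitmap_copy() directly, so callers switch from sizeof() to O2NM_MAX_NODES.

```c
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];

o2hb_fill_node_map(map, sizeof(map));      /* before: bytes (32 on 64-bit) */
o2hb_fill_node_map(map, O2NM_MAX_NODES);   /* after: bits */
```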
Link: https://lkml.kernel.org/r/20221007124846.186453-2-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 18 ++++++++---------- fs/ocfs2/cluster/heartbeat.h | 2 +- fs/ocfs2/cluster/netdebug.c | 2 +- fs/ocfs2/cluster/tcp.c | 6 ++---- fs/ocfs2/dlm/dlmdomain.c | 2 +- fs/ocfs2/stack_o2cb.c | 6 +++--- 6 files changed, 16 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 8fe6031f60e3..60b97c92e2b2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work) if (reg->hr_last_hb_status) return; - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); /* lowest node as master node to make negotiate decision. */ master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); @@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * If a node is not configured but is in the livemap, we still need * to read the slot so as to be able to remove it from the livemap. */ - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); i = -1; while ((i = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { @@ -1450,23 +1450,21 @@ void o2hb_init(void) /* if we're already in a callback then we're already serialized by the sem */ static void o2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) + unsigned int bits) { - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &o2hb_live_node_bitmap, bytes); + bitmap_copy(map, o2hb_live_node_bitmap, bits); } /* * get a map of all nodes that are heartbeating in any regions */ -void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +void o2hb_fill_node_map(unsigned long *map, unsigned int bits) { /* callers want to serialize this map and callbacks so that they * can trust that they don't miss nodes coming to the party */ down_read(&o2hb_callback_sem); spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(map, bytes); + o2hb_fill_node_map_from_callback(map, bits); spin_unlock(&o2hb_live_lock); up_read(&o2hb_callback_sem); } @@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num) unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, @@ -2477,7 +2475,7 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 1d4100abf6f8..8ef8c1b9eeb7 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid, void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); 
void o2hb_fill_node_map(unsigned long *map, - unsigned bytes); + unsigned int bits); void o2hb_exit(void); void o2hb_init(void); int o2hb_check_node_heartbeating_no_sem(u8 node_num); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 7524994e3199..35c05c18de59 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len) unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; int i = -1, out = 0; - o2net_fill_node_map(map, sizeof(map)); + o2net_fill_node_map(map, O2NM_MAX_NODES); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index f660c0dbdb63..6f5a3fb97c7f 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, } /* Get a map of all nodes to which this node is currently connected to */ -void o2net_fill_node_map(unsigned long *map, unsigned bytes) +void o2net_fill_node_map(unsigned long *map, unsigned int bits) { struct o2net_sock_container *sc; int node, ret; - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); + bitmap_zero(map, bits); for (node = 0; node < O2NM_MAX_NODES; ++node) { if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) continue; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c4eccd499db8..f46941ff665d 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1604,7 +1604,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) /* group sem locking should work for us here -- we're already * registered for heartbeat events so filling this should be * atomic wrt getting those handlers called. */ - o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 88f75f7f02d7..c973c03f6fd8 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -273,17 +273,17 @@ static int o2cb_cluster_check(void) */ #define O2CB_MAP_STABILIZE_COUNT 60 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { - o2hb_fill_node_map(hbmap, sizeof(hbmap)); + o2hb_fill_node_map(hbmap, O2NM_MAX_NODES); if (!test_bit(node_num, hbmap)) { printk(KERN_ERR "o2cb: %s heartbeat has not been " "started.\n", (o2hb_global_heartbeat_active() ? "Global" : "Local")); return -EINVAL; } - o2net_fill_node_map(netmap, sizeof(netmap)); + o2net_fill_node_map(netmap, O2NM_MAX_NODES); /* Force set the current node to allow easy compare */ set_bit(node_num, netmap); - if (!memcmp(hbmap, netmap, sizeof(hbmap))) + if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES)) return 0; if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); -- cgit From b270f492dc45b59d573cd80795e8b4781e959836 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:46 +0800 Subject: ocfs2/dlm: use bitmap API instead of hand-writing it Use bitmap_zero/bitmap_copy/bitmap_equal directly for bitmap operations.
Link: https://lkml.kernel.org/r/20221007124846.186453-3-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmcommon.h | 2 +- fs/ocfs2/dlm/dlmdomain.c | 17 +++++++---------- fs/ocfs2/dlm/dlmmaster.c | 30 +++++++++++++++--------------- fs/ocfs2/dlm/dlmrecovery.c | 2 +- 4 files changed, 24 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fd2022712167..20f790a47484 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err) static inline void dlm_node_iter_init(unsigned long *map, struct dlm_node_iter *iter) { - memcpy(iter->node_map, map, sizeof(iter->node_map)); + bitmap_copy(iter->node_map, map, O2NM_MAX_NODES); iter->curnode = -1; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index f46941ff665d..5c04dde99981 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm, spin_lock(&dlm->spinlock); /* For now, we restart the process if the node maps have * changed at all */ - ret = memcmp(ctxt->live_map, dlm->live_nodes_map, - sizeof(dlm->live_nodes_map)); + ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map, + O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); if (ret) @@ -1607,10 +1607,8 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); - memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); - + bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES); __dlm_set_joining_node(dlm, dlm->node_num); - spin_unlock(&dlm->spinlock); node = -1; @@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) * yes_resp_map. Copy that into our domain map and send a join * assert message to clean up everyone elses state. 
*/ spin_lock(&dlm->spinlock); - memcpy(dlm->domain_map, ctxt->yes_resp_map, - sizeof(ctxt->yes_resp_map)); + bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES); set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); @@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", dlm->recovery_map, &(dlm->recovery_map[0])); - memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); - memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); - memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES); + bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES); + bitmap_zero(dlm->domain_map, O2NM_MAX_NODES); dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 227da5b1b6ab..d610da8e2f24 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->type = type; INIT_HLIST_NODE(&mle->master_hash_node); INIT_LIST_HEAD(&mle->hb_events); - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); spin_lock_init(&mle->spinlock); init_waitqueue_head(&mle->wq); atomic_set(&mle->woken, 0); kref_init(&mle->mle_refs); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; mle->inuse = 0; @@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, atomic_inc(&dlm->mle_cur_count[mle->type]); /* copy off the node_map and register hb callbacks on our copy */ - memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); - memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES); + bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES); clear_bit(dlm->node_num, mle->vote_map); clear_bit(dlm->node_num, mle->node_map); @@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); - memset(res->refmap, 0, sizeof(res->refmap)); + bitmap_zero(res->refmap, O2NM_MAX_NODES); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -1036,10 +1036,10 @@ recheck: spin_lock(&mle->spinlock); m = mle->master; - map_changed = (memcmp(mle->vote_map, mle->node_map, - sizeof(mle->vote_map)) != 0); - voting_done = (memcmp(mle->vote_map, mle->response_map, - sizeof(mle->vote_map)) == 0); + map_changed = !bitmap_equal(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); + voting_done = bitmap_equal(mle->vote_map, mle->response_map, + O2NM_MAX_NODES); /* restart if we hit any errors */ if (map_changed) { @@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* now blank out everything, as if we had never * contacted anyone */ - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); /* reset the vote_map to the current node_map */ - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); + bitmap_copy(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); /* put myself into the maybe map */ if (mle->type != DLM_MLE_BLOCK) set_bit(dlm->node_num, mle->maybe_map); @@ -2094,7 +2094,7 @@ static void 
dlm_assert_master_worker(struct dlm_work_item *item, void *data) flags = item->u.am.flags; spin_lock(&dlm->spinlock); - memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); clear_bit(dlm->node_num, nodemap); @@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = 0; } - memset(iter.node_map, 0, sizeof(iter.node_map)); + bitmap_zero(iter.node_map, O2NM_MAX_NODES); set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 52ad342fec3e..50da8af988c1 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_reco_node_data *ndata; spin_lock(&dlm->spinlock); - memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES); /* nodes can only be removed (by dying) after dropping * this lock, and death will be trapped later, so this should do */ spin_unlock(&dlm->spinlock); -- cgit From 12b9d301ff73122aebd78548fa4c04ca69ed78fe Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Thu, 29 Sep 2022 12:29:33 +0800 Subject: proc/vmcore: fix potential memory leak in vmcore_init() Patch series "Some minor cleanup patches resent". The first three patches are trivial cleanup patches. And for the patch "kexec: replace crash_mem_range with range", I got an ibm-p9wr ppc64le system to test; it works well. This patch (of 4): elfcorehdr_alloc() allocates a memory chunk for elfcorehdr_addr with kzalloc(). If is_vmcore_usable() returns false, elfcorehdr_addr is a predefined value. If parse_crash_elf_headers() gets some error and returns a negative value, the elfcorehdr_addr should be released with elfcorehdr_free(). Fix it by calling elfcorehdr_free() when parse_crash_elf_headers() fails. Link: https://lkml.kernel.org/r/20220929042936.22012-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20220929042936.22012-2-bhe@redhat.com Signed-off-by: Jianglei Nie Signed-off-by: Baoquan He Acked-by: Baoquan He Cc: Benjamin Herrenschmidt Cc: Chen Lifu Cc: "Eric W . Biederman" Cc: Li Chen Cc: Michael Ellerman Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: ye xingchen Cc: Zeal Robot Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f2aa86c421f2..74747571d58e 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1567,6 +1567,7 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { + elfcorehdr_free(elfcorehdr_addr); pr_warn("Kdump: vmcore not initialized\n"); return rc; } -- cgit From f1f1f2569901ec5b9d425f2e91c09a0e320768f3 Mon Sep 17 00:00:00 2001 From: Ivan Babrou Date: Thu, 22 Sep 2022 15:40:26 -0700 Subject: proc: report open files as size in stat() for /proc/pid/fd Many monitoring tools include open file count as a metric. Currently the only way to get this number is to enumerate the files in /proc/pid/fd. The problem with the current approach is that it does many things people generally don't care about when they need one number for a metric. In our tests for cadvisor, which reports open file counts per cgroup, we observed that reading the number of open files is slow.
Out of 35.23% of CPU time spent in `proc_readfd_common`, we see 29.43% spent in `proc_fill_cache`, which is responsible for filling dentry info. Some of this extra time is spinlock contention, but it's a contention for the lock we don't want to take to begin with. We considered putting the number of open files in /proc/pid/status. Unfortunately, counting the number of fds involves iterating the open_files bitmap, which has a linear complexity in proportion with the number of open files (bitmap slots really, but it's close). We don't want to make /proc/pid/status any slower, so instead we put this info in /proc/pid/fd as a size member of the stat syscall result. Previously the reported number was zero, so there's very little risk of breaking anything, while still providing a somewhat logical way to count the open files with a fallback if it's zero. RFC for this patch included iterating open fds under RCU. Thanks to Frank Hofmann for the suggestion to use the bitmap instead. Previously: ``` $ sudo stat /proc/1/fd | head -n2 File: /proc/1/fd Size: 0 Blocks: 0 IO Block: 1024 directory ``` With this patch: ``` $ sudo stat /proc/1/fd | head -n2 File: /proc/1/fd Size: 65 Blocks: 0 IO Block: 1024 directory ``` Correctness check: ``` $ sudo ls /proc/1/fd | wc -l 65 ``` I added the docs for /proc/<pid>/fd while I'm at it. [ivan@cloudflare.com: use bitmap_weight() to count the bits] Link: https://lkml.kernel.org/r/20221018045844.37697-1-ivan@cloudflare.com [akpm@linux-foundation.org: include linux/bitmap.h for bitmap_weight()] [ivan@cloudflare.com: return errno from proc_fd_getattr() instead of setting negative size] Link: https://lkml.kernel.org/r/20221024173140.30673-1-ivan@cloudflare.com Link: https://lkml.kernel.org/r/20220922224027.59266-1-ivan@cloudflare.com Signed-off-by: Ivan Babrou Cc: Alexey Dobriyan Cc: Al Viro Cc: Christoph Anton Mitterer Cc: David Hildenbrand Cc: David Laight Cc: Ivan Babrou Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kalesh Singh Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/proc/fd.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'fs') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 913bef0d2a36..fc46d6fe080c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -7,6 +7,7 @@ #include #include #include +#include <linux/bitmap.h> #include #include #include @@ -279,6 +280,30 @@ out: return 0; } +static int proc_readfd_count(struct inode *inode, loff_t *count) +{ + struct task_struct *p = get_proc_task(inode); + struct fdtable *fdt; + + if (!p) + return -ENOENT; + + task_lock(p); + if (p->files) { + rcu_read_lock(); + + fdt = files_fdtable(p->files); + *count = bitmap_weight(fdt->open_fds, fdt->max_fds); + + rcu_read_unlock(); + } + task_unlock(p); + + put_task_struct(p); + + return 0; +} + static int proc_readfd(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); @@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } +static int proc_fd_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + int rv = 0; + + generic_fillattr(&init_user_ns, inode, stat); + + /* If it's a directory, put the number of open fds there */ + if (S_ISDIR(inode->i_mode)) { + rv = proc_readfd_count(inode, &stat->size); + if (rv < 0) + return rv; + } + + return rv; +} + const struct inode_operations
proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, + .getattr = proc_fd_getattr, .setattr = proc_setattr, }; -- cgit From 941baf6febaa88c057192084ca280d976c7c7239 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 8 Sep 2022 21:21:54 +0300 Subject: proc: give /proc/cmdline size Most /proc files don't have length (in fstat sense). This leads to inefficiencies when reading such files with APIs commonly found in modern programming languages. They open the file, then fstat the descriptor, get st_size == 0, and either assume the file is empty or start reading without knowing the target size. cat(1) does OK because it uses a large enough buffer by default. But naive programs copy-pasted from SO aren't: let mut f = std::fs::File::open("/proc/cmdline").unwrap(); let mut buf: Vec<u8> = Vec::new(); f.read_to_end(&mut buf).unwrap(); will result in openat(AT_FDCWD, "/proc/cmdline", O_RDONLY|O_CLOEXEC) = 3 statx(0, NULL, AT_STATX_SYNC_AS_STAT, STATX_ALL, NULL) = -1 EFAULT (Bad address) statx(3, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_BASIC_STATS|STATX_MNT_ID, stx_attributes=0, stx_mode=S_IFREG|0444, stx_size=0, ...}) = 0 lseek(3, 0, SEEK_CUR) = 0 read(3, "BOOT_IMAGE=(hd3,gpt2)/vmlinuz-5.", 32) = 32 read(3, "19.6-100.fc35.x86_64 root=/dev/m", 32) = 32 read(3, "apper/fedora_localhost--live-roo"..., 64) = 64 read(3, "ocalhost--live-swap rd.lvm.lv=fe"..., 128) = 116 read(3, "", 12) open/stat is OK, lseek looks silly but there are 3 unnecessary reads because Rust starts with 32 bytes per Vec and grows from there. In case of /proc/cmdline, the length is known precisely. Make variables readonly while I'm at it. P.S.: I tried to scp /proc/cpuinfo today and got an empty file but this is a separate story. Link: https://lkml.kernel.org/r/YxoywlbM73JJN3r+@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/cmdline.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index fa762c5fbcb2..91fe1597af7b 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -3,6 +3,7 @@ #include #include #include +#include "internal.h" static int cmdline_proc_show(struct seq_file *m, void *v) { @@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v) static int __init proc_cmdline_init(void) { - proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde->size = saved_command_line_len + 1; return 0; } fs_initcall(proc_cmdline_init); -- cgit From 610a2a3d7d8be3537458a378ec69396a76c385b6 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 27 Oct 2022 13:43:05 +0900 Subject: nilfs2: fix shift-out-of-bounds/overflow in nilfs_sb2_bad_offset() Patch series "nilfs2: fix UBSAN shift-out-of-bounds warnings on mount time". The first patch fixes a bug reported by syzbot, and the second one fixes the remaining bug of the same kind. Although they are triggered by the same super block data anomaly, I divided it into the above two because the details of the issues and how to fix them are different. Both are required to eliminate the shift-out-of-bounds issues at mount time.
This patch (of 2): If the block size exponent information written in an on-disk superblock is corrupted, nilfs_sb2_bad_offset helper function can trigger shift-out-of-bounds warning followed by a kernel panic (if panic_on_warn is set): shift exponent 38983 is too large for 64-bit type 'unsigned long long' Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 ubsan_epilogue lib/ubsan.c:151 [inline] __ubsan_handle_shift_out_of_bounds+0x33d/0x3b0 lib/ubsan.c:322 nilfs_sb2_bad_offset fs/nilfs2/the_nilfs.c:449 [inline] nilfs_load_super_block+0xdf5/0xe00 fs/nilfs2/the_nilfs.c:523 init_nilfs+0xb7/0x7d0 fs/nilfs2/the_nilfs.c:577 nilfs_fill_super+0xb1/0x5d0 fs/nilfs2/super.c:1047 nilfs_mount+0x613/0x9b0 fs/nilfs2/super.c:1317 ... In addition, since nilfs_sb2_bad_offset() performs multiplication without considering the upper bound, the computation may overflow if the disk layout parameters are not normal. This fixes these issues by inserting preliminary sanity checks for those parameters and by converting the comparison from one involving multiplication and left bit-shifting to one using division and right bit-shifting. Link: https://lkml.kernel.org/r/20221027044306.42774-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20221027044306.42774-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+e91619dd4c11c4960706@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/the_nilfs.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 3b4a079c9617..d588816fdf2f 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "nilfs.h" #include "segment.h" @@ -443,11 +444,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) return crc == le32_to_cpu(sbp->s_sum); } -static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +/** + * nilfs_sb2_bad_offset - check the location of the second superblock + * @sbp: superblock raw data buffer + * @offset: byte offset of second superblock calculated from device size + * + * nilfs_sb2_bad_offset() checks if the position on the second + * superblock is valid or not based on the filesystem parameters + * stored in @sbp. If @offset points to a location within the segment + * area, or if the parameters themselves are not normal, it is + * determined to be invalid. + * + * Return Value: true if invalid, false if valid. 
+ */ +static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { - return offset < ((le64_to_cpu(sbp->s_nsegments) * - le32_to_cpu(sbp->s_blocks_per_segment)) << - (le32_to_cpu(sbp->s_log_block_size) + 10)); + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + u64 nsegments = le64_to_cpu(sbp->s_nsegments); + u64 index; + + if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS || + shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS) + return true; + + index = offset >> (shift_bits + BLOCK_SIZE_BITS); + do_div(index, blocks_per_segment); + return index < nsegments; } static void nilfs_release_super_block(struct the_nilfs *nilfs) -- cgit From ebeccaaef67a4895d2496ab8d9c2fb8d89201211 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 27 Oct 2022 13:43:06 +0900 Subject: nilfs2: fix shift-out-of-bounds due to too large exponent of block size If field s_log_block_size of superblock data is corrupted and too large, init_nilfs() and load_nilfs() still can trigger a shift-out-of-bounds warning followed by a kernel panic (if panic_on_warn is set): shift exponent 38973 is too large for 32-bit type 'int' Call Trace: dump_stack_lvl+0xcd/0x134 ubsan_epilogue+0xb/0x50 __ubsan_handle_shift_out_of_bounds.cold.12+0x17b/0x1f5 init_nilfs.cold.11+0x18/0x1d [nilfs2] nilfs_mount+0x9b5/0x12b0 [nilfs2] ... This fixes the issue by adding and using a new helper function for getting block size with sanity check. Link: https://lkml.kernel.org/r/20221027044306.42774-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/the_nilfs.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d588816fdf2f..20ff02b4ef5d 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -193,6 +193,34 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_get_blocksize - get block size from raw superblock data + * @sb: super block instance + * @sbp: superblock raw data buffer + * @blocksize: place to store block size + * + * nilfs_get_blocksize() calculates the block size from the block size + * exponent information written in @sbp and stores it in @blocksize, + * or aborts with an error message if it's too large. + * + * Return Value: On success, 0 is returned. If the block size is too + * large, -EINVAL is returned. 
+ */ +static int nilfs_get_blocksize(struct super_block *sb, + struct nilfs_super_block *sbp, int *blocksize) +{ + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + + if (unlikely(shift_bits > + ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) { + nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB", + shift_bits); + return -EINVAL; + } + *blocksize = BLOCK_SIZE << shift_bits; + return 0; +} + /** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released @@ -246,11 +274,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); /* verify consistency between two super blocks */ - blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + err = nilfs_get_blocksize(sb, sbp[0], &blocksize); + if (err) + goto scan_error; + if (blocksize != nilfs->ns_blocksize) { nilfs_warn(sb, "blocksize differs between two super blocks (%d != %d)", blocksize, nilfs->ns_blocksize); + err = -EINVAL; goto scan_error; } @@ -609,9 +641,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; - blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); - if (blocksize < NILFS_MIN_BLOCK_SIZE || - blocksize > NILFS_MAX_BLOCK_SIZE) { + err = nilfs_get_blocksize(sb, sbp, &blocksize); + if (err) + goto failed_sbh; + + if (blocksize < NILFS_MIN_BLOCK_SIZE) { nilfs_err(sb, "couldn't mount because of unsupported filesystem blocksize %d", blocksize); -- cgit From 80f784098ff44e086f68f0e8c98b6c6da8702ec4 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 19 Oct 2022 11:09:29 +0800 Subject: squashfs: add the mount parameter theads= Patch series 'squashfs: Add the mount parameter "threads="'. Currently, Squashfs supports multiple decompressor parallel modes. However, this mode can be configured only during kernel building and does not support flexible selection during runtime. In the current patch set, the mount parameter "threads=" is added to allow users to select the parallel decompressor mode and configure the number of decompressors when mounting a file system. "threads=<single|multi|percpu|1|2|3|...>" The upper limit is num_online_cpus() * 2. This patch (of 2): Squashfs supports three decompression concurrency modes: Single-thread mode: concurrent reads are blocked and the memory overhead is small. Multi-thread mode/percpu mode: reduces concurrent read blocking but increases memory overhead. The corresponding scheme must be fixed at compile time. During mounting, the concurrent decompression mode cannot be adjusted based on file read blocking. The mount parameter threads= is added to select the concurrent decompression mode of a single SquashFS file system image.
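For illustration only, with CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT enabled, mode selection at mount time would look like this (device and mount point are hypothetical):

```
# one decompressor, smallest memory overhead
mount -t squashfs -o threads=single /dev/loop0 /mnt
# a pool of decompressors, allocated on demand
mount -t squashfs -o threads=multi /dev/loop0 /mnt
# one decompressor per CPU core
mount -t squashfs -o threads=percpu /dev/loop0 /mnt
```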
Link: https://lkml.kernel.org/r/20221019030930.130456-1-nixiaoming@huawei.com Link: https://lkml.kernel.org/r/20221019030930.130456-2-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Jianguo Chen Cc: Jubin Zhong Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/Kconfig | 39 ++++++++++++++++++++--- fs/squashfs/block.c | 2 +- fs/squashfs/decompressor.c | 2 +- fs/squashfs/decompressor_multi.c | 16 +++++++--- fs/squashfs/decompressor_multi_percpu.c | 23 +++++++++----- fs/squashfs/decompressor_single.c | 15 ++++++--- fs/squashfs/squashfs.h | 23 +++++++++++--- fs/squashfs/squashfs_fs_sb.h | 3 +- fs/squashfs/super.c | 56 +++++++++++++++++++++++++++++++-- 9 files changed, 147 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 916e78fabcaa..218bacdd4298 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -54,9 +54,36 @@ config SQUASHFS_FILE_DIRECT endchoice +config SQUASHFS_DECOMP_SINGLE + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI_PERCPU + depends on SQUASHFS + def_bool n + +config SQUASHFS_CHOICE_DECOMP_BY_MOUNT + bool "Select the parallel decompression mode during mount" + depends on SQUASHFS + default n + select SQUASHFS_DECOMP_SINGLE + select SQUASHFS_DECOMP_MULTI + select SQUASHFS_DECOMP_MULTI_PERCPU + help + Compile all parallel decompression modes and specify the + decompression mode by setting "threads=" during mount. + threads=<single|multi|percpu> + + default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE + choice - prompt "Decompressor parallelisation options" + prompt "Select decompression parallel mode at compile time" depends on SQUASHFS + depends on !SQUASHFS_CHOICE_DECOMP_BY_MOUNT help Squashfs now supports three parallelisation options for decompression. Each one exhibits various trade-offs between @@ -64,15 +91,17 @@ choice If in doubt, select "Single threaded compression" -config SQUASHFS_DECOMP_SINGLE +config SQUASHFS_COMPILE_DECOMP_SINGLE bool "Single threaded compression" + select SQUASHFS_DECOMP_SINGLE help Traditionally Squashfs has used single-threaded decompression. Only one block (data or metadata) can be decompressed at any one time. This limits CPU and memory usage to a minimum. -config SQUASHFS_DECOMP_MULTI +config SQUASHFS_COMPILE_DECOMP_MULTI bool "Use multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -85,8 +114,9 @@ config SQUASHFS_DECOMP_MULTI decompressors per core. It dynamically allocates decompressors on a demand basis. -config SQUASHFS_DECOMP_MULTI_PERCPU +config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU bool "Use percpu multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI_PERCPU help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -95,7 +125,6 @@ config SQUASHFS_DECOMP_MULTI_PERCPU This decompressor implementation uses a maximum of one decompressor per core. It uses percpu variables to ensure decompression is load-balanced across the cores.
- endchoice config SQUASHFS_XATTR diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 833aca92301f..bed3bb8b27fa 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -216,7 +216,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - res = squashfs_decompress(msblk, bio, offset, length, output); + res = msblk->thread_ops->decompress(msblk, bio, offset, length, output); } else { res = copy_bio_to_actor(bio, output, offset, length); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index d57bef91ab08..8893cb9b4198 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -134,7 +134,7 @@ void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) if (IS_ERR(comp_opts)) return comp_opts; - stream = squashfs_decompressor_create(msblk, comp_opts); + stream = msblk->thread_ops->create(msblk, comp_opts); if (IS_ERR(stream)) kfree(comp_opts); diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index db9f12a3ea05..eb25432bd149 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -29,12 +29,11 @@ #define MAX_DECOMPRESSOR (num_online_cpus() * 2) -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return MAX_DECOMPRESSOR; } - struct squashfs_stream { void *comp_opts; struct list_head strm_list; @@ -59,7 +58,7 @@ static void put_decomp_stream(struct decomp_stream *decomp_strm, wake_up(&stream->wait); } -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -103,7 +102,7 @@ out: } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; if (stream) { @@ -180,7 +179,7 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -195,3 +194,10 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, msblk->decompressor->name); return res; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index b881b9283b7f..1dfadf76ed9a 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -25,7 +25,7 @@ struct squashfs_stream { local_lock_t lock; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -59,7 +59,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream __percpu *percpu = (struct squashfs_stream __percpu *) msblk->stream; @@ -75,19 +75,21 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int 
squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; int res; - local_lock(&msblk->stream->lock); - stream = this_cpu_ptr(msblk->stream); + local_lock(&percpu->lock); + stream = this_cpu_ptr(percpu); res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); - local_unlock(&msblk->stream->lock); + local_unlock(&percpu->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", @@ -96,7 +98,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return num_possible_cpus(); } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 4eb3d083d45e..6f161887710b 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -24,7 +24,7 @@ struct squashfs_stream { struct mutex mutex; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -49,7 +49,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; @@ -59,7 +59,7 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -78,7 +78,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return 1; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_single = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9783e01c8100..a6164fdf9435 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -38,11 +38,24 @@ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ -extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); -extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, - int, int, struct squashfs_page_actor *); -extern int squashfs_max_decompressors(void); + +struct squashfs_decompressor_thread_ops { + void * (*create)(struct squashfs_sb_info *msblk, void *comp_opts); + void (*destroy)(struct squashfs_sb_info *msblk); + int (*decompress)(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor 
*output); + int (*max_decompressors)(void); +}; + +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_single; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu; +#endif /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1e90c2575f9b..f1e5dad8ae0a 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -53,7 +53,7 @@ struct squashfs_sb_info { __le64 *xattr_id_table; struct mutex meta_index_mutex; struct meta_index *meta_index; - struct squashfs_stream *stream; + void *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; @@ -66,5 +66,6 @@ struct squashfs_sb_info { int xattr_ids; unsigned int ids; bool panic_on_errors; + const struct squashfs_decompressor_thread_ops *thread_ops; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 32565dafa7f3..aac3ea72a9ba 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -47,10 +47,12 @@ enum Opt_errors { enum squashfs_param { Opt_errors, + Opt_threads, }; struct squashfs_mount_opts { enum Opt_errors errors; + const struct squashfs_decompressor_thread_ops *thread_ops; }; static const struct constant_table squashfs_param_errors[] = { @@ -61,9 +63,29 @@ static const struct constant_table squashfs_param_errors[] = { static const struct fs_parameter_spec squashfs_fs_parameters[] = { fsparam_enum("errors", Opt_errors, squashfs_param_errors), + fsparam_string("threads", Opt_threads), {} }; +static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (strcmp(str, "single") == 0) { + opts->thread_ops = &squashfs_decompressor_single; + return 0; + } + if (strcmp(str, "multi") == 0) { + opts->thread_ops = &squashfs_decompressor_multi; + return 0; + } + if (strcmp(str, "percpu") == 0) { + opts->thread_ops = &squashfs_decompressor_percpu; + return 0; + } +#endif + return -EINVAL; +} + static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; @@ -78,6 +100,10 @@ static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *para case Opt_errors: opts->errors = result.uint_32; break; + case Opt_threads: + if (squashfs_parse_param_threads(param->string, opts) != 0) + return -EINVAL; + break; default: return -EINVAL; } @@ -167,6 +193,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_bdev); goto failed_mount; } + msblk->thread_ops = opts->thread_ops; /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( @@ -252,7 +279,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - squashfs_max_decompressors(), msblk->block_size); + msblk->thread_ops->max_decompressors(), msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; @@ -383,7 +410,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); 
squashfs_cache_delete(msblk->read_page); - squashfs_decompressor_destroy(msblk); + msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -435,6 +462,20 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root) else seq_puts(s, ",errors=continue"); +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (msblk->thread_ops == &squashfs_decompressor_single) { + seq_puts(s, ",threads=single"); + return 0; + } + if (msblk->thread_ops == &squashfs_decompressor_multi) { + seq_puts(s, ",threads=multi"); + return 0; + } + if (msblk->thread_ops == &squashfs_decompressor_percpu) { + seq_puts(s, ",threads=percpu"); + return 0; + } +#endif return 0; } @@ -446,6 +487,15 @@ static int squashfs_init_fs_context(struct fs_context *fc) if (!opts) return -ENOMEM; +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + opts->thread_ops = &squashfs_decompressor_single; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI) + opts->thread_ops = &squashfs_decompressor_multi; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) + opts->thread_ops = &squashfs_decompressor_percpu; +#else +#error "fail: unknown squashfs decompression thread mode?" +#endif fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; @@ -478,7 +528,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_decompressor_destroy(sbi); + sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); -- cgit From fb40fe04f9df23114782d5edd1c5d017ae9d0ca8 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 19 Oct 2022 11:09:30 +0800 Subject: squashfs: allows users to configure the number of decompression threads The maximum number of threads in the decompressor_multi.c file is fixed and cannot be adjusted according to user needs. Therefore, the mount parameter needs to be added to allow users to configure the number of threads as required. The upper limit is num_online_cpus() * 2. Link: https://lkml.kernel.org/r/20221019030930.130456-3-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Jianguo Chen Cc: Jubin Zhong Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/Kconfig | 16 ++++++++++-- fs/squashfs/decompressor_multi.c | 4 +-- fs/squashfs/squashfs_fs_sb.h | 1 + fs/squashfs/super.c | 55 +++++++++++++++++++++++++++++++++++----- 4 files changed, 66 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 218bacdd4298..60fc98bdf421 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -73,11 +73,10 @@ config SQUASHFS_CHOICE_DECOMP_BY_MOUNT select SQUASHFS_DECOMP_SINGLE select SQUASHFS_DECOMP_MULTI select SQUASHFS_DECOMP_MULTI_PERCPU + select SQUASHFS_MOUNT_DECOMP_THREADS help Compile all parallel decompression modes and specify the decompression mode by setting "threads=" during mount. - threads= - default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE choice @@ -127,6 +126,19 @@ config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU decompression is load-balanced across the cores. endchoice +config SQUASHFS_MOUNT_DECOMP_THREADS + bool "Add the mount parameter 'threads=' for squashfs" + depends on SQUASHFS + depends on SQUASHFS_DECOMP_MULTI + default n + help + Use threads= to set the decompression parallel mode and the number of threads. 
+ If SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y + threads=<single|multi|percpu|1|2|3|...> + else + threads=<2|3|...> + The upper limit is num_online_cpus() * 2. + config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index eb25432bd149..416c53eedbd1 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -144,7 +144,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, * If there is no available decomp and already full, * let's wait for releasing decomp from other users. */ - if (stream->avail_decomp >= MAX_DECOMPRESSOR) + if (stream->avail_decomp >= msblk->max_thread_num) goto wait; /* Let's allocate new decomp */ @@ -160,7 +160,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, } stream->avail_decomp++; - WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR); + WARN_ON(stream->avail_decomp > msblk->max_thread_num); mutex_unlock(&stream->mutex); break; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index f1e5dad8ae0a..659082e9e51d 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -67,5 +67,6 @@ struct squashfs_sb_info { unsigned int ids; bool panic_on_errors; const struct squashfs_decompressor_thread_ops *thread_ops; + int max_thread_num; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index aac3ea72a9ba..1e428ca9414e 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -53,6 +53,7 @@ enum squashfs_param { struct squashfs_mount_opts { enum Opt_errors errors; const struct squashfs_decompressor_thread_ops *thread_ops; + int thread_num; }; static const struct constant_table squashfs_param_errors[] = { @@ -67,7 +68,8 @@ static const struct fs_parameter_spec squashfs_fs_parameters[] = { {} }; -static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) + +static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if (strcmp(str, "single") == 0) { @@ -86,6 +88,42 @@ static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_o return -EINVAL; } +static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + int ret; + unsigned long num; + + ret = kstrtoul(str, 0, &num); + if (ret != 0) + return -EINVAL; + if (num > 1) { + opts->thread_ops = &squashfs_decompressor_multi; + if (num > opts->thread_ops->max_decompressors()) + return -EINVAL; + opts->thread_num = (int)num; + return 0; + } +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + if (num == 1) { + opts->thread_ops = &squashfs_decompressor_single; + opts->thread_num = 1; + return 0; + } +#endif +#endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */ + return -EINVAL; +} + +static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) +{ + int ret = squashfs_parse_param_threads_str(str, opts); + + if (ret == 0) + return ret; + return squashfs_parse_param_threads_num(str, opts); +} + static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; @@ -194,6 +232,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) goto failed_mount; } msblk->thread_ops = opts->thread_ops; + if (opts->thread_num == 0) { + msblk->max_thread_num = msblk->thread_ops->max_decompressors(); + } else {
msblk->max_thread_num = opts->thread_num; + } /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( @@ -279,7 +322,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - msblk->thread_ops->max_decompressors(), msblk->block_size); + msblk->max_thread_num, msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; @@ -467,14 +510,13 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",threads=single"); return 0; } - if (msblk->thread_ops == &squashfs_decompressor_multi) { - seq_puts(s, ",threads=multi"); - return 0; - } if (msblk->thread_ops == &squashfs_decompressor_percpu) { seq_puts(s, ",threads=percpu"); return 0; } +#endif +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + seq_printf(s, ",threads=%d", msblk->max_thread_num); #endif return 0; } @@ -496,6 +538,7 @@ static int squashfs_init_fs_context(struct fs_context *fc) #else #error "fail: unknown squashfs decompression thread mode?" #endif + opts->thread_num = 0; fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; -- cgit From 13b6269dd022aaa69ca8d1df374ab327504121cf Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Tue, 1 Nov 2022 19:15:33 +0800 Subject: ocfs2: fix memory leak in ocfs2_stack_glue_init() ocfs2_table_header should be freed in ocfs2_stack_glue_init() if ocfs2_sysfs_init() fails, otherwise kmemleak will report a memleak. BUG: memory leak unreferenced object 0xffff88810eeb5800 (size 128): comm "modprobe", pid 4507, jiffies 4296182506 (age 55.888s) hex dump (first 32 bytes): c0 40 14 a0 ff ff ff ff 00 00 00 00 01 00 00 00 .@.............. 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000001e59e1cd>] __register_sysctl_table+0xca/0xef0 [<00000000c04f70f7>] 0xffffffffa0050037 [<000000001bd12912>] do_one_initcall+0xdb/0x480 [<0000000064f766c9>] do_init_module+0x1cf/0x680 [<000000002ba52db0>] load_module+0x6441/0x6f20 [<000000009772580d>] __do_sys_finit_module+0x12f/0x1c0 [<00000000380c1f22>] do_syscall_64+0x3f/0x90 [<000000004cf473bc>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Link: https://lkml.kernel.org/r/41651ca1-432a-db34-eb97-d35744559de1@linux.alibaba.com Fixes: 3878f110f71a ("ocfs2: Move the hb_ctl_path sysctl into the stack glue.") Signed-off-by: Shang XiaoJing Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/stackglue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 317126261523..a8d5ca98fa57 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -669,6 +669,8 @@ static struct ctl_table_header *ocfs2_table_header; static int __init ocfs2_stack_glue_init(void) { + int ret; + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); @@ -678,7 +680,11 @@ static int __init ocfs2_stack_glue_init(void) return -ENOMEM; /* or something.
*/ } - return ocfs2_sysfs_init(); + ret = ocfs2_sysfs_init(); + if (ret) + unregister_sysctl_table(ocfs2_table_header); + + return ret; } static void __exit ocfs2_stack_glue_exit(void) -- cgit From c7e8d3279c984e41165a7b510759bd1771ac3941 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 1 Nov 2022 15:33:43 +0800 Subject: squashfs: fix null-ptr-deref in squashfs_fill_super When squashfs_read_table() returns an error or `sb->s_magic != SQUASHFS_MAGIC`, enters the error branch and calls msblk->thread_ops->destroy(msblk) to destroy msblk. However, msblk->thread_ops has not been initialized. Therefore, the following problem is triggered: ================================================================== BUG: KASAN: null-ptr-deref in squashfs_fill_super+0xe7a/0x13b0 Read of size 8 at addr 0000000000000008 by task swapper/0/1 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.1.0-rc3-next-20221031 #367 Call Trace: dump_stack_lvl+0x73/0x9f print_report+0x743/0x759 kasan_report+0xc0/0x120 __asan_load8+0xd3/0x140 squashfs_fill_super+0xe7a/0x13b0 get_tree_bdev+0x27b/0x450 squashfs_get_tree+0x19/0x30 vfs_get_tree+0x49/0x150 path_mount+0xaae/0x1350 init_mount+0xad/0x100 do_mount_root+0xbc/0x1d0 mount_block_root+0x173/0x316 mount_root+0x223/0x236 prepare_namespace+0x1eb/0x237 kernel_init_freeable+0x528/0x576 kernel_init+0x29/0x250 ret_from_fork+0x1f/0x30 ================================================================== To solve this issue, msblk->thread_ops is initialized immediately after msblk is assigned a value. Link: https://lkml.kernel.org/r/20221101073343.3961562-1-libaokun1@huawei.com Fixes: b0645770d3c7 ("squashfs: add the mount parameter theads=") Signed-off-by: Baokun Li Reviewed-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Yu Kuai Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 1e428ca9414e..7d5265a39d20 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -197,6 +197,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; } msblk = sb->s_fs_info; + msblk->thread_ops = opts->thread_ops; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); @@ -231,7 +232,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_bdev); goto failed_mount; } - msblk->thread_ops = opts->thread_ops; + if (opts->thread_num == 0) { msblk->max_thread_num = msblk->thread_ops->max_decompressors(); } else { -- cgit From f6fbd8cbf3ed1915a7b957f2801f7c306a686c08 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 9 Nov 2022 14:14:35 -0500 Subject: lsm,fs: fix vfs_getxattr_alloc() return type and caller error paths The vfs_getxattr_alloc() function currently returns a ssize_t value despite the fact that it only uses int values internally for return values. Fix this by converting vfs_getxattr_alloc() to return an int type and adjust the callers as necessary. As part of these caller modifications, some of the callers are fixed to properly free the xattr value buffer on both success and failure to ensure that memory is not leaked in the failure case. 
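A minimal sketch of the calling convention this implies, based on the signature visible in the diff above (consume_xattr() and the xattr name are hypothetical; mnt_userns and dentry come from the caller's context):

```
char *value = NULL;
int rv;

rv = vfs_getxattr_alloc(mnt_userns, dentry, "user.example", &value, 0,
			GFP_KERNEL);
if (rv >= 0) {
	/* success: rv is the attribute length, the data sits in value */
	consume_xattr(value, rv);
}
/* free unconditionally: a buffer may have been allocated even on error */
kfree(value);
```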
Reviewed-by: Serge Hallyn Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- fs/xattr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..f38fd969b5fc 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -354,11 +354,12 @@ out_noalloc: * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, - * before retrieving the extended attribute. + * before retrieving the extended attribute. The xattr value buffer should + * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. */ -ssize_t +int vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) -- cgit From 406c706c7b7f1730aa787e914817b8d16b1e99f6 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Thu, 3 Nov 2022 17:02:10 +0000 Subject: vfs: vfs_tmpfile: ensure O_EXCL flag is enforced If O_EXCL is *not* specified, then linkat() can be used to link the temporary file into the filesystem. If O_EXCL is specified then linkat() should fail (-1). After commit 863f144f12ad ("vfs: open inside ->tmpfile()") the O_EXCL flag is no longer honored by the vfs layer for tmpfile, which means the file can be linked even if O_EXCL flag is specified, which is a change in behaviour for userspace! The open flags were previously passed as a parameter, so they were unaffected by the changes to file->f_flags caused by finish_open(). This patch fixes the issue by storing file->f_flags in a local variable so the O_EXCL test logic is restored. This regression was detected by Android CTS Bionic fcntl() tests running on android-mainline [1]. [1] https://android.googlesource.com/platform/bionic/+/ refs/heads/master/tests/fcntl_test.cpp#352 Fixes: 863f144f12ad ("vfs: open inside ->tmpfile()") Acked-by: Miklos Szeredi Tested-by: Will McVicker Signed-off-by: Peter Griffin Signed-off-by: Al Viro --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 578c2110df02..9155ecb547ce 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3591,6 +3591,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; + int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); @@ -3613,7 +3614,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, if (error) return error; inode = file_inode(file); - if (!(file->f_flags & O_EXCL)) { + if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); -- cgit From f391d6ee002ea022c62dc0b09d0578f3ccce81be Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Nov 2022 14:48:00 +0300 Subject: cifs: Use after free in debug code This debug code dereferences "old_iface" after it was already freed by the call to release_iface(). Re-order the debugging to avoid this issue.
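In sketch form, the rule the re-order follows (mirroring the hunk below): consume a refcounted object before dropping what may be its last reference.

```
/* buggy order: kref_put() may free old_iface, the debug print then
 * dereferences freed memory
 */
kref_put(&old_iface->refcount, release_iface);
cifs_dbg(FYI, "releasing ref to iface: %pIS\n", &old_iface->sockaddr);

/* fixed order: log first, drop the (possibly last) reference afterwards */
cifs_dbg(FYI, "releasing ref to iface: %pIS\n", &old_iface->sockaddr);
kref_put(&old_iface->refcount, release_iface);
```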
Fixes: b54034a73baf ("cifs: during reconnect, update interface if necessary") Cc: stable@vger.kernel.org # 5.19+ Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/sess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 92e4278ec35d..9e7d9f0baa18 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -302,14 +302,14 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) /* now drop the ref to the current iface */ if (old_iface && iface) { - kref_put(&old_iface->refcount, release_iface); cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", &old_iface->sockaddr, &iface->sockaddr); - } else if (old_iface) { kref_put(&old_iface->refcount, release_iface); + } else if (old_iface) { cifs_dbg(FYI, "releasing ref to iface: %pIS\n", &old_iface->sockaddr); + kref_put(&old_iface->refcount, release_iface); } else { WARN_ON(!iface); cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); -- cgit From bc943f4872a722c5cc64d1cf41daaaf4ec63158e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Nov 2022 19:08:23 +0100 Subject: ext2: Don't flush page immediately for DIRSYNC directories We do not need to write out modified directory blocks immediately when modifying them while the page is locked. It is enough to do the flush somewhat later, which has the added benefit that inode times can be flushed as well. It also allows us to stop depending on the write_one_page() function. Signed-off-by: Jan Kara --- fs/ext2/dir.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 27873733ed8c..6fa714dbee84 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -81,11 +81,10 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); @@ -94,16 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) i_size_write(dir, pos+len); mark_inode_dirty(dir); } - - if (IS_DIRSYNC(dir)) { - err = write_one_page(page); - if (!err) - err = sync_inode_metadata(dir, 1); - } else { - unlock_page(page); - } - - return err; + unlock_page(page); } static bool ext2_check_page(struct page *page, int quiet, char *kaddr) @@ -460,6 +450,17 @@ static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ext2_get_block); } + +static int ext2_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); + return err; +} + void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, struct page *page, void *page_addr, struct inode *inode, int update_times) @@ -474,11 +475,12 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, BUG_ON(err); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); - err = ext2_commit_chunk(page, pos, len); + ext2_commit_chunk(page, pos, len); if (update_times) dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); + ext2_handle_dirsync(dir); } /* @@ -566,10
+568,11 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); - err = ext2_commit_chunk(page, pos, rec_len); + ext2_commit_chunk(page, pos, rec_len); dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); + err = ext2_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: ext2_put_page(page, page_addr); @@ -615,10 +618,11 @@ int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page, if (pde) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; - err = ext2_commit_chunk(page, pos, to - from); + ext2_commit_chunk(page, pos, to - from); inode->i_ctime = inode->i_mtime = current_time(inode); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); + err = ext2_handle_dirsync(inode); out: return err; } @@ -658,7 +662,8 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) memcpy (de->name, "..\0", 4); ext2_set_de_type (de, inode); kunmap_atomic(kaddr); - err = ext2_commit_chunk(page, 0, chunk_size); + ext2_commit_chunk(page, 0, chunk_size); + err = ext2_handle_dirsync(inode); fail: put_page(page); return err; -- cgit From a27c442d61cea70f38d9340528225b234888885b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Nov 2022 17:28:55 +0100 Subject: ext2: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/ext2/inode.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 918ab2f9e4c0..3b2e3e1e0fa2 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -869,11 +869,6 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -static int ext2_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, ext2_get_block, wbc); -} - static int ext2_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, ext2_get_block); @@ -948,7 +943,6 @@ const struct address_space_operations ext2_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = ext2_read_folio, .readahead = ext2_readahead, - .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, .bmap = ext2_bmap, -- cgit From 36273e5b4e3a934c6d346c8f0b16b97e018094af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Nov 2022 17:29:02 +0100 Subject: udf: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation in udf.
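For a buffer_head based filesystem, the write-back entry point that remains after such a conversion is ->writepages; a minimal sketch under assumed names (example_writepages and example_get_block are hypothetical):

```
#include <linux/mpage.h>

/* hypothetical filesystem: data writeback goes through ->writepages only */
static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	/* batched writeback; replaces the per-page ->writepage entry point */
	return mpage_writepages(mapping, wbc, example_get_block);
}
```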
Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/udf/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index dce6ae9ae306..0246b1b86fb9 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -182,11 +182,6 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } -static int udf_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, udf_get_block, wbc); -} - static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -239,12 +234,12 @@ const struct address_space_operations udf_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = udf_read_folio, .readahead = udf_readahead, - .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, .write_end = generic_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit From c51f0e6a1254b3ac2d308e1c6fd8fb936992b455 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 10:39:44 +0100 Subject: btrfs: zoned: fix missing endianness conversion in sb_write_pointer generation is an on-disk __le64 value, so use btrfs_super_generation to convert it to host endian before comparing it. Fixes: 12659251ca5d ("btrfs: implement log-structured superblock for ZONED mode") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1912abf6d020..44d8177eeffc 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -134,7 +134,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, super[i] = page_address(page[i]); } - if (super[0]->generation > super[1]->generation) + if (btrfs_super_generation(super[0]) > + btrfs_super_generation(super[1])) sector = zones[1].start; else sector = zones[0].start; -- cgit From a11452a3709e217492798cf3686ac2cc8eb3fb51 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 15 Nov 2022 16:29:44 +0000 Subject: btrfs: send: avoid unaligned encoded writes when attempting to clone range When trying to see if we can clone a file range, there are cases where we end up sending two write operations in case the inode from the source root has an i_size that is not sector size aligned and the length from the current offset to its i_size is less than the remaining length we are trying to clone. Issuing two write operations when we could instead issue a single write operation is not incorrect. However, it is not optimal, especially if the extents are compressed and the flag BTRFS_SEND_FLAG_COMPRESSED was passed to the send ioctl. In that case we can end up sending an encoded write with an offset that is not sector size aligned, which makes the receiver fall back to decompressing the data and writing it using regular buffered IO (so re-compressing the data in case the fs is mounted with compression enabled), because encoded writes fail with -EINVAL when an offset is not sector size aligned.
The following example, which triggered a bug in the receiver code for the fallback logic of decompressing + regular buffer IO and is fixed by the patchset referred to in a Link at the bottom of this changelog, is an example where we have the non-optimal behaviour due to an unaligned encoded write: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV > /dev/null mount -o compress $DEV $MNT # File foo has a size of 33K, not aligned to the sector size. xfs_io -f -c "pwrite -S 0xab 0 33K" $MNT/foo xfs_io -f -c "pwrite -S 0xcd 0 64K" $MNT/bar # Now clone the first 32K of file bar into foo at offset 0. xfs_io -c "reflink $MNT/bar 0 0 32K" $MNT/foo # Snapshot the default subvolume and create a full send stream (v2). btrfs subvolume snapshot -r $MNT $MNT/snap btrfs send --compressed-data -f /tmp/test.send $MNT/snap echo -e "\nFile bar in the original filesystem:" od -A d -t x1 $MNT/snap/bar umount $MNT mkfs.btrfs -f $DEV > /dev/null mount $DEV $MNT echo -e "\nReceiving stream in a new filesystem..." btrfs receive -f /tmp/test.send $MNT echo -e "\nFile bar in the new filesystem:" od -A d -t x1 $MNT/snap/bar umount $MNT Before this patch, the send stream included one regular write and one encoded write for file 'bar', with the latter not being sector size aligned and causing the receiver to fall back to decompression + buffered writes. The output of the btrfs receive command in verbose mode (-vvv): (...) mkfile o258-7-0 rename o258-7-0 -> bar utimes clone bar - source=foo source offset=0 offset=0 length=32768 write bar - offset=32768 length=1024 encoded_write bar - offset=33792, len=4096, unencoded_offset=33792, unencoded_file_len=31744, unencoded_len=65536, compression=1, encryption=0 encoded_write bar - falling back to decompress and write due to errno 22 ("Invalid argument") (...) This patch avoids the regular write followed by an unaligned encoded write so that we end up sending a single encoded write that is aligned. So after this patch the stream content is (output of btrfs receive -vvv): (...) mkfile o258-7-0 rename o258-7-0 -> bar utimes clone bar - source=foo source offset=0 offset=0 length=32768 encoded_write bar - offset=32768, len=4096, unencoded_offset=32768, unencoded_file_len=32768, unencoded_len=65536, compression=1, encryption=0 (...) So we get more optimal behaviour and avoid the silent data loss bug in versions of btrfs-progs affected by the bug referred to by the Link tag below (btrfs-progs v5.19, v5.19.1, v6.0 and v6.0.1).
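(Worked out for the 4096-byte sector size used in the example above: 32768 = 8 * 4096 is sector size aligned, while 33792 = 32768 + 1024 is not, which is why the first stream's encoded write forced the receiver to fall back.)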
Link: https://lore.kernel.org/linux-btrfs/cover.1668529099.git.fdmanana@suse.com/ Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 145c84b44fd0..1c4b693ee4a3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5702,6 +5702,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, u64 ext_len; u64 clone_len; u64 clone_data_offset; + bool crossed_src_i_size = false; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5759,8 +5760,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (key.offset >= clone_src_i_size) break; - if (key.offset + ext_len > clone_src_i_size) + if (key.offset + ext_len > clone_src_i_size) { ext_len = clone_src_i_size - key.offset; + crossed_src_i_size = true; + } clone_data_offset = btrfs_file_extent_offset(leaf, ei); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { @@ -5821,6 +5824,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, clone_len, clone_root); } + } else if (crossed_src_i_size && clone_len < len) { + /* + * If we are at i_size of the clone source inode and we + * can not clone from it, terminate the loop. This is + * to avoid sending two write operations, one with a + * length matching clone_len and the final one after + * this loop with a length of len - clone_len. + * + * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED + * was passed to the send ioctl), this helps avoid + * sending an encoded write for an offset that is not + * sector size aligned, in case the i_size of the source + * inode is not sector size aligned. That will make the + * receiver fallback to decompression of the data and + * writing it using regular buffered IO, therefore while + * not incorrect, it's not optimal due decompression and + * possible re-compression at the receiver. + */ + break; } else { ret = send_extent_data(sctx, dst_path, offset, clone_len); -- cgit From f7e942b5bb35d8e3af54053d19a6bf04143a3955 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 16 Nov 2022 22:23:54 +0800 Subject: btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit() Syzkaller reported BUG as follows: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:274 Call Trace: dump_stack_lvl+0xcd/0x134 __might_resched.cold+0x222/0x26b kmem_cache_alloc+0x2e7/0x3c0 update_qgroup_limit_item+0xe1/0x390 btrfs_qgroup_inherit+0x147b/0x1ee0 create_subvol+0x4eb/0x1710 btrfs_mksubvol+0xfe5/0x13f0 __btrfs_ioctl_snap_create+0x2b0/0x430 btrfs_ioctl_snap_create_v2+0x25a/0x520 btrfs_ioctl+0x2a1c/0x5ce0 __x64_sys_ioctl+0x193/0x200 do_syscall_64+0x35/0x80 Fix this by calling qgroup_dirty() on @dstgroup, and updating the limit item in btrfs_run_qgroups() later, outside of the spinlock context.
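A minimal sketch of the pattern being fixed (kernel-style pseudocode built only from the functions named in this message, with the lock name simplified; not the exact code):

    spin_lock(&fs_info->qgroup_lock);
    /* before: update_qgroup_limit_item() could allocate memory with
     * GFP_KERNEL and therefore sleep while this spinlock is held */
    qgroup_dirty(fs_info, dstgroup);     /* after: only mark it dirty */
    spin_unlock(&fs_info->qgroup_lock);

    /* the limit item is then written later by btrfs_run_qgroups(),
     * outside of the spinlock context */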
CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Qu Wenruo Signed-off-by: ChenXiaoSong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9334c3157c22..b74105a10f16 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2951,14 +2951,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; - ret = update_qgroup_limit_item(trans, dstgroup); - if (ret) { - qgroup_mark_inconsistent(fs_info); - btrfs_info(fs_info, - "unable to update quota limit for %llu", - dstgroup->qgroupid); - goto unlock; - } + qgroup_dirty(fs_info, dstgroup); } if (srcid) { -- cgit From caf1aeaffc3b09649a56769e559333ae2c4f1802 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:10:53 -0700 Subject: eventpoll: add EPOLL_URING_WAKE poll wakeup flag We can have dependencies between epoll and io_uring. Consider an epoll context, identified by the epfd file descriptor, and an io_uring file descriptor identified by iofd. If we add iofd to the epfd context, and arm a multishot poll request for epfd with iofd, then the multishot poll request will repeatedly trigger and generate events until terminated by CQ ring overflow. This isn't a desired behavior. Add EPOLL_URING_WAKE so that io_uring can pass it in as part of the poll wakeup key, and io_uring can check for that to detect a potential recursive invocation. Cc: stable@vger.kernel.org # 6.0 Signed-off-by: Jens Axboe --- fs/eventpoll.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 52954d4637b5..64659b110973 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -491,7 +491,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, + unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; @@ -522,16 +523,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; - wake_up_locked_poll(&ep->poll_wait, EPOLLIN); + wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else -static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, + unsigned pollflags) { - wake_up_poll(&ep->poll_wait, EPOLLIN); + wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif @@ -742,7 +744,7 @@ static void ep_free(struct eventpoll *ep) /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0); /* * We need to lock this because we could be hit by @@ -1208,7 +1210,7 @@ out_unlock: /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, epi); + ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; @@ -1553,7 +1555,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0);
return 0; } @@ -1629,7 +1631,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0); return 0; } -- cgit From f217d7ccb9f9ae8d9525958f902c059db1c78355 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:40 -0500 Subject: fs: dlm: avoid false-positive checker warning This patch avoids the false-positive checker warning about writing 112 bytes into an 88-byte field "e->request", see: [ 54.891560] dlm: csmb1: dlm_recover_directory 23 out 2 messages [ 54.990542] ------------[ cut here ]------------ [ 54.991012] memcpy: detected field-spanning write (size 112) of single field "&e->request" at fs/dlm/requestqueue.c:47 (size 88) [ 54.992150] WARNING: CPU: 0 PID: 297 at fs/dlm/requestqueue.c:47 dlm_add_requestqueue+0x177/0x180 [ 54.993002] CPU: 0 PID: 297 Comm: kworker/u4:3 Not tainted 6.1.0-rc5-00008-ge01d50cbd6ee #248 [ 54.993878] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-1.fc36 04/01/2014 [ 54.994718] Workqueue: dlm_recv process_recv_sockets [ 54.995230] RIP: 0010:dlm_add_requestqueue+0x177/0x180 [ 54.995731] Code: e7 01 0f 85 3b ff ff ff b9 58 00 00 00 48 c7 c2 c0 41 74 82 4c 89 ee 48 c7 c7 20 42 74 82 c6 05 8b 8d 30 02 01 e8 51 07 be 00 <0f> 0b e9 12 ff ff ff 66 90 0f 1f 44 00 00 41 57 48 8d 87 10 08 00 [ 54.997483] RSP: 0018:ffffc90000b1fbe8 EFLAGS: 00010282 [ 54.997990] RAX: 0000000000000000 RBX: ffff888024fc3d00 RCX: 0000000000000000 [ 54.998667] RDX: 0000000000000001 RSI: ffffffff81155014 RDI: fffff52000163f73 [ 54.999342] RBP: ffff88800dbac000 R08: 0000000000000001 R09: ffffc90000b1fa5f [ 54.999997] R10: fffff52000163f4b R11: 203a7970636d656d R12: ffff88800cfb0018 [ 55.000673] R13: 0000000000000070 R14: ffff888024fc3d18 R15: 0000000000000000 [ 55.001344] FS: 0000000000000000(0000) GS:ffff88806d600000(0000) knlGS:0000000000000000 [ 55.002078] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 55.002603] CR2: 00007f35d4f0b9a0 CR3: 0000000025495002 CR4: 0000000000770ef0 [ 55.003258] PKRU: 55555554 [ 55.003514] Call Trace: [ 55.003756] [ 55.003953] dlm_receive_buffer+0x1c0/0x200 [ 55.004348] dlm_process_incoming_buffer+0x46d/0x780 [ 55.004786] ? kernel_recvmsg+0x8b/0xc0 [ 55.005150] receive_from_sock.isra.0+0x168/0x420 [ 55.005582] ? process_listen_recv_socket+0x10/0x10 [ 55.006018] ? finish_task_switch.isra.0+0xe0/0x400 [ 55.006469] ? __switch_to+0x2fe/0x6a0 [ 55.006808] ? read_word_at_a_time+0xe/0x20 [ 55.007197] ? strscpy+0x146/0x190 [ 55.007505] process_one_work+0x3d0/0x6b0 [ 55.007863] worker_thread+0x8d/0x620 [ 55.008209] ? __kthread_parkme+0xd8/0xf0 [ 55.008565] ? process_one_work+0x6b0/0x6b0 [ 55.008937] kthread+0x171/0x1a0 [ 55.009251] ? kthread_exit+0x60/0x60 [ 55.009582] ret_from_fork+0x1f/0x30 [ 55.009903] [ 55.010120] ---[ end trace 0000000000000000 ]--- [ 55.025783] dlm: csmb1: dlm_recover 5 generation 3 done: 201 ms [ 55.026466] gfs2: fsid=smbcluster:csmb1.0: recover generation 3 done It seems the checker is unable to detect the additional length bytes that were allocated for the flexible array in struct dlm_message. To solve it, we split the memcpy() into one copy for the 88-byte struct and another memcpy() for the flexible array m_extra field.
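A self-contained sketch of the copy split, using simplified stand-in types (the real struct dlm_message is different; only the flexible-array idea carries over):

    #include <string.h>

    struct msg {
        char fixed[88];   /* stands in for the 88-byte fixed part */
        char m_extra[];   /* flexible array member, runtime length */
    };

    static void copy_msg(struct msg *dst, const struct msg *src, size_t extra)
    {
        /* one memcpy() per field keeps the fortified checker happy:
         * sizeof(*src) covers only the fixed part (the flexible array
         * contributes nothing to sizeof), the tail is copied separately */
        memcpy(dst, src, sizeof(*src));
        memcpy(dst->m_extra, src->m_extra, extra);
    }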
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/requestqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 036a9a0078f6..8be2893ad15b 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -44,7 +44,8 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; e->nodeid = nodeid; - memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length)); + memcpy(&e->request, ms, sizeof(*ms)); + memcpy(&e->request.m_extra, ms->m_extra, length); atomic_inc(&ls->ls_requestqueue_cnt); mutex_lock(&ls->ls_requestqueue_mutex); -- cgit From 9267c85769e62c10961451fd28e88de996fdf401 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:41 -0500 Subject: fs: dlm: drop lkb ref in bug case This patch drops the lkb reference in a very unlikely case which should not happen in practice. However, if it does happen, we clean up the reference just in case. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 078bbbd43a53..982d093b80cd 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -186,7 +186,7 @@ void dlm_callback_work(struct work_struct *work) spin_unlock(&lkb->lkb_cb_lock); if (WARN_ON(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) - return; + goto out; for (;;) { castfn = lkb->lkb_astfn; @@ -217,6 +217,7 @@ void dlm_callback_work(struct work_struct *work) spin_unlock(&lkb->lkb_cb_lock); } +out: /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ dlm_put_lkb(lkb); } -- cgit From 740bb8fc10d226d64c7da2271cf0b25dab1538dc Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:42 -0500 Subject: fs: dlm: ast do WARN_ON_ONCE() on hotpath This patch changes the ast hotpath to use WARN_ON_ONCE() instead of WARN_ON() in very unlikely cases, so that we do not spam the console output if we run into states that would occur over and over again.
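For context, a hedged sketch of the difference this relies on (dequeue_callback() is a hypothetical stand-in): WARN_ON() prints a full backtrace every time its condition is true, while WARN_ON_ONCE() latches after the first report, so a recurring bad state on the hotpath logs only once:

    for (;;) {
        rv = dequeue_callback(lkb, &cb);
        if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY))
            break;   /* backtrace printed only on the first occurrence */
        /* ... deliver the callback ... */
    }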
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 6 +++--- fs/dlm/user.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 982d093b80cd..7dd072b8ea38 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -161,12 +161,12 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, spin_unlock(&ls->ls_cb_lock); break; case DLM_ENQUEUE_CALLBACK_FAILURE: - WARN_ON(1); + WARN_ON_ONCE(1); break; case DLM_ENQUEUE_CALLBACK_SUCCESS: break; default: - WARN_ON(1); + WARN_ON_ONCE(1); break; } spin_unlock(&lkb->lkb_cb_lock); @@ -185,7 +185,7 @@ void dlm_callback_work(struct work_struct *work) rv = dlm_dequeue_lkb_callback(lkb, &cb); spin_unlock(&lkb->lkb_cb_lock); - if (WARN_ON(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) + if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) goto out; for (;;) { diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 6b530db4bc0b..edf722d6935a 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -214,7 +214,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, switch (rv) { case DLM_ENQUEUE_CALLBACK_FAILURE: spin_unlock(&proc->asts_spin); - WARN_ON(1); + WARN_ON_ONCE(1); goto out; case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); @@ -224,7 +224,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, case DLM_ENQUEUE_CALLBACK_SUCCESS: break; default: - WARN_ON(1); + WARN_ON_ONCE(1); break; } spin_unlock(&proc->asts_spin); @@ -880,7 +880,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, spin_unlock(&proc->asts_spin); /* removes ref for proc->asts, may cause lkb to be freed */ dlm_put_lkb(lkb); - WARN_ON(1); + WARN_ON_ONCE(1); goto try_another; case DLM_DEQUEUE_CALLBACK_LAST: list_del_init(&lkb->lkb_cb_list); @@ -890,7 +890,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, spin_unlock(&proc->asts_spin); break; case DLM_DEQUEUE_CALLBACK_SUCCESS: break; default: - WARN_ON(1); + WARN_ON_ONCE(1); break; } spin_unlock(&proc->asts_spin); -- cgit From 554d849616769339b9dce833a4830251fc4b91ba Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:43 -0500 Subject: fs: dlm: rename DLM_IFL_NEED_SCHED to DLM_IFL_CB_PENDING This patch renames DLM_IFL_NEED_SCHED to DLM_IFL_CB_PENDING because CB_PENDING describes this flag more accurately. The flag is set when callback enqueue returns DLM_ENQUEUE_CALLBACK_NEED_SCHED because the callback worker needs to be queued. It tells us that callbacks are currently pending to be called, and it is cleared when the callback work for the specific lkb is done. The need to schedule the worker covers only part of that time, so the more accurate description is that some callbacks are pending to be called.
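Condensed from the enqueue side of the diff below, the flag protocol is (sketch only):

    spin_lock(&lkb->lkb_cb_lock);
    if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) {
        lkb->lkb_flags |= DLM_IFL_CB_PENDING;   /* first pending callback */
        rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED;   /* caller queues the worker */
    }
    list_add_tail(&cb->list, &lkb->lkb_callbacks);
    spin_unlock(&lkb->lkb_cb_lock);
    /* the callback worker clears DLM_IFL_CB_PENDING once the queue drains */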
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 9 ++++----- fs/dlm/dlm_internal.h | 2 +- fs/dlm/user.c | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 7dd072b8ea38..26fef9945cc9 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -45,8 +45,7 @@ void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) kref_put(&cb->ref, dlm_release_callback); } - /* TODO */ - lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; /* invalidate */ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); @@ -104,8 +103,8 @@ int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, cb->sb_status = status; cb->sb_flags = (sbflags & 0x000000FF); kref_init(&cb->ref); - if (!(lkb->lkb_flags & DLM_IFL_NEED_SCHED)) { - lkb->lkb_flags |= DLM_IFL_NEED_SCHED; + if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) { + lkb->lkb_flags |= DLM_IFL_CB_PENDING; rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED; } list_add_tail(&cb->list, &lkb->lkb_callbacks); @@ -210,7 +209,7 @@ void dlm_callback_work(struct work_struct *work) spin_lock(&lkb->lkb_cb_lock); rv = dlm_dequeue_lkb_callback(lkb, &cb); if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) { - lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; spin_unlock(&lkb->lkb_cb_lock); break; } diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 6318e0f51bc9..ab1a55337a6e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -211,7 +211,7 @@ struct dlm_args { #endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ -#define DLM_IFL_NEED_SCHED 0x04000000 +#define DLM_IFL_CB_PENDING 0x04000000 /* least significant 2 bytes are message changed, they are full transmitted * but at receive side only the 2 bytes LSB will be set. * diff --git a/fs/dlm/user.c b/fs/dlm/user.c index edf722d6935a..35129505ddda 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -884,8 +884,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, goto try_another; case DLM_DEQUEUE_CALLBACK_LAST: list_del_init(&lkb->lkb_cb_list); - /* TODO */ - lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; break; case DLM_DEQUEUE_CALLBACK_SUCCESS: break; -- cgit From 17827754e503d6c72b05a1c4603469ec9bf35d48 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:45 -0500 Subject: fs: dlm: add dst nodeid for msg tracing In DLM, when we send a dlm message it is easy to add the lock resource name, but an additional lookup is required to trace the receive message side. The idea here is to move that lookup work to the user, who can match the right send message with its receive message. Note that DLM can't drop any message; this is guaranteed by a special session layer. To do the lookup, a 3-tuple is required as a unique identification: destination nodeid, source nodeid and sequence number. This patch adds the destination nodeid to the dlm message trace points. The source nodeid is given by the h_nodeid field inside the header.
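A sketch of the resulting correlation key (illustrative struct, not the actual tracepoint layout):

    #include <stdint.h>

    struct dlm_msg_key {
        uint32_t dst_nodeid;   /* added to the trace points by this patch */
        uint32_t src_nodeid;   /* already available via the header h_nodeid */
        uint32_t seq;          /* session layer sequence number */
    };
    /* a send event and a recv event describe the same message exactly when
     * their keys match; since the session layer never drops a message,
     * every send key eventually shows up on the receive side */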
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 32194a750fe1..960def5ab530 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -479,10 +479,10 @@ static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) { switch (p->header.h_cmd) { case DLM_MSG: - trace_dlm_recv_message(seq, &p->message); + trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message); break; case DLM_RCOM: - trace_dlm_recv_rcom(seq, &p->rcom); + trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom); break; default: break; @@ -1151,11 +1151,13 @@ static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, { switch (mh->inner_p->header.h_cmd) { case DLM_MSG: - trace_dlm_send_message(mh->seq, &mh->inner_p->message, + trace_dlm_send_message(mh->node->nodeid, mh->seq, + &mh->inner_p->message, name, namelen); break; case DLM_RCOM: - trace_dlm_send_rcom(mh->seq, &mh->inner_p->rcom); + trace_dlm_send_rcom(mh->node->nodeid, mh->seq, + &mh->inner_p->rcom); break; default: /* nothing to trace */ -- cgit From 8b0188b0d60b6f6183b48380bac49fe080c5ded9 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:46 -0500 Subject: fs: dlm: add midcomms init/start functions This patch introduces the missing init, start, stop and exit functionality. The dlm application layer should always call the midcomms layer, which becomes aware of such events and redirects them to the lowcomms layer. Some functionality that is currently handled inside the start functions of midcomms and lowcomms should be handled in the init functions, as it only needs to be initialized once when dlm is loaded.
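The intended layering after this patch, summarized from the diff below (lifecycle sketch; midcomms always forwards to lowcomms):

    module load:       init_dlm()              -> dlm_midcomms_init()
    first lockspace:   __dlm_new_lockspace()   -> dlm_midcomms_start()
    last lockspace:    dlm_release_lockspace() -> dlm_midcomms_stop()
    module unload:     exit_dlm()              -> dlm_midcomms_exit()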
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 5 ++--- fs/dlm/lowcomms.c | 16 ++++++++++------ fs/dlm/lowcomms.h | 1 + fs/dlm/main.c | 7 +++++-- fs/dlm/midcomms.c | 17 ++++++++++++++++- fs/dlm/midcomms.h | 3 +++ 6 files changed, 37 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 96fe8f7ca09c..d0b4e2181a5f 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -17,7 +17,6 @@ #include "recoverd.h" #include "dir.h" #include "midcomms.h" -#include "lowcomms.h" #include "config.h" #include "memory.h" #include "lock.h" @@ -723,7 +722,7 @@ static int __dlm_new_lockspace(const char *name, const char *cluster, if (!ls_count) { dlm_scand_stop(); dlm_midcomms_shutdown(); - dlm_lowcomms_stop(); + dlm_midcomms_stop(); } out: mutex_unlock(&ls_lock); @@ -926,7 +925,7 @@ int dlm_release_lockspace(void *lockspace, int force) if (!error) ls_count--; if (!ls_count) - dlm_lowcomms_stop(); + dlm_midcomms_stop(); mutex_unlock(&ls_lock); return error; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b05c6d9b5102..0dcb69cc456a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1987,10 +1987,6 @@ static const struct dlm_proto_ops dlm_sctp_ops = { int dlm_lowcomms_start(void) { int error = -EINVAL; - int i; - - for (i = 0; i < CONN_HASH_SIZE; i++) - INIT_HLIST_HEAD(&connection_hash[i]); init_local(); if (!dlm_local_count) { @@ -1999,8 +1995,6 @@ int dlm_lowcomms_start(void) goto fail; } - INIT_WORK(&listen_con.rwork, process_listen_recv_socket); - error = work_start(); if (error) goto fail_local; @@ -2039,6 +2033,16 @@ fail: return error; } +void dlm_lowcomms_init(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&connection_hash[i]); + + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); +} + void dlm_lowcomms_exit(void) { struct dlm_node_addr *na, *safe; diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 29369feea991..bbce7a18416d 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -35,6 +35,7 @@ extern int dlm_allow_conn; int dlm_lowcomms_start(void); void dlm_lowcomms_shutdown(void); void dlm_lowcomms_stop(void); +void dlm_lowcomms_init(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 1c5be4b70ac1..a77338be3237 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -17,7 +17,7 @@ #include "user.h" #include "memory.h" #include "config.h" -#include "lowcomms.h" +#include "midcomms.h" #define CREATE_TRACE_POINTS #include @@ -30,6 +30,8 @@ static int __init init_dlm(void) if (error) goto out; + dlm_midcomms_init(); + error = dlm_lockspace_init(); if (error) goto out_mem; @@ -66,6 +68,7 @@ static int __init init_dlm(void) out_lockspace: dlm_lockspace_exit(); out_mem: + dlm_midcomms_exit(); dlm_memory_exit(); out: return error; @@ -79,7 +82,7 @@ static void __exit exit_dlm(void) dlm_config_exit(); dlm_memory_exit(); dlm_lockspace_exit(); - dlm_lowcomms_exit(); + dlm_midcomms_exit(); dlm_unregister_debugfs(); } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 960def5ab530..6b4c72fb0171 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1205,13 +1205,28 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, #endif int dlm_midcomms_start(void) +{ + return dlm_lowcomms_start(); +} + +void dlm_midcomms_stop(void) +{ + dlm_lowcomms_stop(); +} + +void dlm_midcomms_init(void) { int i; for (i = 0; i < 
CONN_HASH_SIZE; i++) INIT_HLIST_HEAD(&node_hash[i]); - return dlm_lowcomms_start(); + dlm_lowcomms_init(); +} + +void dlm_midcomms_exit(void) +{ + dlm_lowcomms_exit(); } static void dlm_act_fin_ack_rcv(struct midcomms_node *node) diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index d6286e80b077..69296552d5ad 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -21,6 +21,9 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); +void dlm_midcomms_stop(void); +void dlm_midcomms_init(void); +void dlm_midcomms_exit(void); void dlm_midcomms_shutdown(void); void dlm_midcomms_add_member(int nodeid); void dlm_midcomms_remove_member(int nodeid); -- cgit From 01ea3d7701cb52dece379384f8aa7b8840f1d7c7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:47 -0500 Subject: fs: dlm: remove twice INIT_WORK This patch removes a duplicated INIT_WORK() call. We already do this inside dlm_lowcomms_init(), which is called only once when dlm is loaded. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0dcb69cc456a..3106e7f87344 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1825,7 +1825,6 @@ static int dlm_listen_for_all(void) save_listen_callbacks(sock); add_listen_sock(sock, &listen_con); - INIT_WORK(&listen_con.rwork, process_listen_recv_socket); result = sock->ops->listen(sock, 5); if (result < 0) { dlm_close_sock(&listen_con.sock); -- cgit From dd070a56e0fa36f03bcd09fbf1521c733cf2aa21 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:48 -0500 Subject: fs: dlm: use list_first_entry_or_null Instead of checking list_empty() we can do the same with list_first_entry_or_null() and return NULL if the returned entry is NULL. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3106e7f87344..d3302b10b37e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -214,15 +214,12 @@ static struct writequeue_entry *con_next_wq(struct connection *con) { struct writequeue_entry *e; - if (list_empty(&con->writequeue)) - return NULL; - - e = list_first_entry(&con->writequeue, struct writequeue_entry, - list); + e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry, + list); /* if len is zero nothing is to send, if there are users filling * buffers we wait until the users are done so we can send more. */ - if (e->users || e->len == 0) + if (!e || e->users || e->len == 0) return NULL; return e; -- cgit From 1037c2a94ab51997d8b1ef9e7f6ed697e6e17d84 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:49 -0500 Subject: fs: dlm: use listen sock as dlm running indicator This patch switches the check of whether dlm lowcomms is running from the dlm_allow_conn flag to whether we actually have a listen socket set. The listen socket is set and unset in the lowcomms start and shutdown functions. To synchronize with the data_ready() callback, the socket callback is swapped while the socket lock is held.
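A condensed sketch of the synchronization described above, drawn from this and the following patches (not new code): the listen socket's data_ready callback is installed and restored under the socket lock, so a concurrent data_ready cannot race with setup or teardown:

    /* start: save the original callback and install the dlm one */
    lock_sock(sock->sk);
    listen_sock.sk_data_ready = sock->sk->sk_data_ready;
    sock->sk->sk_data_ready = lowcomms_listen_data_ready;
    release_sock(sock->sk);

    /* shutdown: restore the saved callback before closing the socket */
    lock_sock(sock->sk);
    sock->sk->sk_data_ready = listen_sock.sk_data_ready;
    release_sock(sock->sk);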
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 4 ++-- fs/dlm/lowcomms.c | 17 ++++++----------- fs/dlm/lowcomms.h | 4 ++-- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index ac8b62106ce0..20b60709eccf 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -183,7 +183,7 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x) return -EINVAL; } - if (dlm_allow_conn) + if (dlm_lowcomms_is_running()) return -EBUSY; return 0; @@ -194,7 +194,7 @@ static int dlm_check_zero_and_dlm_running(unsigned int x) if (!x) return -EINVAL; - if (dlm_allow_conn) + if (dlm_lowcomms_is_running()) return -EBUSY; return 0; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d3302b10b37e..d9001d40aa6e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -176,7 +176,6 @@ static DEFINE_SPINLOCK(dlm_node_addrs_spin); static struct listen_connection listen_con; static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; -int dlm_allow_conn; /* Work queues */ static struct workqueue_struct *recv_workqueue; @@ -191,6 +190,11 @@ static const struct dlm_proto_ops *dlm_proto_ops; static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +bool dlm_lowcomms_is_running(void) +{ + return !!listen_con.sock; +} + static void writequeue_entry_ctor(void *data) { struct writequeue_entry *entry = data; @@ -511,9 +515,6 @@ static void lowcomms_data_ready(struct sock *sk) static void lowcomms_listen_data_ready(struct sock *sk) { - if (!dlm_allow_conn) - return; - queue_work(recv_workqueue, &listen_con.rwork); } @@ -1689,10 +1690,7 @@ void dlm_lowcomms_shutdown(void) { int idx; - /* Set all the flags to prevent any - * socket activity. - */ - dlm_allow_conn = 0; + restore_callbacks(listen_con.sock); if (recv_workqueue) flush_workqueue(recv_workqueue); @@ -1995,8 +1993,6 @@ int dlm_lowcomms_start(void) if (error) goto fail_local; - dlm_allow_conn = 1; - /* Start listening */ switch (dlm_config.ci_protocol) { case DLM_PROTO_TCP: @@ -2021,7 +2017,6 @@ int dlm_lowcomms_start(void) fail_listen: dlm_proto_ops = NULL; fail_proto_ops: - dlm_allow_conn = 0; work_stop(); fail_local: deinit_local(); diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index bbce7a18416d..acaf1b0b3885 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -29,8 +29,8 @@ static inline int nodeid_hash(int nodeid) return nodeid & (CONN_HASH_SIZE-1); } -/* switch to check if dlm is running */ -extern int dlm_allow_conn; +/* check if dlm is running */ +bool dlm_lowcomms_is_running(void); int dlm_lowcomms_start(void); void dlm_lowcomms_shutdown(void); -- cgit From 4f567acb0b8622fe265aac4b0037a03ef544ba24 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:50 -0500 Subject: fs: dlm: remove socket shutdown handling Since commit 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") we have functionality at the dlm application protocol layer like TCP offers for half-closed sockets. This feature is required because cluster manager events about leaving resource memberships can already have occurred locally while other cluster nodes still have a pending leave membership happening over the cluster manager protocol.
During this time the local dlm node has already shut down its connection and does not transmit any new dlm messages, but it still needs to be able to accept dlm messages because of the pending leave membership request in the cluster manager protocol, which the dlm kernel implementation has no control over. We have this functionality at the application layer for two reasons. The main reason is that SCTP does not support such functionality at the socket layer, but we can do it inside the application layer. Another small issue is that this feature is broken in the TCP world, because some NAT devices do not implement it correctly. This is the same reason why the reliable connection session layer in DLM exists: we give up on middle devices in the network which send e.g. TCP resets out. In DLM we cannot have any message dropping, and we ensure over a session layer that it can't happen. Back to the half-closed graceful shutdown handling: it is no longer necessary to do it at the socket layer (which is only supported for TCP sockets) because we do it at the application layer. This patch removes this handling; if there are still issues, then we have a problem in the application layer handling. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 127 ++++++++++-------------------------------------------- fs/dlm/lowcomms.h | 1 + fs/dlm/midcomms.c | 6 ++- 3 files changed, 27 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d9001d40aa6e..e33bee1beba2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -65,7 +65,6 @@ /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 -#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) struct connection { struct socket *sock; /* NULL if not connected */ @@ -79,14 +78,11 @@ struct connection { #define CF_CLOSE 6 #define CF_APP_LIMITED 7 #define CF_CLOSING 8 -#define CF_SHUTDOWN 9 -#define CF_CONNECTED 10 -#define CF_RECONNECT 11 -#define CF_DELAY_CONNECT 12 -#define CF_EOF 13 +#define CF_CONNECTED 9 +#define CF_RECONNECT 10 +#define CF_DELAY_CONNECT 11 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; - atomic_t writequeue_cnt; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -94,7 +90,6 @@ struct connection { struct connection *sendcon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ - wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ unsigned char *rx_buf; int rx_buflen; int rx_leftover; @@ -157,10 +152,6 @@ struct dlm_proto_ops { int (*listen_validate)(void); void (*listen_sockopts)(struct socket *sock); int (*listen_bind)(struct socket *sock); - /* What to do to shutdown */ - void (*shutdown_action)(struct connection *con); - /* What to do to eof check */ - bool (*eof_condition)(struct connection *con); }; static struct listen_sock_callbacks { @@ -241,11 +232,6 @@ static struct connection *__find_con(int nodeid, int r) return NULL; } -static bool tcp_eof_condition(struct connection *con) -{ - return atomic_read(&con->writequeue_cnt); -} - static int dlm_con_init(struct connection *con, int nodeid) { con->rx_buflen = dlm_config.ci_buffer_size; @@ -257,10 +243,8 @@ static int dlm_con_init(struct connection *con, int nodeid) mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); - atomic_set(&con->writequeue_cnt, 0); INIT_WORK(&con->swork,
process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); - init_waitqueue_head(&con->shutdown_wait); return 0; } @@ -771,7 +755,6 @@ static void free_entry(struct writequeue_entry *e) } list_del(&e->list); - atomic_dec(&e->con->writequeue_cnt); kref_put(&e->ref, dlm_page_release); } @@ -834,56 +817,10 @@ static void close_connection(struct connection *con, bool and_other, clear_bit(CF_CONNECTED, &con->flags); clear_bit(CF_DELAY_CONNECT, &con->flags); clear_bit(CF_RECONNECT, &con->flags); - clear_bit(CF_EOF, &con->flags); mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); } -static void shutdown_connection(struct connection *con) -{ - int ret; - - flush_work(&con->swork); - - mutex_lock(&con->sock_mutex); - /* nothing to shutdown */ - if (!con->sock) { - mutex_unlock(&con->sock_mutex); - return; - } - - set_bit(CF_SHUTDOWN, &con->flags); - ret = kernel_sock_shutdown(con->sock, SHUT_WR); - mutex_unlock(&con->sock_mutex); - if (ret) { - log_print("Connection %p failed to shutdown: %d will force close", - con, ret); - goto force_close; - } else { - ret = wait_event_timeout(con->shutdown_wait, - !test_bit(CF_SHUTDOWN, &con->flags), - DLM_SHUTDOWN_WAIT_TIMEOUT); - if (ret == 0) { - log_print("Connection %p shutdown timed out, will force close", - con); - goto force_close; - } - } - - return; - -force_close: - clear_bit(CF_SHUTDOWN, &con->flags); - close_connection(con, false, true, true); -} - -static void dlm_tcp_shutdown(struct connection *con) -{ - if (con->othercon) - shutdown_connection(con->othercon); - shutdown_connection(con); -} - static int con_realloc_receive_buf(struct connection *con, int newlen) { unsigned char *newbuf; @@ -975,19 +912,8 @@ out_close: log_print("connection %p got EOF from %d", con, con->nodeid); - if (dlm_proto_ops->eof_condition && - dlm_proto_ops->eof_condition(con)) { - set_bit(CF_EOF, &con->flags); - mutex_unlock(&con->sock_mutex); - } else { - mutex_unlock(&con->sock_mutex); - close_connection(con, false, true, false); - - /* handling for tcp shutdown */ - clear_bit(CF_SHUTDOWN, &con->flags); - wake_up(&con->shutdown_wait); - } - + mutex_unlock(&con->sock_mutex); + close_connection(con, false, true, false); /* signal to breaking receive worker */ ret = -1; } else { @@ -1261,7 +1187,6 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; - atomic_inc(&con->writequeue_cnt); if (cb) cb(data); @@ -1467,20 +1392,6 @@ again: } spin_unlock(&con->writequeue_lock); - /* close if we got EOF */ - if (test_and_clear_bit(CF_EOF, &con->flags)) { - mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - - /* handling for tcp shutdown */ - clear_bit(CF_SHUTDOWN, &con->flags); - wake_up(&con->shutdown_wait); - } else { - mutex_unlock(&con->sock_mutex); - } - - return; - out: mutex_unlock(&con->sock_mutex); return; @@ -1680,16 +1591,8 @@ static int work_start(void) return 0; } -static void shutdown_conn(struct connection *con) -{ - if (dlm_proto_ops->shutdown_action) - dlm_proto_ops->shutdown_action(con); -} - void dlm_lowcomms_shutdown(void) { - int idx; - restore_callbacks(listen_con.sock); if (recv_workqueue) @@ -1698,9 +1601,25 @@ void dlm_lowcomms_shutdown(void) flush_workqueue(send_workqueue); dlm_close_sock(&listen_con.sock); +} + +void dlm_lowcomms_shutdown_node(int nodeid, bool force) +{ + struct connection *con; + int idx; idx = srcu_read_lock(&connections_srcu); - foreach_conn(shutdown_conn); + con = nodeid2con(nodeid, 0); + 
if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return; + } + + WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + close_connection(con, true, true, true); srcu_read_unlock(&connections_srcu, idx); } @@ -1912,8 +1831,6 @@ static const struct dlm_proto_ops dlm_tcp_ops = { .listen_validate = dlm_tcp_listen_validate, .listen_sockopts = dlm_tcp_listen_sockopts, .listen_bind = dlm_tcp_listen_bind, - .shutdown_action = dlm_tcp_shutdown, - .eof_condition = tcp_eof_condition, }; static int dlm_sctp_bind(struct socket *sock) diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index acaf1b0b3885..3e8dca66183b 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -34,6 +34,7 @@ bool dlm_lowcomms_is_running(void); int dlm_lowcomms_start(void); void dlm_lowcomms_shutdown(void); +void dlm_lowcomms_shutdown_node(int nodeid, bool force); void dlm_lowcomms_stop(void); void dlm_lowcomms_init(void); void dlm_lowcomms_exit(void); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 6b4c72fb0171..b0e8bdcaab1b 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1426,11 +1426,13 @@ static void midcomms_shutdown(struct midcomms_node *node) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); midcomms_node_reset(node); + dlm_lowcomms_shutdown_node(node->nodeid, true); return; } pr_debug("active shutdown done for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); + dlm_lowcomms_shutdown_node(node->nodeid, false); } void dlm_midcomms_shutdown(void) @@ -1438,6 +1440,8 @@ void dlm_midcomms_shutdown(void) struct midcomms_node *node; int i, idx; + dlm_lowcomms_shutdown(); + mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { @@ -1455,8 +1459,6 @@ void dlm_midcomms_shutdown(void) } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); - - dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) -- cgit From c3d88dfd15835f42c79467eee2c40db3095e5271 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:51 -0500 Subject: fs: dlm: cleanup listen sock handling This patch removes save_listen_callbacks() and add_listen_sock() as they are each used only once in the lowcomms functionality. For lowcomms shutdown it is not necessary to flush the whole workqueues to synchronize with restoring the old sk_data_ready() callback; only the listen connection's receive work needs to be cancelled. For each individual node shutdown we should make sure the last ack has been transmitted, which is done by flushing the per-connection swork worker. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 51 +++++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e33bee1beba2..8bf09c3a0946 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -647,17 +647,6 @@ out: orig_report(sk); } -/* Note: sk_callback_lock must be locked before calling this function.
*/ -static void save_listen_callbacks(struct socket *sock) -{ - struct sock *sk = sock->sk; - - listen_sock.sk_data_ready = sk->sk_data_ready; - listen_sock.sk_state_change = sk->sk_state_change; - listen_sock.sk_write_space = sk->sk_write_space; - listen_sock.sk_error_report = sk->sk_error_report; -} - static void restore_callbacks(struct socket *sock) { struct sock *sk = sock->sk; @@ -671,21 +660,6 @@ static void restore_callbacks(struct socket *sock) release_sock(sk); } -static void add_listen_sock(struct socket *sock, struct listen_connection *con) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - save_listen_callbacks(sock); - con->sock = sock; - - sk->sk_user_data = con; - sk->sk_allocation = GFP_NOFS; - /* Install a data_ready callback */ - sk->sk_data_ready = lowcomms_listen_data_ready; - release_sock(sk); -} - /* Make a socket active */ static void add_sock(struct socket *sock, struct connection *con) { @@ -1593,13 +1567,12 @@ static int work_start(void) void dlm_lowcomms_shutdown(void) { - restore_callbacks(listen_con.sock); - - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); + /* stop lowcomms_listen_data_ready calls */ + lock_sock(listen_con.sock->sk); + listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; + release_sock(listen_con.sock->sk); + cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); } @@ -1615,6 +1588,7 @@ void dlm_lowcomms_shutdown_node(int nodeid, bool force) return; } + flush_work(&con->swork); WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); clean_one_writequeue(con); if (con->othercon) @@ -1736,8 +1710,17 @@ static int dlm_listen_for_all(void) if (result < 0) goto out; - save_listen_callbacks(sock); - add_listen_sock(sock, &listen_con); + lock_sock(sock->sk); + listen_sock.sk_data_ready = sock->sk->sk_data_ready; + listen_sock.sk_write_space = sock->sk->sk_write_space; + listen_sock.sk_error_report = sock->sk->sk_error_report; + listen_sock.sk_state_change = sock->sk->sk_state_change; + + listen_con.sock = sock; + + sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_data_ready = lowcomms_listen_data_ready; + release_sock(sock->sk); result = sock->ops->listen(sock, 5); if (result < 0) { -- cgit From c51c9cd8addcfbdc097dbefd59f022402183644b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:52 -0500 Subject: fs: dlm: don't put dlm_local_addrs on heap This patch stops allocating the dlm_local_addr[] entries on the heap. Instead we directly store "struct sockaddr_storage" values in the array. This removes the function deinit_local(), which only freed that memory.
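The shape of the change, side by side (sketch):

    /* before: an array of heap pointers, each kmemdup()ed in init_local()
     * and kfree()d again in deinit_local() */
    static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];

    /* after: storage embedded by value; filling a slot is a memcpy() and
     * there is nothing left to free */
    static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];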
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 8bf09c3a0946..0883394cfbeb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -165,7 +165,7 @@ static LIST_HEAD(dlm_node_addrs); static DEFINE_SPINLOCK(dlm_node_addrs_spin); static struct listen_connection listen_con; -static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; +static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; /* Work queues */ @@ -383,7 +383,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, if (!sa_out) return 0; - if (dlm_local_addr[0]->ss_family == AF_INET) { + if (dlm_local_addr[0].ss_family == AF_INET) { struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; @@ -683,7 +683,7 @@ static void add_sock(struct socket *sock, struct connection *con) static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, int *addr_len) { - saddr->ss_family = dlm_local_addr[0]->ss_family; + saddr->ss_family = dlm_local_addr[0].ss_family; if (saddr->ss_family == AF_INET) { struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; in4_addr->sin_port = cpu_to_be16(port); @@ -1065,7 +1065,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port) int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { - memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr)); make_sockaddr(&localaddr, port, &addr_len); if (!i) @@ -1085,7 +1085,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port) /* Get local addresses */ static void init_local(void) { - struct sockaddr_storage sas, *addr; + struct sockaddr_storage sas; int i; dlm_local_count = 0; @@ -1093,21 +1093,10 @@ static void init_local(void) if (dlm_our_addr(&sas, i)) break; - addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS); - if (!addr) - break; - dlm_local_addr[dlm_local_count++] = addr; + memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas)); } } -static void deinit_local(void) -{ - int i; - - for (i = 0; i < dlm_local_count; i++) - kfree(dlm_local_addr[i]); -} - static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; @@ -1463,7 +1452,7 @@ static void dlm_connect(struct connection *con) } /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) goto socket_err; @@ -1679,7 +1668,6 @@ void dlm_lowcomms_stop(void) foreach_conn(free_conn); srcu_read_unlock(&connections_srcu, idx); work_stop(); - deinit_local(); dlm_proto_ops = NULL; } @@ -1696,7 +1684,7 @@ static int dlm_listen_for_all(void) if (result < 0) return result; - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { log_print("Can't create comms socket: %d", result); @@ -1743,7 +1731,7 @@ static int dlm_tcp_bind(struct socket *sock) /* Bind to our cluster-known address connecting to avoid * routing problems. 
*/ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, @@ -1800,8 +1788,8 @@ static int dlm_tcp_listen_bind(struct socket *sock) int addr_len; /* Bind to our port */ - make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); - return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0], + make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); + return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0], addr_len); } @@ -1891,7 +1879,7 @@ int dlm_lowcomms_start(void) error = work_start(); if (error) - goto fail_local; + goto fail; /* Start listening */ switch (dlm_config.ci_protocol) { @@ -1918,8 +1906,6 @@ fail_listen: dlm_proto_ops = NULL; fail_proto_ops: work_stop(); -fail_local: - deinit_local(); fail: return error; } -- cgit From 6f0b0b5d7ae70423c94915989c45b29e87c61ad7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:53 -0500 Subject: fs: dlm: remove dlm_node_addrs lookup list This patch merges the dlm_node_addrs lookup list into the connection structure. It is a per-node mapping to configuration set up by configfs, and we don't need two lookup structures for it. The connection hash now has the same lifetime the dlm_node_addrs entries had: we add new entries only when the cluster is configured, not while new connections are coming in; we remove a connection when a node gets fenced; and we clean up all connections when dlm exits. It should work the same, and may even surface more issues, because we no longer try to somehow keep those two data structures in sync with the current cluster configuration. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 290 +++++++++++++++++++++++++----------------------------- 1 file changed, 136 insertions(+), 154 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0883394cfbeb..ed3cd3757199 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -93,6 +93,11 @@ struct connection { unsigned char *rx_buf; int rx_buflen; int rx_leftover; + int mark; + int addr_count; + int curr_addr_index; + struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT]; + spinlock_t addrs_lock; struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -131,15 +136,6 @@ struct dlm_msg { struct kref ref; }; -struct dlm_node_addr { - struct list_head list; - int nodeid; - int mark; - int addr_count; - int curr_addr_index; - struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; -}; - struct dlm_proto_ops { bool try_new_addr; const char *name; @@ -161,9 +157,6 @@ static struct listen_sock_callbacks { void (*sk_write_space)(struct sock *); } listen_sock; -static LIST_HEAD(dlm_node_addrs); -static DEFINE_SPINLOCK(dlm_node_addrs_spin); - static struct listen_connection listen_con; static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; @@ -306,17 +299,6 @@ static void foreach_conn(void (*conn_func)(struct connection *c)) } } -static struct dlm_node_addr *find_node_addr(int nodeid) -{ - struct dlm_node_addr *na; - - list_for_each_entry(na, &dlm_node_addrs, list) { - if (na->nodeid == nodeid) - return na; - } - return NULL; -} - static int addr_compare(const struct sockaddr_storage *x, const struct sockaddr_storage *y) { @@ -350,38 +332,45 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, unsigned int *mark) { struct sockaddr_storage
sas; - struct dlm_node_addr *na; + struct connection *con; + int idx; if (!dlm_local_count) return -1; - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (na && na->addr_count) { - memcpy(&sas, na->addr[na->curr_addr_index], - sizeof(struct sockaddr_storage)); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } - if (try_new_addr) { - na->curr_addr_index++; - if (na->curr_addr_index == na->addr_count) - na->curr_addr_index = 0; - } + spin_lock(&con->addrs_lock); + if (!con->addr_count) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; } - spin_unlock(&dlm_node_addrs_spin); - if (!na) - return -EEXIST; + memcpy(&sas, &con->addr[con->curr_addr_index], + sizeof(struct sockaddr_storage)); - if (!na->addr_count) - return -ENOENT; + if (try_new_addr) { + con->curr_addr_index++; + if (con->curr_addr_index == con->addr_count) + con->curr_addr_index = 0; + } - *mark = na->mark; + *mark = con->mark; + spin_unlock(&con->addrs_lock); if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); - if (!sa_out) + if (!sa_out) { + srcu_read_unlock(&connections_srcu, idx); return 0; + } if (dlm_local_addr[0].ss_family == AF_INET) { struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; @@ -393,43 +382,46 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, ret6->sin6_addr = in6->sin6_addr; } + srcu_read_unlock(&connections_srcu, idx); return 0; } static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, unsigned int *mark) { - struct dlm_node_addr *na; - int rv = -EEXIST; - int addr_i; - - spin_lock(&dlm_node_addrs_spin); - list_for_each_entry(na, &dlm_node_addrs, list) { - if (!na->addr_count) - continue; - - for (addr_i = 0; addr_i < na->addr_count; addr_i++) { - if (addr_compare(na->addr[addr_i], addr)) { - *nodeid = na->nodeid; - *mark = na->mark; - rv = 0; - goto unlock; + struct connection *con; + int i, idx, addr_i; + + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + WARN_ON_ONCE(!con->addr_count); + + spin_lock(&con->addrs_lock); + for (addr_i = 0; addr_i < con->addr_count; addr_i++) { + if (addr_compare(&con->addr[addr_i], addr)) { + *nodeid = con->nodeid; + *mark = con->mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return 0; + } } + spin_unlock(&con->addrs_lock); } } -unlock: - spin_unlock(&dlm_node_addrs_spin); - return rv; + srcu_read_unlock(&connections_srcu, idx); + + return -ENOENT; } -/* caller need to held dlm_node_addrs_spin lock */ -static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na, - const struct sockaddr_storage *addr) +static bool dlm_lowcomms_con_has_addr(const struct connection *con, + const struct sockaddr_storage *addr) { int i; - for (i = 0; i < na->addr_count; i++) { - if (addr_compare(na->addr[i], addr)) + for (i = 0; i < con->addr_count; i++) { + if (addr_compare(&con->addr[i], addr)) return true; } @@ -438,52 +430,42 @@ static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na, int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) { - struct sockaddr_storage *new_addr; - struct dlm_node_addr *new_node, *na; - bool ret; - - new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS); - if (!new_node) - return -ENOMEM; + struct connection *con; + bool ret, idx; - new_addr = 
kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS); - if (!new_addr) { - kfree(new_node); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, GFP_NOFS); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOMEM; } - memcpy(new_addr, addr, len); - - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (!na) { - new_node->nodeid = nodeid; - new_node->addr[0] = new_addr; - new_node->addr_count = 1; - new_node->mark = dlm_config.ci_mark; - list_add(&new_node->list, &dlm_node_addrs); - spin_unlock(&dlm_node_addrs_spin); + spin_lock(&con->addrs_lock); + if (!con->addr_count) { + memcpy(&con->addr[0], addr, sizeof(*addr)); + con->addr_count = 1; + con->mark = dlm_config.ci_mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return 0; } - ret = dlm_lowcomms_na_has_addr(na, addr); + ret = dlm_lowcomms_con_has_addr(con, addr); if (ret) { - spin_unlock(&dlm_node_addrs_spin); - kfree(new_addr); - kfree(new_node); + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return -EEXIST; } - if (na->addr_count >= DLM_MAX_ADDR_COUNT) { - spin_unlock(&dlm_node_addrs_spin); - kfree(new_addr); - kfree(new_node); + if (con->addr_count >= DLM_MAX_ADDR_COUNT) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return -ENOSPC; } - na->addr[na->addr_count++] = new_addr; - spin_unlock(&dlm_node_addrs_spin); - kfree(new_node); + memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr)); + srcu_read_unlock(&connections_srcu, idx); + spin_unlock(&con->addrs_lock); return 0; } @@ -558,10 +540,10 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, GFP_NOFS); - if (!con) { + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); - return -ENOMEM; + return -ENOENT; } lowcomms_connect_sock(con); @@ -572,18 +554,20 @@ int dlm_lowcomms_connect_node(int nodeid) int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) { - struct dlm_node_addr *na; + struct connection *con; + int idx; - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (!na) { - spin_unlock(&dlm_node_addrs_spin); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } - na->mark = mark; - spin_unlock(&dlm_node_addrs_spin); - + spin_lock(&con->addrs_lock); + con->mark = mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return 0; } @@ -960,10 +944,10 @@ static int accept_from_sock(struct listen_connection *con) * In this case we store the incoming one in "othercon" */ idx = srcu_read_lock(&connections_srcu); - newcon = nodeid2con(nodeid, GFP_NOFS); - if (!newcon) { + newcon = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!newcon)) { srcu_read_unlock(&connections_srcu, idx); - result = -ENOMEM; + result = -ENOENT; goto accept_err; } @@ -1210,8 +1194,8 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, } idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, allocation); - if (!con) { + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); return NULL; } @@ -1376,35 +1360,44 @@ static void clean_one_writequeue(struct connection *con) spin_unlock(&con->writequeue_lock); } +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); 
+ + kfree(con->rx_buf); + kfree(con); +} + /* Called from recovery when it knows that a node has left the cluster */ int dlm_lowcomms_close(int nodeid) { struct connection *con; - struct dlm_node_addr *na; int idx; log_print("closing connection to node %d", nodeid); + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); - if (con) { - set_bit(CF_CLOSE, &con->flags); - close_connection(con, true, true, true); - clean_one_writequeue(con); - if (con->othercon) - clean_one_writequeue(con->othercon); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; } - srcu_read_unlock(&connections_srcu, idx); - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (na) { - list_del(&na->list); - while (na->addr_count--) - kfree(na->addr[na->addr_count]); - kfree(na); + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + close_connection(con, true, true, true); + + clean_one_writequeue(con); + call_srcu(&connections_srcu, &con->rcu, connection_release); + if (con->othercon) { + clean_one_writequeue(con->othercon); + if (con->othercon) + call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); } - spin_unlock(&dlm_node_addrs_spin); + srcu_read_unlock(&connections_srcu, idx); return 0; } @@ -1607,27 +1600,9 @@ static void stop_conn(struct connection *con) _stop_conn(con, true); } -static void connection_release(struct rcu_head *rcu) -{ - struct connection *con = container_of(rcu, struct connection, rcu); - - kfree(con->rx_buf); - kfree(con); -} - static void free_conn(struct connection *con) { close_connection(con, true, true, true); - spin_lock(&connections_lock); - hlist_del_rcu(&con->list); - spin_unlock(&connections_lock); - if (con->othercon) { - clean_one_writequeue(con->othercon); - call_srcu(&connections_srcu, &con->othercon->rcu, - connection_release); - } - clean_one_writequeue(con); - call_srcu(&connections_srcu, &con->rcu, connection_release); } static void work_flush(void) @@ -1922,14 +1897,21 @@ void dlm_lowcomms_init(void) void dlm_lowcomms_exit(void) { - struct dlm_node_addr *na, *safe; + struct connection *con; + int i, idx; - spin_lock(&dlm_node_addrs_spin); - list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) { - list_del(&na->list); - while (na->addr_count--) - kfree(na->addr[na->addr_count]); - kfree(na); + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + if (con->othercon) + call_srcu(&connections_srcu, &con->othercon->rcu, + connection_release); + call_srcu(&connections_srcu, &con->rcu, connection_release); + } } - spin_unlock(&dlm_node_addrs_spin); + srcu_read_unlock(&connections_srcu, idx); } -- cgit From e9dd5fd849f1ac125919a79eb4be66f078dd7f51 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:54 -0500 Subject: fs: dlm: use sock2con without checking null This patch removes the null checks on the sockets' private data. If we hit a null dereference there, it indicates a bug in our implementation: such a callback must never occur in this state.
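For illustration, a minimal sketch of the invariant this commit relies on (a sketch under stated assumptions, not the kernel source): add_sock() installs the socket callbacks only after pointing sk_user_data at the connection, and restore_callbacks() removes the callbacks and clears the pointer together, both under lock_sock(), so a running callback can assume sock2con() never returns NULL:

/* Illustrative sketch only: recover the connection from the socket's
 * private data. Because callback installation and teardown happen
 * under lock_sock() together with sk_user_data updates, a callback
 * that is executing always sees a valid pointer.
 */
static struct connection *sock2con(struct sock *sk)
{
	return sk->sk_user_data;
}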
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index ed3cd3757199..677e31144ca0 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -472,10 +472,9 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk) { - struct connection *con; + struct connection *con = sock2con(sk); - con = sock2con(sk); - if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); } @@ -486,11 +485,7 @@ static void lowcomms_listen_data_ready(struct sock *sk) static void lowcomms_write_space(struct sock *sk) { - struct connection *con; - - con = sock2con(sk); - if (!con) - return; + struct connection *con = sock2con(sk); if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { log_print("connected to node %d", con->nodeid); @@ -573,14 +568,10 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) static void lowcomms_error_report(struct sock *sk) { - struct connection *con; + struct connection *con = sock2con(sk); void (*orig_report)(struct sock *) = NULL; struct inet_sock *inet; - con = sock2con(sk); - if (con == NULL) - goto out; - orig_report = listen_sock.sk_error_report; inet = inet_sk(sk); -- cgit From c852a6d70698d5e4a9f3670ecabf9656ff00c73c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:55 -0500 Subject: fs: dlm: use saved sk_error_report() This patch changes how the original sk_error_report() is called: instead of saving it to a stack variable and invoking it later, we call the saved listen_sock.sk_error_report() directly. If listen_sock.sk_error_report() is NULL at this point, it indicates a bug in our implementation. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 677e31144ca0..643f9810ec35 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -569,11 +569,8 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) static void lowcomms_error_report(struct sock *sk) { struct connection *con = sock2con(sk); - void (*orig_report)(struct sock *) = NULL; struct inet_sock *inet; - orig_report = listen_sock.sk_error_report; - inet = inet_sk(sk); switch (sk->sk_family) { case AF_INET: @@ -618,8 +615,7 @@ static void lowcomms_error_report(struct sock *sk) queue_work(send_workqueue, &con->swork); out: - if (orig_report) - orig_report(sk); + listen_sock.sk_error_report(sk); } static void restore_callbacks(struct socket *sock) -- cgit From 1351975ac1377225cef5d858971e17252c06ff51 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:56 -0500 Subject: fs: dlm: don't init error value This patch removes an unnecessary initialization of an error value to -EINVAL.
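This is the classic dead-store cleanup; a hedged illustration of the shape (hypothetical helper names, not the dlm code): the initializer is dead because the variable is assigned on every path before it is read.

int do_setup(void);	/* hypothetical helpers, for illustration only */
int do_run(void);

int example_start(void)
{
	int error;	/* previously 'int error = -EINVAL;' -- a dead store */

	error = do_setup();	/* always assigned before first use */
	if (error)
		return error;

	return do_run();
}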
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 643f9810ec35..c6b91f67a2c2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1830,7 +1830,7 @@ static const struct dlm_proto_ops dlm_sctp_ops = { int dlm_lowcomms_start(void) { - int error = -EINVAL; + int error; init_local(); if (!dlm_local_count) { -- cgit From dbb751ffab0b764720e360efd642ba6bf076d87f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:57 -0500 Subject: fs: dlm: parallelize lowcomms socket handling This patch reworks the lowcomms handling; the main goal is to let recvmsg() and sendpage() run in parallel. Parallel in two senses: 1. per connection and 2. so that recvmsg() and sendpage() don't block each other. Currently recvmsg()/sendpage() cannot run in parallel because the two workqueues "dlm_recv" and "dlm_send" are ordered workqueues, meaning only one work item can execute at a time, while the number of queued items grows with the number of nodes in the cluster. The two workqueues for sending and receiving can also block each other when the same connection is handled at the same time in dlm_recv and dlm_send, because of the per-connection mutex around the socket handling. To make this more parallel we introduce one "dlm_io" workqueue which is not an ordered workqueue and does not limit the number of workers. Via the per-connection SEND/RECV pending flags we still schedule work ordered per connection and per send/receive task. To get rid of the mutex that blocks workers doing socket handling, we switch to a semaphore that takes socket operations as read locks and socket releases as write locks, to prevent sock_release() from being called while the socket is in use. There may be further optimization in removing the semaphore and replacing it with another synchronization mechanism, but due to other circumstances, e.g. the othercon behaviour, that change seems complicated to do now. I added comments about removing the othercon handling and moving to a different synchronization mechanism once that is possible. We need to defer that to the next dlm major version upgrade because it is not backwards compatible with the current connect mechanism. The processing of dlm messages still needs to be handled by an ordered workqueue. A "dlm_process" ordered workqueue was introduced, which is filled by the receive worker. This is probably the next bottleneck of DLM, but the application can't currently parse dlm messages in parallel. A comment was added about lifting dlm message processing into a non-sleepable softirq context to get message processing done fast. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1024 +++++++++++++++++++++++++++++------------------------ fs/dlm/midcomms.c | 45 ++- fs/dlm/midcomms.h | 1 + 3 files changed, 586 insertions(+), 484 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index c6b91f67a2c2..799d1c36eabf 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -63,35 +63,43 @@ #define NEEDED_RMEM (4*1024*1024) -/* Number of messages to send before rescheduling */ -#define MAX_SEND_MSG_COUNT 25 - struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ - struct mutex sock_mutex; + /* this semaphore is used to allow parallel recv/send in read + * lock mode.
When we release a sock we need to held the write lock. + * + * However this is locking code and not nice. When we remove the + * othercon handling we can look into other mechanism to synchronize + * io handling to call sock_release() at the right time. + */ + struct rw_semaphore sock_lock; unsigned long flags; -#define CF_READ_PENDING 1 -#define CF_WRITE_PENDING 2 -#define CF_INIT_PENDING 4 +#define CF_APP_LIMITED 0 +#define CF_RECV_PENDING 1 +#define CF_SEND_PENDING 2 +#define CF_RECV_INTR 3 +#define CF_IO_STOP 4 #define CF_IS_OTHERCON 5 -#define CF_CLOSE 6 -#define CF_APP_LIMITED 7 -#define CF_CLOSING 8 -#define CF_CONNECTED 9 -#define CF_RECONNECT 10 -#define CF_DELAY_CONNECT 11 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int retries; -#define MAX_CONNECT_RETRIES 3 struct hlist_node list; + /* due some connect()/accept() races we currently have this cross over + * connection attempt second connection for one node. + * + * There is a solution to avoid the race by introducing a connect + * rule as e.g. our_nodeid > nodeid_to_connect who is allowed to + * connect. Otherside can connect but will only be considered that + * the other side wants to have a reconnect. + * + * However changing to this behaviour will break backwards compatible. + * In a DLM protocol major version upgrade we should remove this! + */ struct connection *othercon; - struct connection *sendcon; - struct work_struct rwork; /* Receive workqueue */ - struct work_struct swork; /* Send workqueue */ - unsigned char *rx_buf; - int rx_buflen; + struct work_struct rwork; /* receive worker */ + struct work_struct swork; /* send worker */ + unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; int mark; int addr_count; @@ -136,6 +144,14 @@ struct dlm_msg { struct kref ref; }; +struct processqueue_entry { + unsigned char *buf; + int nodeid; + int buflen; + + struct list_head list; +}; + struct dlm_proto_ops { bool try_new_addr; const char *name; @@ -162,8 +178,8 @@ static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; /* Work queues */ -static struct workqueue_struct *recv_workqueue; -static struct workqueue_struct *send_workqueue; +static struct workqueue_struct *io_workqueue; +static struct workqueue_struct *process_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; static DEFINE_SPINLOCK(connections_lock); @@ -171,14 +187,44 @@ DEFINE_STATIC_SRCU(connections_srcu); static const struct dlm_proto_ops *dlm_proto_ops; +#define DLM_IO_SUCCESS 0 +#define DLM_IO_END 1 +#define DLM_IO_EOF 2 +#define DLM_IO_RESCHED 3 + static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +static void process_dlm_messages(struct work_struct *work); + +static DECLARE_WORK(process_work, process_dlm_messages); +static DEFINE_SPINLOCK(processqueue_lock); +static bool process_dlm_messages_pending; +static LIST_HEAD(processqueue); bool dlm_lowcomms_is_running(void) { return !!listen_con.sock; } +static void lowcomms_queue_swork(struct connection *con) +{ + WARN_ON_ONCE(!lockdep_is_held(&con->writequeue_lock)); + + if (!test_bit(CF_IO_STOP, &con->flags) && + !test_bit(CF_APP_LIMITED, &con->flags) && + !test_and_set_bit(CF_SEND_PENDING, &con->flags)) + queue_work(io_workqueue, &con->swork); +} + +static void lowcomms_queue_rwork(struct connection *con) +{ + WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk)); + + if (!test_bit(CF_IO_STOP, &con->flags) && + 
!test_and_set_bit(CF_RECV_PENDING, &con->flags)) + queue_work(io_workqueue, &con->rwork); +} + static void writequeue_entry_ctor(void *data) { struct writequeue_entry *entry = data; @@ -225,21 +271,15 @@ static struct connection *__find_con(int nodeid, int r) return NULL; } -static int dlm_con_init(struct connection *con, int nodeid) +static void dlm_con_init(struct connection *con, int nodeid) { - con->rx_buflen = dlm_config.ci_buffer_size; - con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); - if (!con->rx_buf) - return -ENOMEM; - con->nodeid = nodeid; - mutex_init(&con->sock_mutex); + init_rwsem(&con->sock_lock); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); - - return 0; + spin_lock_init(&con->addrs_lock); } /* @@ -249,7 +289,7 @@ static int dlm_con_init(struct connection *con, int nodeid) static struct connection *nodeid2con(int nodeid, gfp_t alloc) { struct connection *con, *tmp; - int r, ret; + int r; r = nodeid_hash(nodeid); con = __find_con(nodeid, r); @@ -260,11 +300,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) if (!con) return NULL; - ret = dlm_con_init(con, nodeid); - if (ret) { - kfree(con); - return NULL; - } + dlm_con_init(con, nodeid); spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can @@ -276,7 +312,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); - kfree(con->rx_buf); kfree(con); return tmp; } @@ -287,18 +322,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return con; } -/* Loop round all connections */ -static void foreach_conn(void (*conn_func)(struct connection *c)) -{ - int i; - struct connection *con; - - for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(con, &connection_hash[i], list) - conn_func(con); - } -} - static int addr_compare(const struct sockaddr_storage *x, const struct sockaddr_storage *y) { @@ -474,56 +497,38 @@ static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); - if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) - queue_work(recv_workqueue, &con->rwork); -} - -static void lowcomms_listen_data_ready(struct sock *sk) -{ - queue_work(recv_workqueue, &listen_con.rwork); + set_bit(CF_RECV_INTR, &con->flags); + lowcomms_queue_rwork(con); } static void lowcomms_write_space(struct sock *sk) { struct connection *con = sock2con(sk); - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { - log_print("connected to node %d", con->nodeid); - queue_work(send_workqueue, &con->swork); - return; - } - clear_bit(SOCK_NOSPACE, &con->sock->flags); + spin_lock_bh(&con->writequeue_lock); if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { con->sock->sk->sk_write_pending--; clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } - queue_work(send_workqueue, &con->swork); -} - -static inline void lowcomms_connect_sock(struct connection *con) -{ - if (test_bit(CF_CLOSE, &con->flags)) - return; - queue_work(send_workqueue, &con->swork); - cond_resched(); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); } static void lowcomms_state_change(struct sock *sk) { /* SCTP layer is not calling sk_data_ready when the connection - * is done, so we catch the signal through here. Also, it - * doesn't switch socket state when entering shutdown, so we - * skip the write in that case. 
+ * is done, so we catch the signal through here. */ - if (sk->sk_shutdown) { - if (sk->sk_shutdown == RCV_SHUTDOWN) - lowcomms_data_ready(sk); - } else if (sk->sk_state == TCP_ESTABLISHED) { - lowcomms_write_space(sk); - } + if (sk->sk_shutdown == RCV_SHUTDOWN) + lowcomms_data_ready(sk); +} + +static void lowcomms_listen_data_ready(struct sock *sk) +{ + queue_work(io_workqueue, &listen_con.rwork); } int dlm_lowcomms_connect_node(int nodeid) @@ -541,9 +546,16 @@ int dlm_lowcomms_connect_node(int nodeid) return -ENOENT; } - lowcomms_connect_sock(con); + down_read(&con->sock_lock); + if (!con->sock) { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + up_read(&con->sock_lock); srcu_read_unlock(&connections_srcu, idx); + cond_resched(); return 0; } @@ -596,39 +608,23 @@ static void lowcomms_error_report(struct sock *sk) "invalid socket family %d set, " "sk_err=%d/%d\n", dlm_our_nodeid(), sk->sk_family, sk->sk_err, sk->sk_err_soft); - goto out; - } - - /* below sendcon only handling */ - if (test_bit(CF_IS_OTHERCON, &con->flags)) - con = con->sendcon; - - switch (sk->sk_err) { - case ECONNREFUSED: - set_bit(CF_DELAY_CONNECT, &con->flags); - break; - default: break; } - if (!test_and_set_bit(CF_RECONNECT, &con->flags)) - queue_work(send_workqueue, &con->swork); + dlm_midcomms_unack_msg_resend(con->nodeid); -out: listen_sock.sk_error_report(sk); } -static void restore_callbacks(struct socket *sock) +static void restore_callbacks(struct sock *sk) { - struct sock *sk = sock->sk; + WARN_ON_ONCE(!lockdep_sock_is_held(sk)); - lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = listen_sock.sk_state_change; sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; - release_sock(sk); } /* Make a socket active */ @@ -640,10 +636,10 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock = sock; sk->sk_user_data = con; - /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; - sk->sk_state_change = lowcomms_state_change; + if (dlm_config.ci_protocol == DLM_PROTO_SCTP) + sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; sk->sk_error_report = lowcomms_error_report; release_sock(sk); @@ -705,37 +701,62 @@ static void free_entry(struct writequeue_entry *e) static void dlm_close_sock(struct socket **sock) { - if (*sock) { - restore_callbacks(*sock); - sock_release(*sock); - *sock = NULL; + lock_sock((*sock)->sk); + restore_callbacks((*sock)->sk); + release_sock((*sock)->sk); + + sock_release(*sock); + *sock = NULL; +} + +static void allow_connection_io(struct connection *con) +{ + if (con->othercon) + clear_bit(CF_IO_STOP, &con->othercon->flags); + clear_bit(CF_IO_STOP, &con->flags); +} + +static void stop_connection_io(struct connection *con) +{ + if (con->othercon) + stop_connection_io(con->othercon); + + down_write(&con->sock_lock); + if (con->sock) { + lock_sock(con->sock->sk); + restore_callbacks(con->sock->sk); + + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + } else { + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); } + up_write(&con->sock_lock); + + cancel_work_sync(&con->swork); + cancel_work_sync(&con->rwork); } /* Close a remote connection and tidy up */ -static 
void close_connection(struct connection *con, bool and_other, - bool tx, bool rx) +static void close_connection(struct connection *con, bool and_other) { - bool closing = test_and_set_bit(CF_CLOSING, &con->flags); struct writequeue_entry *e; - if (tx && !closing && cancel_work_sync(&con->swork)) { - log_print("canceled swork for node %d", con->nodeid); - clear_bit(CF_WRITE_PENDING, &con->flags); - } - if (rx && !closing && cancel_work_sync(&con->rwork)) { - log_print("canceled rwork for node %d", con->nodeid); - clear_bit(CF_READ_PENDING, &con->flags); + if (con->othercon && and_other) + close_connection(con->othercon, false); + + down_write(&con->sock_lock); + if (!con->sock) { + up_write(&con->sock_lock); + return; } - mutex_lock(&con->sock_mutex); dlm_close_sock(&con->sock); - if (con->othercon && and_other) { - /* Will only re-enter once. */ - close_connection(con->othercon, false, tx, rx); - } - /* if we send a writequeue entry only a half way, we drop the * whole entry because reconnection and that we not start of the * middle of a msg which will confuse the other end. @@ -747,143 +768,209 @@ static void close_connection(struct connection *con, bool and_other, * our policy is to start on a clean state when disconnects, we don't * know what's send/received on transport layer in this case. */ - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_first_entry(&con->writequeue, struct writequeue_entry, list); if (e->dirty) free_entry(e); } - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); con->rx_leftover = 0; con->retries = 0; clear_bit(CF_APP_LIMITED, &con->flags); - clear_bit(CF_CONNECTED, &con->flags); - clear_bit(CF_DELAY_CONNECT, &con->flags); - clear_bit(CF_RECONNECT, &con->flags); - mutex_unlock(&con->sock_mutex); - clear_bit(CF_CLOSING, &con->flags); + clear_bit(CF_RECV_PENDING, &con->flags); + clear_bit(CF_SEND_PENDING, &con->flags); + up_write(&con->sock_lock); } -static int con_realloc_receive_buf(struct connection *con, int newlen) +static struct processqueue_entry *new_processqueue_entry(int nodeid, + int buflen) { - unsigned char *newbuf; - - newbuf = kmalloc(newlen, GFP_NOFS); - if (!newbuf) - return -ENOMEM; + struct processqueue_entry *pentry; - /* copy any leftover from last receive */ - if (con->rx_leftover) - memmove(newbuf, con->rx_buf, con->rx_leftover); + pentry = kmalloc(sizeof(*pentry), GFP_NOFS); + if (!pentry) + return NULL; - /* swap to new buffer space */ - kfree(con->rx_buf); - con->rx_buflen = newlen; - con->rx_buf = newbuf; + pentry->buf = kmalloc(buflen, GFP_NOFS); + if (!pentry->buf) { + kfree(pentry); + return NULL; + } - return 0; + pentry->nodeid = nodeid; + return pentry; } -/* Data received from remote end */ -static int receive_from_sock(struct connection *con) +static void free_processqueue_entry(struct processqueue_entry *pentry) { - struct msghdr msg; - struct kvec iov; - int ret, buflen; + kfree(pentry->buf); + kfree(pentry); +} + +struct dlm_processed_nodes { + int nodeid; - mutex_lock(&con->sock_mutex); + struct list_head list; +}; - if (con->sock == NULL) { - ret = -EAGAIN; - goto out_close; +static void add_processed_node(int nodeid, struct list_head *processed_nodes) +{ + struct dlm_processed_nodes *n; + + list_for_each_entry(n, processed_nodes, list) { + /* we already remembered this node */ + if (n->nodeid == nodeid) + return; } - /* realloc if we get new buffer size to read out */ - buflen = dlm_config.ci_buffer_size; - if (con->rx_buflen != 
buflen && con->rx_leftover <= buflen) { - ret = con_realloc_receive_buf(con, buflen); - if (ret < 0) - goto out_resched; + /* if it's fails in worst case we simple don't send an ack back. + * We try it next time. + */ + n = kmalloc(sizeof(*n), GFP_NOFS); + if (!n) + return; + + n->nodeid = nodeid; + list_add(&n->list, processed_nodes); +} + +static void process_dlm_messages(struct work_struct *work) +{ + struct dlm_processed_nodes *n, *n_tmp; + struct processqueue_entry *pentry; + LIST_HEAD(processed_nodes); + + spin_lock(&processqueue_lock); + pentry = list_first_entry_or_null(&processqueue, + struct processqueue_entry, list); + if (WARN_ON_ONCE(!pentry)) { + spin_unlock(&processqueue_lock); + return; } + list_del(&pentry->list); + spin_unlock(&processqueue_lock); + for (;;) { - /* calculate new buffer parameter regarding last receive and - * possible leftover bytes - */ - iov.iov_base = con->rx_buf + con->rx_leftover; - iov.iov_len = con->rx_buflen - con->rx_leftover; - - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, - msg.msg_flags); - trace_dlm_recv(con->nodeid, ret); - if (ret == -EAGAIN) + dlm_process_incoming_buffer(pentry->nodeid, pentry->buf, + pentry->buflen); + add_processed_node(pentry->nodeid, &processed_nodes); + free_processqueue_entry(pentry); + + spin_lock(&processqueue_lock); + pentry = list_first_entry_or_null(&processqueue, + struct processqueue_entry, list); + if (!pentry) { + process_dlm_messages_pending = false; + spin_unlock(&processqueue_lock); break; - else if (ret <= 0) - goto out_close; - - /* new buflen according readed bytes and leftover from last receive */ - buflen = ret + con->rx_leftover; - ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); - if (ret < 0) - goto out_close; - - /* calculate leftover bytes from process and put it into begin of - * the receive buffer, so next receive we have the full message - * at the start address of the receive buffer. 
- */ - con->rx_leftover = buflen - ret; - if (con->rx_leftover) { - memmove(con->rx_buf, con->rx_buf + ret, - con->rx_leftover); } + + list_del(&pentry->list); + spin_unlock(&processqueue_lock); + } + + /* send ack back after we processed couple of messages */ + list_for_each_entry_safe(n, n_tmp, &processed_nodes, list) { + list_del(&n->list); + dlm_midcomms_receive_done(n->nodeid); + kfree(n); + } +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con, int buflen) +{ + struct processqueue_entry *pentry; + int ret, buflen_real; + struct msghdr msg; + struct kvec iov; + + pentry = new_processqueue_entry(con->nodeid, buflen); + if (!pentry) + return DLM_IO_RESCHED; + + memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover); + + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes + */ + iov.iov_base = pentry->buf + con->rx_leftover; + iov.iov_len = buflen - con->rx_leftover; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + clear_bit(CF_RECV_INTR, &con->flags); +again: + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); + trace_dlm_recv(con->nodeid, ret); + if (ret == -EAGAIN) { + lock_sock(con->sock->sk); + if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) { + release_sock(con->sock->sk); + goto again; + } + + clear_bit(CF_RECV_PENDING, &con->flags); + release_sock(con->sock->sk); + free_processqueue_entry(pentry); + return DLM_IO_END; + } else if (ret == 0) { + /* close will clear CF_RECV_PENDING */ + free_processqueue_entry(pentry); + return DLM_IO_EOF; + } else if (ret < 0) { + free_processqueue_entry(pentry); + return ret; } - dlm_midcomms_receive_done(con->nodeid); - mutex_unlock(&con->sock_mutex); - return 0; + /* new buflen according readed bytes and leftover from last receive */ + buflen_real = ret + con->rx_leftover; + ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf, + buflen_real); + if (ret < 0) { + free_processqueue_entry(pentry); + return ret; + } -out_resched: - if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) - queue_work(recv_workqueue, &con->rwork); - mutex_unlock(&con->sock_mutex); - return -EAGAIN; - -out_close: - if (ret == 0) { - log_print("connection %p got EOF from %d", - con, con->nodeid); - - mutex_unlock(&con->sock_mutex); - close_connection(con, false, true, false); - /* signal to breaking receive worker */ - ret = -1; - } else { - mutex_unlock(&con->sock_mutex); + pentry->buflen = ret; + + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. 
+ */ + con->rx_leftover = buflen_real - ret; + memmove(con->rx_leftover_buf, pentry->buf + ret, + con->rx_leftover); + + spin_lock(&processqueue_lock); + list_add_tail(&pentry->list, &processqueue); + if (!process_dlm_messages_pending) { + process_dlm_messages_pending = true; + queue_work(process_workqueue, &process_work); } - return ret; + spin_unlock(&processqueue_lock); + + return DLM_IO_SUCCESS; } /* Listening socket is busy, accept a connection */ -static int accept_from_sock(struct listen_connection *con) +static int accept_from_sock(void) { - int result; struct sockaddr_storage peeraddr; - struct socket *newsock; - int len, idx; - int nodeid; + int len, idx, result, nodeid; struct connection *newcon; - struct connection *addcon; + struct socket *newsock; unsigned int mark; - if (!con->sock) - return -ENOTCONN; - - result = kernel_accept(con->sock, &newsock, O_NONBLOCK); - if (result < 0) + result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK); + if (result == -EAGAIN) + return DLM_IO_END; + else if (result < 0) goto accept_err; /* Get the connected socket's peer */ @@ -940,7 +1027,7 @@ static int accept_from_sock(struct listen_connection *con) sock_set_mark(newsock->sk, mark); - mutex_lock(&newcon->sock_mutex); + down_write(&newcon->sock_lock); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -948,63 +1035,50 @@ static int accept_from_sock(struct listen_connection *con) othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); - mutex_unlock(&newcon->sock_mutex); + up_write(&newcon->sock_lock); srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } - result = dlm_con_init(othercon, nodeid); - if (result < 0) { - kfree(othercon); - mutex_unlock(&newcon->sock_mutex); - srcu_read_unlock(&connections_srcu, idx); - goto accept_err; - } - - lockdep_set_subclass(&othercon->sock_mutex, 1); - set_bit(CF_IS_OTHERCON, &othercon->flags); + dlm_con_init(othercon, nodeid); + lockdep_set_subclass(&othercon->sock_lock, 1); newcon->othercon = othercon; - othercon->sendcon = newcon; + set_bit(CF_IS_OTHERCON, &othercon->flags); } else { /* close other sock con if we have something new */ - close_connection(othercon, false, true, false); + close_connection(othercon, false); } - mutex_lock(&othercon->sock_mutex); + down_write(&othercon->sock_lock); add_sock(newsock, othercon); - addcon = othercon; - mutex_unlock(&othercon->sock_mutex); + + /* check if we receved something while adding */ + lock_sock(othercon->sock->sk); + lowcomms_queue_rwork(othercon); + release_sock(othercon->sock->sk); + up_write(&othercon->sock_lock); } else { /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. 
*/ add_sock(newsock, newcon); - addcon = newcon; - } - - set_bit(CF_CONNECTED, &addcon->flags); - mutex_unlock(&newcon->sock_mutex); - - /* - * Add it to the active queue in case we got data - * between processing the accept adding the socket - * to the read_sockets list - */ - if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) - queue_work(recv_workqueue, &addcon->rwork); + /* check if we receved something while adding */ + lock_sock(newcon->sock->sk); + lowcomms_queue_rwork(newcon); + release_sock(newcon->sock->sk); + } + up_write(&newcon->sock_lock); srcu_read_unlock(&connections_srcu, idx); - return 0; + return DLM_IO_SUCCESS; accept_err: if (newsock) sock_release(newsock); - if (result != -EAGAIN) - log_print("error accepting connection from node: %d", result); return result; } @@ -1098,7 +1172,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, { struct writequeue_entry *e; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_last_entry(&con->writequeue, struct writequeue_entry, list); if (DLM_WQ_REMAIN_BYTES(e) >= len) { @@ -1127,7 +1201,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, list_add_tail(&e->list, &con->writequeue); out: - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); return e; }; @@ -1176,7 +1250,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, len < sizeof(struct dlm_header)) { BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); log_print("failed to allocate a buffer of size %d", len); - WARN_ON(1); + WARN_ON_ONCE(1); return NULL; } @@ -1207,7 +1281,7 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) struct connection *con = e->con; int users; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); kref_get(&msg->ref); list_add(&msg->list, &e->msgs); @@ -1216,13 +1290,11 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) goto out; e->len = DLM_WQ_LENGTH_BYTES(e); - spin_unlock(&con->writequeue_lock); - queue_work(send_workqueue, &con->swork); - return; + lowcomms_queue_swork(con); out: - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); return; } @@ -1244,7 +1316,7 @@ void dlm_lowcomms_put_msg(struct dlm_msg *msg) kref_put(&msg->ref, dlm_msg_release); } -/* does not held connections_srcu, usage workqueue only */ +/* does not held connections_srcu, usage lowcomms_error_report only */ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) { struct dlm_msg *msg_resend; @@ -1270,88 +1342,78 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) } /* Send a message */ -static void send_to_sock(struct connection *con) +static int send_to_sock(struct connection *con) { const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset, ret; - int count; - -again: - count = 0; - mutex_lock(&con->sock_mutex); - if (con->sock == NULL) - goto out_connect; - - spin_lock(&con->writequeue_lock); - for (;;) { - e = con_next_wq(con); - if (!e) - break; - - len = e->len; - offset = e->offset; - BUG_ON(len == 0 && e->users == 0); - spin_unlock(&con->writequeue_lock); - - ret = kernel_sendpage(con->sock, e->page, offset, len, - msg_flags); - trace_dlm_send(con->nodeid, ret); - if (ret == -EAGAIN || ret == 0) { - if (ret == -EAGAIN && - test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && - !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { - /* Notify TCP that we're limited by the - * application window size. 
- */ - set_bit(SOCK_NOSPACE, &con->sock->flags); - con->sock->sk->sk_write_pending++; - } - cond_resched(); - goto out; - } else if (ret < 0) - goto out; + spin_lock_bh(&con->writequeue_lock); + e = con_next_wq(con); + if (!e) { + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + return DLM_IO_END; + } - spin_lock(&con->writequeue_lock); - writequeue_entry_complete(e, ret); + len = e->len; + offset = e->offset; + WARN_ON_ONCE(len == 0 && e->users == 0); + spin_unlock_bh(&con->writequeue_lock); - /* Don't starve people filling buffers */ - if (++count >= MAX_SEND_MSG_COUNT) { - spin_unlock(&con->writequeue_lock); - mutex_unlock(&con->sock_mutex); - cond_resched(); - goto again; + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + trace_dlm_send(con->nodeid, ret); + if (ret == -EAGAIN || ret == 0) { + lock_sock(con->sock->sk); + spin_lock_bh(&con->writequeue_lock); + if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. + */ + set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags); + con->sock->sk->sk_write_pending++; + + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + + /* wait for write_space() event */ + return DLM_IO_END; } + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + + return DLM_IO_RESCHED; + } else if (ret < 0) { + return ret; } - spin_unlock(&con->writequeue_lock); -out: - mutex_unlock(&con->sock_mutex); - return; + spin_lock_bh(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + spin_unlock_bh(&con->writequeue_lock); -out_connect: - mutex_unlock(&con->sock_mutex); - queue_work(send_workqueue, &con->swork); - cond_resched(); + return DLM_IO_SUCCESS; } static void clean_one_writequeue(struct connection *con) { struct writequeue_entry *e, *safe; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); list_for_each_entry_safe(e, safe, &con->writequeue, list) { free_entry(e); } - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); } static void connection_release(struct rcu_head *rcu) { struct connection *con = container_of(rcu, struct connection, rcu); - kfree(con->rx_buf); + WARN_ON_ONCE(!list_empty(&con->writequeue)); + WARN_ON_ONCE(con->sock); kfree(con); } @@ -1371,12 +1433,14 @@ int dlm_lowcomms_close(int nodeid) return -ENOENT; } + stop_connection_io(con); + log_print("io handling for node: %d stopped", nodeid); + close_connection(con, true); + spin_lock(&connections_lock); hlist_del_rcu(&con->list); spin_unlock(&connections_lock); - close_connection(con, true, true, true); - clean_one_writequeue(con); call_srcu(&connections_srcu, &con->rcu, connection_release); if (con->othercon) { @@ -1386,148 +1450,238 @@ int dlm_lowcomms_close(int nodeid) } srcu_read_unlock(&connections_srcu, idx); + /* for debugging we print when we are done to compare with other + * messages in between. 
This function need to be correctly synchronized + * with io handling + */ + log_print("closing connection to node %d done", nodeid); + return 0; } -/* Receive workqueue function */ +/* Receive worker function */ static void process_recv_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, rwork); + int ret, buflen; - clear_bit(CF_READ_PENDING, &con->flags); - receive_from_sock(con); + down_read(&con->sock_lock); + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + buflen = READ_ONCE(dlm_config.ci_buffer_size); + do { + ret = receive_from_sock(con, buflen); + } while (ret == DLM_IO_SUCCESS); + up_read(&con->sock_lock); + + switch (ret) { + case DLM_IO_END: + /* CF_RECV_PENDING cleared */ + break; + case DLM_IO_EOF: + close_connection(con, false); + /* CF_RECV_PENDING cleared */ + break; + case DLM_IO_RESCHED: + cond_resched(); + queue_work(io_workqueue, &con->rwork); + /* CF_RECV_PENDING not cleared */ + break; + default: + if (ret < 0) { + if (test_bit(CF_IS_OTHERCON, &con->flags)) { + close_connection(con, false); + } else { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + + /* CF_RECV_PENDING cleared for othercon + * we trigger send queue if not already done + * and process_send_sockets will handle it + */ + break; + } + + WARN_ON_ONCE(1); + break; + } } static void process_listen_recv_socket(struct work_struct *work) { int ret; + if (WARN_ON_ONCE(!listen_con.sock)) + return; + do { - ret = accept_from_sock(&listen_con); - } while (!ret); + ret = accept_from_sock(); + } while (ret == DLM_IO_SUCCESS); + + if (ret < 0) + log_print("critical error accepting connection: %d", ret); } -static void dlm_connect(struct connection *con) +static int dlm_connect(struct connection *con) { struct sockaddr_storage addr; int result, addr_len; struct socket *sock; unsigned int mark; - /* Some odd races can cause double-connects, ignore them */ - if (con->retries++ > MAX_CONNECT_RETRIES) - return; - - if (con->sock) { - log_print("node %d already connected.", con->nodeid); - return; - } - memset(&addr, 0, sizeof(addr)); result = nodeid_to_addr(con->nodeid, &addr, NULL, dlm_proto_ops->try_new_addr, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); - return; + return result; } /* Create a socket to communicate with */ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) - goto socket_err; + return result; sock_set_mark(sock->sk, mark); dlm_proto_ops->sockopts(sock); - add_sock(sock, con); - result = dlm_proto_ops->bind(sock); - if (result < 0) - goto add_sock_err; + if (result < 0) { + sock_release(sock); + return result; + } + + add_sock(sock, con); log_print_ratelimited("connecting to %d", con->nodeid); make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr, addr_len); - if (result < 0) - goto add_sock_err; - - return; - -add_sock_err: - dlm_close_sock(&con->sock); + switch (result) { + case -EINPROGRESS: + /* not an error */ + fallthrough; + case 0: + break; + default: + if (result < 0) + dlm_close_sock(&con->sock); -socket_err: - /* - * Some errors are fatal and this list might need adjusting. For other - * errors we try again until the max number of retries is reached. 
- */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - msleep(1000); - lowcomms_connect_sock(con); + break; } + + return result; } -/* Send workqueue function */ +/* Send worker function */ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); + int ret; - WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags)); + WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags)); + + down_read(&con->sock_lock); + if (!con->sock) { + up_read(&con->sock_lock); + down_write(&con->sock_lock); + if (!con->sock) { + ret = dlm_connect(con); + switch (ret) { + case 0: + break; + case -EINPROGRESS: + /* avoid spamming resched on connection + * we might can switch to a state_change + * event based mechanism if established + */ + msleep(100); + break; + default: + /* CF_SEND_PENDING not cleared */ + up_write(&con->sock_lock); + log_print("connect to node %d try %d error %d", + con->nodeid, con->retries++, ret); + msleep(1000); + /* For now we try forever to reconnect. In + * future we should send a event to cluster + * manager to fence itself after certain amount + * of retries. + */ + queue_work(io_workqueue, &con->swork); + return; + } + } + downgrade_write(&con->sock_lock); + } - clear_bit(CF_WRITE_PENDING, &con->flags); + do { + ret = send_to_sock(con); + } while (ret == DLM_IO_SUCCESS); + up_read(&con->sock_lock); - if (test_and_clear_bit(CF_RECONNECT, &con->flags)) { - close_connection(con, false, false, true); - dlm_midcomms_unack_msg_resend(con->nodeid); - } + switch (ret) { + case DLM_IO_END: + /* CF_SEND_PENDING cleared */ + break; + case DLM_IO_RESCHED: + /* CF_SEND_PENDING not cleared */ + cond_resched(); + queue_work(io_workqueue, &con->swork); + break; + default: + if (ret < 0) { + close_connection(con, false); - if (con->sock == NULL) { - if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) - msleep(1000); + /* CF_SEND_PENDING cleared */ + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + break; + } - mutex_lock(&con->sock_mutex); - dlm_connect(con); - mutex_unlock(&con->sock_mutex); + WARN_ON_ONCE(1); + break; } - - if (!list_empty(&con->writequeue)) - send_to_sock(con); } static void work_stop(void) { - if (recv_workqueue) { - destroy_workqueue(recv_workqueue); - recv_workqueue = NULL; + if (io_workqueue) { + destroy_workqueue(io_workqueue); + io_workqueue = NULL; } - if (send_workqueue) { - destroy_workqueue(send_workqueue); - send_workqueue = NULL; + if (process_workqueue) { + destroy_workqueue(process_workqueue); + process_workqueue = NULL; } } static int work_start(void) { - recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM); - if (!recv_workqueue) { - log_print("can't start dlm_recv"); + io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM, + 0); + if (!io_workqueue) { + log_print("can't start dlm_io"); return -ENOMEM; } - send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM); - if (!send_workqueue) { - log_print("can't start dlm_send"); - destroy_workqueue(recv_workqueue); - recv_workqueue = NULL; + /* ordered dlm message process queue, + * should be converted to a tasklet + */ + process_workqueue = alloc_ordered_workqueue("dlm_process", + WQ_HIGHPRI | WQ_MEM_RECLAIM); + if (!process_workqueue) { + log_print("can't start dlm_process"); + 
destroy_workqueue(io_workqueue); + io_workqueue = NULL; return -ENOMEM; } @@ -1543,6 +1697,8 @@ void dlm_lowcomms_shutdown(void) cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); + + flush_workqueue(process_workqueue); } void dlm_lowcomms_shutdown_node(int nodeid, bool force) @@ -1558,79 +1714,19 @@ void dlm_lowcomms_shutdown_node(int nodeid, bool force) } flush_work(&con->swork); + stop_connection_io(con); WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); + close_connection(con, true); clean_one_writequeue(con); if (con->othercon) clean_one_writequeue(con->othercon); - close_connection(con, true, true, true); + allow_connection_io(con); srcu_read_unlock(&connections_srcu, idx); } -static void _stop_conn(struct connection *con, bool and_other) -{ - mutex_lock(&con->sock_mutex); - set_bit(CF_CLOSE, &con->flags); - set_bit(CF_READ_PENDING, &con->flags); - set_bit(CF_WRITE_PENDING, &con->flags); - if (con->sock && con->sock->sk) { - lock_sock(con->sock->sk); - con->sock->sk->sk_user_data = NULL; - release_sock(con->sock->sk); - } - if (con->othercon && and_other) - _stop_conn(con->othercon, false); - mutex_unlock(&con->sock_mutex); -} - -static void stop_conn(struct connection *con) -{ - _stop_conn(con, true); -} - -static void free_conn(struct connection *con) -{ - close_connection(con, true, true, true); -} - -static void work_flush(void) -{ - int ok; - int i; - struct connection *con; - - do { - ok = 1; - foreach_conn(stop_conn); - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); - for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_rcu(con, &connection_hash[i], - list) { - ok &= test_bit(CF_READ_PENDING, &con->flags); - ok &= test_bit(CF_WRITE_PENDING, &con->flags); - if (con->othercon) { - ok &= test_bit(CF_READ_PENDING, - &con->othercon->flags); - ok &= test_bit(CF_WRITE_PENDING, - &con->othercon->flags); - } - } - } - } while (!ok); -} - void dlm_lowcomms_stop(void) { - int idx; - - idx = srcu_read_lock(&connections_srcu); - work_flush(); - foreach_conn(free_conn); - srcu_read_unlock(&connections_srcu, idx); work_stop(); - dlm_proto_ops = NULL; } @@ -1709,17 +1805,7 @@ static int dlm_tcp_bind(struct socket *sock) static int dlm_tcp_connect(struct connection *con, struct socket *sock, struct sockaddr *addr, int addr_len) { - int ret; - - ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); - switch (ret) { - case -EINPROGRESS: - fallthrough; - case 0: - return 0; - } - - return ret; + return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); } static int dlm_tcp_listen_validate(void) @@ -1784,13 +1870,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, sock_set_sndtimeo(sock->sk, 5); ret = sock->ops->connect(sock, addr, addr_len, 0); sock_set_sndtimeo(sock->sk, 0); - if (ret < 0) - return ret; - - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("connected to node %d", con->nodeid); - - return 0; + return ret; } static int dlm_sctp_listen_validate(void) diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index b0e8bdcaab1b..fc015a6abe17 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -306,11 +306,11 @@ static void dlm_send_queue_flush(struct midcomms_node *node) pr_debug("flush midcomms send queue of node %d\n", node->nodeid); rcu_read_lock(); - spin_lock(&node->send_queue_lock); + spin_lock_bh(&node->send_queue_lock); list_for_each_entry_rcu(mh, &node->send_queue, list) { dlm_mhandle_delete(node, mh); } - 
spin_unlock(&node->send_queue_lock); + spin_unlock_bh(&node->send_queue_lock); rcu_read_unlock(); } @@ -437,7 +437,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) } } - spin_lock(&node->send_queue_lock); + spin_lock_bh(&node->send_queue_lock); list_for_each_entry_rcu(mh, &node->send_queue, list) { if (before(mh->seq, seq)) { dlm_mhandle_delete(node, mh); @@ -446,7 +446,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) break; } } - spin_unlock(&node->send_queue_lock); + spin_unlock_bh(&node->send_queue_lock); rcu_read_unlock(); } @@ -890,12 +890,7 @@ static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) dlm_receive_buffer(p, nodeid); } -/* - * Called from the low-level comms layer to process a buffer of - * commands. - */ - -int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len) { const unsigned char *ptr = buf; const struct dlm_header *hd; @@ -930,6 +925,32 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; + ret += msglen; + len -= msglen; + ptr += msglen; + } + + return ret; +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. + */ +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +{ + const unsigned char *ptr = buf; + const struct dlm_header *hd; + uint16_t msglen; + int ret = 0; + + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + msglen = le16_to_cpu(hd->h_length); + if (msglen > len) + break; + switch (hd->h_version) { case cpu_to_le32(DLM_VERSION_3_1): dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); @@ -1046,9 +1067,9 @@ static void midcomms_new_msg_cb(void *data) atomic_inc(&mh->node->send_queue_cnt); - spin_lock(&mh->node->send_queue_lock); + spin_lock_bh(&mh->node->send_queue_lock); list_add_tail_rcu(&mh->list, &mh->node->send_queue); - spin_unlock(&mh->node->send_queue_lock); + spin_unlock_bh(&mh->node->send_queue_lock); mh->seq = mh->node->seq_send++; } diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 69296552d5ad..bea1cee4279c 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -14,6 +14,7 @@ struct midcomms_node; +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len); int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); -- cgit From 502487847743018c93d75b401eac2ea4c4973123 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Sat, 19 Nov 2022 12:51:59 +0800 Subject: cifs: fix missing unlock in cifs_file_copychunk_range() xfstests generic/013 and generic/476 reported WARNING as follows: WARNING: lock held when returning to user space! 6.1.0-rc5+ #4 Not tainted ------------------------------------------------ fsstress/504233 is leaving the kernel with locks still held! 2 locks held by fsstress/504233: #0: ffff888054c38850 (&sb->s_type->i_mutex_key#21){+.+.}-{3:3}, at: lock_two_nondirectories+0xcf/0xf0 #1: ffff8880b8fec750 (&sb->s_type->i_mutex_key#21/4){+.+.}-{3:3}, at: lock_two_nondirectories+0xb7/0xf0 This will lead to deadlocks and hung tasks. Fix this by releasing the locks when writing out a file range fails in cifs_file_copychunk_range().
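The fix follows the usual goto-unwind convention for functions that take locks up front; a hedged sketch of the shape (hypothetical helpers, not the actual cifs code): every failure path after the locks are taken must pass through the unlock label.

#include <linux/fs.h>

extern ssize_t flush_source_range(struct inode *inode);		/* hypothetical */
extern ssize_t do_copychunk(struct inode *src, struct inode *dst);	/* hypothetical */

static ssize_t copychunk_example(struct inode *src, struct inode *dst)
{
	ssize_t rc;

	lock_two_nondirectories(src, dst);

	rc = flush_source_range(src);
	if (rc)
		goto unlock;	/* the buggy path returned here with locks held */

	rc = do_copychunk(src, dst);

unlock:
	unlock_two_nondirectories(src, dst);
	return rc;
}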
Fixes: 3e3761f1ec7d ("smb3: use filemap_write_and_wait_range instead of filemap_write_and_wait") Cc: stable@vger.kernel.org # 6.0 Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: ChenXiaoSong Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index fe220686bba4..712a43161448 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1281,7 +1281,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, rc = filemap_write_and_wait_range(src_inode->i_mapping, off, off + len - 1); if (rc) - goto out; + goto unlock; /* should we flush first and last page first */ truncate_inode_pages(&target_inode->i_data, 0); @@ -1297,6 +1297,8 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * that target is updated on the server */ CIFS_I(target_inode)->time = 0; + +unlock: /* although unlocking in the reverse order from locking is not * strictly necessary here it is a little cleaner to be consistent */ -- cgit From 28b4b0596343d19d140da059eee0e5c2b5328731 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 17 Nov 2022 13:02:56 -0800 Subject: xfs: fix incorrect i_nlink caused by inode racing The following error occurred during the fsstress test: XFS: Assertion failed: VFS_I(ip)->i_nlink >= 2, file: fs/xfs/xfs_inode.c, line: 2452 The problem was that an inode race condition caused an incorrect i_nlink to be written to disk and then read back into memory. Consider the following call graph: for inodes that are marked both XFS_IFLUSHING and XFS_IRECLAIMABLE, i_nlink is reset to 1 and then restored to its original value in xfs_reinit_inode(), so the on-disk i_nlink of a directory may end up set to 1. xfsaild xfs_inode_item_push xfs_iflush_cluster xfs_iflush xfs_inode_to_disk xfs_iget xfs_iget_cache_hit xfs_iget_recycle xfs_reinit_inode inode_init_always xfs_reinit_inode() needs to hold the ILOCK_EXCL as it is changing internal inode state and can race with other RCU protected inode lookups. On the read side, xfs_iflush_cluster() grabs the ILOCK_SHARED while under rcu + ip->i_flags_lock, and so xfs_iflush/xfs_inode_to_disk() are protected from racing inode updates (during transactions) by that lock. Fixes: ff7bebeb91f8 ("xfs: refactor the inode recycling code") # goes further back than this Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eae7427062cf..f35e2cee5265 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -342,6 +342,9 @@ xfs_iget_recycle( trace_xfs_iget_recycle(ip); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return -EAGAIN; + /* * We need to make it look like the inode is being reclaimed to prevent * the actual reclaim workers from stomping over us while we recycle @@ -355,6 +358,7 @@ xfs_iget_recycle( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { /* * Re-initializing the inode failed, and we are in deep @@ -518,6 +522,8 @@ xfs_iget_cache_hit( if (ip->i_flags & XFS_IRECLAIMABLE) { /* Drops i_flags_lock and RCU read lock.
*/ error = xfs_iget_recycle(pag, ip); + if (error == -EAGAIN) + goto out_skip; if (error) return error; } else { -- cgit From fce3caea0f241f5d34855c82c399d5e0e2d91f07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:42 +0100 Subject: blk-crypto: don't use struct request_queue for public interfaces Switch all public blk-crypto interfaces to use struct block_device arguments to specify the device they operate on instead of the request_queue, which is a block layer implementation detail. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/crypto/inline_crypt.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index cea8b14007e6..55c4d8c23d30 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -139,8 +139,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { - if (!blk_crypto_config_supported(bdev_get_queue(devs[i]), - &crypto_cfg)) + if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) goto out_free_devs; } @@ -184,8 +183,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, goto fail; } for (i = 0; i < num_devs; i++) { - err = blk_crypto_start_using_key(blk_key, - bdev_get_queue(devs[i])); + err = blk_crypto_start_using_key(devs[i], blk_key); if (err) break; } @@ -224,7 +222,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, devs = fscrypt_get_devices(sb, &num_devs); if (!IS_ERR(devs)) { for (i = 0; i < num_devs; i++) - blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key); + blk_crypto_evict_key(devs[i], blk_key); kfree(devs); } kfree_sensitive(blk_key); -- cgit From 6715c98b6cf003f26b1b2f655393134e9d999a05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:43 +0100 Subject: blk-crypto: add a blk_crypto_config_supported_natively helper Add a blk_crypto_config_supported_natively helper that wraps __blk_crypto_cfg_supported to retrieve the crypto_profile from the request queue. With this fscrypt can stop including blk-crypto-profile.h and rely on the public consumer interface in blk-crypto.h. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/crypto/inline_crypt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 55c4d8c23d30..8bfb3ce86476 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -12,7 +12,7 @@ * provides the key and IV to use.
*/ -#include <linux/blk-crypto-profile.h> +#include <linux/blk-crypto.h> #include #include #include @@ -77,10 +77,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, unsigned int i; for (i = 0; i < num_devs; i++) { - struct request_queue *q = bdev_get_queue(devs[i]); - if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(q->crypto_profile, cfg)) { + blk_crypto_config_supported_natively(devs[i], cfg)) { if (!xchg(&mode->logged_blk_crypto_native, 1)) pr_info("fscrypt: %s using blk-crypto (native)\n", mode->friendly_name); -- cgit From 4e45886956a20942800259f326a04417292ae314 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Sun, 20 Nov 2022 18:57:59 +0800 Subject: zonefs: Fix race between modprobe and mount There is a race between modprobe and mount as below: modprobe zonefs | mount -t zonefs --------------------------------|------------------------- zonefs_init | register_filesystem [1] | | zonefs_fill_super [2] zonefs_sysfs_init [3] | 1. registering zonefs succeeds, so 2. userspace can already mount a zonefs filesystem, but 3. if the sysfs initialization fails, the module initialization fails as a whole. A mount issued in that window can then hit errors, since the module never finished initializing. Let's register zonefs only after all the resources it depends on are ready, and reorder the release of those resources accordingly in the module exit path. Fixes: 9277a6d4fbd4 ("zonefs: Export open zone resource information through sysfs") Signed-off-by: Zhang Xiaoxu Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index abc9a85106f2..f0e8a000f073 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1922,18 +1922,18 @@ static int __init zonefs_init(void) if (ret) return ret; - ret = register_filesystem(&zonefs_type); + ret = zonefs_sysfs_init(); if (ret) goto destroy_inodecache; - ret = zonefs_sysfs_init(); + ret = register_filesystem(&zonefs_type); if (ret) - goto unregister_fs; + goto sysfs_exit; return 0; -unregister_fs: - unregister_filesystem(&zonefs_type); +sysfs_exit: + zonefs_sysfs_exit(); destroy_inodecache: zonefs_destroy_inodecache(); @@ -1942,9 +1942,9 @@ destroy_inodecache: static void __exit zonefs_exit(void) { + unregister_filesystem(&zonefs_type); zonefs_sysfs_exit(); zonefs_destroy_inodecache(); - unregister_filesystem(&zonefs_type); } MODULE_AUTHOR("Damien Le Moal"); -- cgit From 03e02acda8e267a8183e1e0ed289ff1ef9cd7ed8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:13:44 -0700 Subject: eventfd: provide a eventfd_signal_mask() helper This is identical to eventfd_signal(), but it allows the caller to pass in a mask to be used for the poll wakeup key. The use case is avoiding repeated multishot triggers if we have a dependency between eventfd and io_uring. If we set up an eventfd context and register that as the io_uring eventfd, and at the same time queue a multishot poll request for the eventfd context, then any CQE posted will repeatedly trigger the multishot request until it terminates when the CQ ring overflows. In preparation for io_uring detecting this circular dependency, add the mentioned helper so that io_uring can pass in EPOLL_URING as part of the poll wakeup key.
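To make the circular dependency concrete, the setup that triggers it can be sketched from userspace with liburing (a minimal sketch; setup_loop_prone_ring() is an illustrative name, and error handling is elided):

#include <poll.h>
#include <sys/eventfd.h>
#include <liburing.h>

/*
 * Every CQE posted to the ring signals the registered eventfd, and the
 * multishot poll request turns every eventfd signal into another CQE,
 * so the two feed each other until the CQ ring overflows.
 */
static int setup_loop_prone_ring(struct io_uring *ring)
{
	struct io_uring_sqe *sqe;
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;

	/* every CQE posted to this ring now signals efd ... */
	io_uring_register_eventfd(ring, efd);

	/* ... and this multishot poll turns every efd signal into a CQE */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_poll_multishot(sqe, efd, POLLIN);
	io_uring_submit(ring);

	return efd;
}

With eventfd_signal_mask(), io_uring can tag its own wakeups with EPOLL_URING in the poll key, and its poll handling can then recognize and skip them, breaking the loop.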
Cc: stable@vger.kernel.org # 6.0 [axboe: fold in !CONFIG_EVENTFD fix from Zhang Qilong] Signed-off-by: Jens Axboe --- fs/eventfd.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/eventfd.c b/fs/eventfd.c index c0ffee99ad23..249ca6c0b784 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -43,21 +43,7 @@ struct eventfd_ctx { int id; }; -/** - * eventfd_signal - Adds @n to the eventfd counter. - * @ctx: [in] Pointer to the eventfd context. - * @n: [in] Value of the counter to be added to the eventfd internal counter. - * The value cannot be negative. - * - * This function is supposed to be called by the kernel in paths that do not - * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returning a EPOLLERR - * to poll(2). - * - * Returns the amount by which the counter was incremented. This will be less - * than @n if the counter has overflowed. - */ -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask) { unsigned long flags; @@ -78,12 +64,31 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) - wake_up_locked_poll(&ctx->wqh, EPOLLIN); + wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; } + +/** + * eventfd_signal - Adds @n to the eventfd counter. + * @ctx: [in] Pointer to the eventfd context. + * @n: [in] Value of the counter to be added to the eventfd internal counter. + * The value cannot be negative. + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returning a EPOLLERR + * to poll(2). + * + * Returns the amount by which the counter was incremented. This will be less + * than @n if the counter has overflowed. + */ +__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +{ + return eventfd_signal_mask(ctx, n, 0); +} EXPORT_SYMBOL_GPL(eventfd_signal); static void eventfd_free_ctx(struct eventfd_ctx *ctx) -- cgit From 7a5e9f1f83e3271a9f05933a80b870fe55ebbb3d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 22 Nov 2022 09:48:01 -0500 Subject: fs: dlm: fix building without lockdep This patch uses assert_spin_locked() instead of lockdep_is_held() where possible, because lockdep_is_held() is only available if CONFIG_LOCKDEP is set. In the other cases, like lockdep_sock_is_held(), we surround the check with a CONFIG_LOCKDEP ifdef.
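The two assertion styles the patch below switches between, distilled into a standalone sketch (check_conn_locks() is an illustrative name, not dlm code):

/* assert_spin_locked() is available whether or not lockdep is built in */
static void check_conn_locks(struct connection *con)
{
	assert_spin_locked(&con->writequeue_lock);

#ifdef CONFIG_LOCKDEP
	/* lockdep_sock_is_held() only exists under CONFIG_LOCKDEP */
	WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk));
#endif
}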
Fixes: dbb751ffab0b ("fs: dlm: parallelize lowcomms socket handling") Reported-by: kernel test robot Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 799d1c36eabf..8b80ca0cd65f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -208,7 +208,7 @@ bool dlm_lowcomms_is_running(void) static void lowcomms_queue_swork(struct connection *con) { - WARN_ON_ONCE(!lockdep_is_held(&con->writequeue_lock)); + assert_spin_locked(&con->writequeue_lock); if (!test_bit(CF_IO_STOP, &con->flags) && !test_bit(CF_APP_LIMITED, &con->flags) && @@ -218,7 +218,9 @@ static void lowcomms_queue_rwork(struct connection *con) { +#ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk)); +#endif if (!test_bit(CF_IO_STOP, &con->flags) && !test_and_set_bit(CF_RECV_PENDING, &con->flags)) @@ -618,7 +620,9 @@ static void lowcomms_error_report(struct sock *sk) static void restore_callbacks(struct sock *sk) { +#ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk)); +#endif sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; -- cgit From 4e3c51f4e805291b057d12f5dda5aeb50a538dc4 Mon Sep 17 00:00:00 2001 From: Svyatoslav Feldsherov Date: Tue, 15 Nov 2022 20:20:01 +0000 Subject: fs: do not update freeing inode i_io_list After commit cbfecb927f42 ("fs: record I_DIRTY_TIME even if inode already has I_DIRTY_INODE") writeback_single_inode() can push an inode with I_DIRTY_TIME set onto the b_dirty_time list. When freeing an inode with I_DIRTY_TIME set, this can happen after the inode has already been deleted from i_io_list at evict time. The stack trace is the following. evict fat_evict_inode fat_truncate_blocks fat_flush_inodes writeback_inode sync_inode_metadata(inode, sync=0) writeback_single_inode(inode, wbc) <- wbc->sync_mode == WB_SYNC_NONE This will lead to a use-after-free in the flusher thread. A similar issue can be triggered if writeback_single_inode() in the stack trace updates inode->i_io_list. Add an explicit check to avoid it. Fixes: cbfecb927f42 ("fs: record I_DIRTY_TIME even if inode already has I_DIRTY_INODE") Reported-by: syzbot+6ba92bd00d5093f7e371@syzkaller.appspotmail.com Reviewed-by: Jan Kara Signed-off-by: Svyatoslav Feldsherov Link: https://lore.kernel.org/r/20221115202001.324188-1-feldsherov@google.com Signed-off-by: Theodore Ts'o --- fs/fs-writeback.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 443f83382b9b..9958d4020771 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1712,18 +1712,26 @@ static int writeback_single_inode(struct inode *inode, wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* - * If the inode is now fully clean, then it can be safely removed from - * its writeback list (if any). Otherwise the flusher threads are - * responsible for the writeback lists. + * If the inode is freeing, its i_io_list shouldn't be updated + * as it can be finally deleted at this moment.
*/ - if (!(inode->i_state & I_DIRTY_ALL)) - inode_cgwb_move_to_attached(inode, wb); - else if (!(inode->i_state & I_SYNC_QUEUED)) { - if ((inode->i_state & I_DIRTY)) - redirty_tail_locked(inode, wb); - else if (inode->i_state & I_DIRTY_TIME) { - inode->dirtied_when = jiffies; - inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); + if (!(inode->i_state & I_FREEING)) { + /* + * If the inode is now fully clean, then it can be safely + * removed from its writeback list (if any). Otherwise the + * flusher threads are responsible for the writeback lists. + */ + if (!(inode->i_state & I_DIRTY_ALL)) + inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED)) { + if ((inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + inode_io_list_move_locked(inode, + wb, + &wb->b_dirty_time); + } } } -- cgit From 572302af1258459e124437b8f3369357447afac7 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 10 Nov 2022 10:46:35 +0100 Subject: reiserfs: Add missing calls to reiserfs_security_free() Commit 57fe60df6241 ("reiserfs: add atomic addition of selinux attributes during inode creation") defined reiserfs_security_free() to free the name and value of a security xattr allocated by the active LSM through security_old_inode_init_security(). However, this function is not called in the reiserfs code. Thus, add a call to reiserfs_security_free() whenever reiserfs_security_init() is called, and initialize the value pointer to NULL, to avoid calling kfree() on an uninitialized pointer. Finally, remove the kfree() for the xattr name, as it is not allocated anymore. Fixes: 57fe60df6241 ("reiserfs: add atomic addition of selinux attributes during inode creation") Cc: stable@vger.kernel.org Cc: Jeff Mahoney Cc: Tetsuo Handa Reported-by: Mimi Zohar Reported-by: Tetsuo Handa Signed-off-by: Roberto Sassu Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- fs/reiserfs/namei.c | 4 ++++ fs/reiserfs/xattr_security.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 3d7a35d6a18b..b916859992ec 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -696,6 +696,7 @@ static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir, out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -779,6 +780,7 @@ static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -878,6 +880,7 @@ static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -1194,6 +1197,7 @@ static int reiserfs_symlink(struct user_namespace *mnt_userns, retval = journal_end(&th); out_failed: reiserfs_write_unlock(parent_dir->i_sb); + reiserfs_security_free(&security); return retval; } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 8965c8e5e172..857a65b05726 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -50,6 +50,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, int error; sec->name = NULL; + sec->value = NULL; /* Don't add selinux attributes on xattrs - they'll never get used */ if (IS_PRIVATE(dir)) @@ -95,7 +96,6 @@ int reiserfs_security_write(struct
reiserfs_transaction_handle *th, void reiserfs_security_free(struct reiserfs_security_handle *sec) { - kfree(sec->name); kfree(sec->value); sec->name = NULL; sec->value = NULL; -- cgit From 198dd8aedee6a7d2de0dfa739f9a008a938f6848 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:40:11 +1100 Subject: xfs: punching delalloc extents on write failure is racy xfs_buffered_write_iomap_end() has a comment about the safety of punching delalloc extents based holding the IOLOCK_EXCL. This comment is wrong, and punching delalloc extents is not race free. When we punch out a delalloc extent after a write failure in xfs_buffered_write_iomap_end(), we punch out the page cache with truncate_pagecache_range() before we punch out the delalloc extents. At this point, we only hold the IOLOCK_EXCL, so there is nothing stopping mmap() write faults racing with this cleanup operation, reinstantiating a folio over the range we are about to punch and hence requiring the delalloc extent to be kept. If this race condition is hit, we can end up with a dirty page in the page cache that has no delalloc extent or space reservation backing it. This leads to bad things happening at writeback time. To avoid this race condition, we need the page cache truncation to be atomic w.r.t. the extent manipulation. We can do this by holding the mapping->invalidate_lock exclusively across this operation - this will prevent new pages from being inserted into the page cache whilst we are removing the pages and the backing extent and space reservation. Taking the mapping->invalidate_lock exclusively in the buffered write IO path is safe - it naturally nests inside the IOLOCK (see truncate and fallocate paths). iomap_zero_range() can be called from under the mapping->invalidate_lock (from the truncate path via either xfs_zero_eof() or xfs_truncate_page(), but iomap_zero_iter() will not instantiate new delalloc pages (because it skips holes) and hence will not ever need to punch out delalloc extents on failure. Fix the locking issue, and clean up the code logic a little to avoid unnecessary work if we didn't allocate the delalloc extent or wrote the entire region we allocated. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5cea069a38b4..a2e45ea1b0cb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1147,6 +1147,10 @@ xfs_buffered_write_iomap_end( written = 0; } + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (!(iomap->flags & IOMAP_F_NEW)) + return 0; + /* * start_fsb refers to the first unused block after a short write. If * nothing was written, round offset down to point at the first block in @@ -1158,27 +1162,28 @@ xfs_buffered_write_iomap_end( start_fsb = XFS_B_TO_FSB(mp, offset + written); end_fsb = XFS_B_TO_FSB(mp, offset + length); + /* Nothing to do if we've written the entire delalloc extent */ + if (start_fsb >= end_fsb) + return 0; + /* - * Trim delalloc blocks if they were allocated by this write and we - * didn't manage to write the whole range. - * - * We don't need to care about racing delalloc as we hold i_mutex - * across the reserve/allocate/unreserve calls. If there are delalloc - * blocks in the range, they are ours. 
+ * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite between the page cache + * truncation and the delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. */ - if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); - return error; - } + filemap_invalidate_lock(inode->i_mapping); + truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), + XFS_FSB_TO_B(mp, end_fsb) - 1); + + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + filemap_invalidate_unlock(inode->i_mapping); + if (error && !xfs_is_shutdown(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; } - return 0; } -- cgit From b71f889c18ada210a97aa3eb5e00c0de552234c6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:40:12 +1100 Subject: xfs: use byte ranges for write cleanup ranges xfs_buffered_write_iomap_end() currently converts the byte ranges passed to it to filesystem blocks to pass them to the bmap code to punch out delalloc blocks, but then has to convert filesystem blocks back to byte ranges for page cache truncate. We're about to make the page cache truncate go away and replace it with a page cache walk, so having to convert everything to/from/to filesystem blocks is messy and error-prone. It is much easier to pass around byte ranges and convert to page indexes and/or filesystem blocks only where those units are needed. In preparation for the page cache walk being added, add a helper that converts byte ranges to filesystem blocks and calls xfs_bmap_punch_delalloc_range() and convert xfs_buffered_write_iomap_end() to calculate limits in byte ranges. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a2e45ea1b0cb..7bb55dbc19d3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1120,6 +1120,20 @@ out_unlock: return error; } +static int +xfs_buffered_write_delalloc_punch( + struct inode *inode, + loff_t start_byte, + loff_t end_byte) +{ + struct xfs_mount *mp = XFS_M(inode->i_sb); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); + + return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, + end_fsb - start_fsb); +} + static int xfs_buffered_write_iomap_end( struct inode *inode, @@ -1129,10 +1143,9 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; + struct xfs_mount *mp = XFS_M(inode->i_sb); + loff_t start_byte; + loff_t end_byte; int error = 0; if (iomap->type != IOMAP_DELALLOC) @@ -1157,13 +1170,13 @@ xfs_buffered_write_iomap_end( * the range.
*/ if (unlikely(!written)) - start_fsb = XFS_B_TO_FSBT(mp, offset); + start_byte = round_down(offset, mp->m_sb.sb_blocksize); else - start_fsb = XFS_B_TO_FSB(mp, offset + written); - end_fsb = XFS_B_TO_FSB(mp, offset + length); + start_byte = round_up(offset + written, mp->m_sb.sb_blocksize); + end_byte = round_up(offset + length, mp->m_sb.sb_blocksize); /* Nothing to do if we've written the entire delalloc extent */ - if (start_fsb >= end_fsb) + if (start_byte >= end_byte) return 0; /* @@ -1173,15 +1186,12 @@ xfs_buffered_write_iomap_end( * leave dirty pages with no space reservation in the cache. */ filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); + truncate_pagecache_range(inode, start_byte, end_byte - 1); + error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte); filemap_invalidate_unlock(inode->i_mapping); if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); + xfs_alert(mp, "%s: unable to clean up ino 0x%llx", + __func__, XFS_I(inode)->i_ino); return error; } return 0; -- cgit From 9c7babf94a0d686b552e53aded8d4703d1b8b92b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:44:38 +1100 Subject: xfs,iomap: move delalloc punching to iomap Because that's what Christoph wants for this error handling path only XFS uses. It requires a new iomap export for handling errors over delalloc ranges. This is basically the XFS code as it stands, but even though Christoph wants this as iomap functionality, we still have to call it from the filesystem specific ->iomap_end callback, and call into the iomap code with yet another filesystem specific callback to punch the delalloc extent within the defined ranges. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_iomap.c | 47 +++++++-------------------------------- 2 files changed, 68 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 91ee0b308e13..734b761a1e4a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -832,6 +832,66 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +/* + * When a short write occurs, the filesystem may need to remove reserved space + * that was allocated in ->iomap_begin from it's ->iomap_end method. For + * filesystems that use delayed allocation, we need to punch out delalloc + * extents from the range that are not dirty in the page cache. As the write can + * race with page faults, there can be dirty pages over the delalloc extent + * outside the range of a short write but still within the delalloc extent + * allocated for this iomap. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations, but converts them back to {offset,len} tuples for + * the punch callback.
+ */ +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, + ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t start_byte; + loff_t end_byte; + int blocksize = i_blocksize(inode); + int error = 0; + + if (iomap->type != IOMAP_DELALLOC) + return 0; + + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (!(iomap->flags & IOMAP_F_NEW)) + return 0; + + /* + * start_byte refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in + * the range. + */ + if (unlikely(!written)) + start_byte = round_down(pos, blocksize); + else + start_byte = round_up(pos + written, blocksize); + end_byte = round_up(pos + length, blocksize); + + /* Nothing to do if we've written the entire delalloc extent */ + if (start_byte >= end_byte) + return 0; + + /* + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite between the page cache + * truncation and the delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); + truncate_pagecache_range(inode, start_byte, end_byte - 1); + error = punch(inode, start_byte, end_byte - start_byte); + filemap_invalidate_unlock(inode->i_mapping); + + return error; +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); + static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7bb55dbc19d3..ea96e8a34868 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1123,12 +1123,12 @@ out_unlock: static int xfs_buffered_write_delalloc_punch( struct inode *inode, - loff_t start_byte, - loff_t end_byte) + loff_t offset, + loff_t length) { struct xfs_mount *mp = XFS_M(inode->i_sb); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); - xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, end_fsb - start_fsb); @@ -1143,13 +1143,9 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_mount *mp = XFS_M(inode->i_sb); - loff_t start_byte; - loff_t end_byte; - int error = 0; - if (iomap->type != IOMAP_DELALLOC) - return 0; + struct xfs_mount *mp = XFS_M(inode->i_sb); + int error; /* * Behave as if the write failed if drop writes is enabled. Set the NEW @@ -1160,35 +1156,8 @@ xfs_buffered_write_iomap_end( written = 0; } - /* If we didn't reserve the blocks, we're not allowed to punch them. */ - if (!(iomap->flags & IOMAP_F_NEW)) - return 0; - - /* - * start_fsb refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. 
- */ - if (unlikely(!written)) - start_byte = round_down(offset, mp->m_sb.sb_blocksize); - else - start_byte = round_up(offset + written, mp->m_sb.sb_blocksize); - end_byte = round_up(offset + length, mp->m_sb.sb_blocksize); - - /* Nothing to do if we've written the entire delalloc extent */ - if (start_byte >= end_byte) - return 0; - - /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite between the page cache - * truncation and the delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. - */ - filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(inode, start_byte, end_byte - 1); - error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte); - filemap_invalidate_unlock(inode->i_mapping); + error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, + length, written, &xfs_buffered_write_delalloc_punch); if (error && !xfs_is_shutdown(mp)) { xfs_alert(mp, "%s: unable to clean up ino 0x%llx", __func__, XFS_I(inode)->i_ino); return error; } - return 0; } -- cgit From f850c84948ef2d4f5e11fd8e528c2ac3b3c3d9c4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 17 Nov 2022 04:32:47 +0000 Subject: proc/meminfo: fix spacing in SecPageTables SecPageTables has a tab after it instead of a space, which can break fragile parsers that depend on spaces after the stat names. Link: https://lkml.kernel.org/r/20221117043247.133294-1-yosryahmed@google.com Fixes: ebc97a52b5d6cd5f ("mm: add NR_SECONDARY_PAGETABLE to count secondary page table uses.") Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: David Hildenbrand Cc: Marc Zyngier Cc: Sean Christopherson Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 5101131e6047..440960110a42 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -115,7 +115,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); - show_val_kb(m, "SecPageTables: ", + show_val_kb(m, "SecPageTables: ", global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); -- cgit From 512c5ca01a3610ab14ff6309db363de51f1c13a6 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Fri, 18 Nov 2022 14:33:04 +0800 Subject: nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty When extending segments, nilfs_sufile_alloc() is called to get an unassigned segment, then mark it as dirty to avoid accidentally allocating the same segment in the future. But in some special cases, such as a corrupted image, this can be unreliable. If such corruption of the dirty state of the segment occurs, nilfs2 may reallocate a segment that is in use and pick the same segment for writing twice at the same time. This will cause the problem reported by syzkaller: https://syzkaller.appspot.com/bug?id=c7c4748e11ffcc367cef04f76e02e931833cbd24 This case started with segbuf1.segnum = 3, nextnum = 4 when constructed, which assumed that segment 4 had already been allocated and marked as dirty. However, the dirty state was corrupted and the segment 4 usage was not dirty. The first time through nilfs_segctor_extend_segments(), segment 4 was allocated again, which left segbuf2 and the next segbuf3 with the same segment 4. sb_getblk() will return the same bh for segbuf2 and segbuf3, and this bh is added to the buffer lists of both segbufs.
This breaks the lists, which causes a NULL pointer dereference. Fix the problem by always setting the segment usage as dirty in nilfs_sufile_mark_dirty(), which is called while constructing the current segment to be written out and before allocating the next segment. [chenzhongjin@huawei.com: add lock protection per Ryusuke] Link: https://lkml.kernel.org/r/20221121091141.214703-1-chenzhongjin@huawei.com Link: https://lkml.kernel.org/r/20221118063304.140187-1-chenzhongjin@huawei.com Fixes: 9ff05123e3bf ("nilfs2: segment constructor") Signed-off-by: Chen Zhongjin Reported-by: Reported-by: Liu Shixin Acked-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/sufile.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 77ff8e95421f..dc359b56fdfa 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; + void *kaddr; + struct nilfs_segment_usage *su; int ret; + down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (!ret) { mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr); brelse(bh); } + up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } -- cgit From 44361e8cf9ddb23f17bdcc40ca944abf32e83e79 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Nov 2022 09:10:42 +0100 Subject: fuse: lock inode unconditionally in fuse_fallocate() file_modified() must be called with the inode lock held. fuse_fallocate() didn't lock the inode when the mode was just FALLOC_FL_KEEP_SIZE, which resulted in a kernel warning in notify_change(). Lock the inode unconditionally, like all other fallocate implementations do.
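The rule being enforced can be shown in isolation; a minimal sketch of a conforming ->fallocate method (fallocate_sketch() is an illustrative name, not the fuse implementation that follows):

static long fallocate_sketch(struct file *file, int mode,
			     loff_t offset, loff_t length)
{
	struct inode *inode = file_inode(file);
	int err;

	inode_lock(inode);

	/*
	 * file_modified() may clear SUID/SGID bits and update timestamps
	 * via notify_change(), which warns if i_rwsem is not held.
	 */
	err = file_modified(file);
	if (err)
		goto out;

	/* ... perform the actual space allocation ... */
out:
	inode_unlock(inode);
	return err;
}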
Reported-by: Pengfei Xu Reported-and-tested-by: syzbot+462da39f0667b357c4b6@syzkaller.appspotmail.com Fixes: 4a6f278d4827 ("fuse: add file_modified() to fallocate") Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 71bfb663aac5..89f4741728ba 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2963,11 +2963,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, .mode = mode }; int err; - bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || - (mode & (FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_ZERO_RANGE)); - - bool block_faults = FUSE_IS_DAX(inode) && lock_inode; + bool block_faults = FUSE_IS_DAX(inode) && + (!(mode & FALLOC_FL_KEEP_SIZE) || + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) @@ -2976,22 +2974,20 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fm->fc->no_fallocate) return -EOPNOTSUPP; - if (lock_inode) { - inode_lock(inode); - if (block_faults) { - filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); - if (err) - goto out; - } + inode_lock(inode); + if (block_faults) { + filemap_invalidate_lock(inode->i_mapping); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { - loff_t endbyte = offset + length - 1; + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { + loff_t endbyte = offset + length - 1; - err = fuse_writeback_range(inode, offset, endbyte); - if (err) - goto out; - } + err = fuse_writeback_range(inode, offset, endbyte); + if (err) + goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && @@ -3039,8 +3035,7 @@ out: if (block_faults) filemap_invalidate_unlock(inode->i_mapping); - if (lock_inode) - inode_unlock(inode); + inode_unlock(inode); fuse_flush_time_update(inode); -- cgit From a1db2f7edef095a385d477ab81e780694d63eebd Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 12 Oct 2022 13:23:23 +0200 Subject: fs/fuse: Replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as the mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Therefore, replace kmap() with kmap_local_page() in fuse_readdir_cached(), it being the only call site of kmap() currently left in fs/fuse. Cc: "Venkataramanan, Anirudh" Suggested-by: Ira Weiny Signed-off-by: Fabio M. 
De Francesco Signed-off-by: Miklos Szeredi --- fs/fuse/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index e8deaacf1832..dc603479b30e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -547,9 +547,9 @@ retry_locked: * Contents of the page are now protected against changing by holding * the page lock. */ - addr = kmap(page); + addr = kmap_local_page(page); res = fuse_parse_cache(ff, addr, size, ctx); - kunmap(page); + kunmap_local(addr); unlock_page(page); put_page(page); -- cgit From 4f8d37020e1fd0bf6ee9381ba918135ef3712efd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2022 14:25:21 +0200 Subject: fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY Add a flag to entry expiration that lets the filesystem expire a dentry without kicking it out from the cache immediately. This makes a difference for overmounted dentries, where plain invalidation would detach all submounts before dropping the dentry from the cache. If only expiry is set on the dentry, then any overmounts are left alone until ->d_revalidate() is called. Note: ->d_revalidate() is not called for the case of following a submount, so invalidation will only be triggered for the non-overmounted case. The dentry could also be mounted in a different mount instance, in which case any submounts will still be detached. Suggested-by: Jakob Blomer Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 ++-- fs/fuse/dir.c | 6 ++++-- fs/fuse/fuse_i.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..0a845fac3ba8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1498,7 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); kfree(buf); return err; @@ -1546,7 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); kfree(buf); return err; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bb97a384dc5d..d092c7d75929 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1170,7 +1170,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name) + u64 child_nodeid, struct qstr *name, u32 flags) { int err = -ENOTDIR; struct inode *parent; @@ -1197,7 +1197,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, goto unlock; fuse_dir_changed(parent); - fuse_invalidate_entry(entry); + if (!(flags & FUSE_EXPIRE_ONLY)) + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 98a9cf531873..8789d94eda21 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1220,7 +1220,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, * then the dentry is unhashed (d_delete()).
*/ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name); + u64 child_nodeid, struct qstr *name, u32 flags); int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); -- cgit From ccc031e26afe60d2a5a3d93dabd9c978210825fb Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Wed, 28 Sep 2022 20:19:34 +0800 Subject: fuse: always revalidate rename target dentry The previous commit df8629af2934 ("fuse: always revalidate if exclusive create") ensures that the dentries are revalidated on O_EXCL creates. This commit complements it by also performing revalidation for rename target dentries. Otherwise, a rename target file that only exists in kernel dentry cache but not in the filesystem will result in EEXIST if RENAME_NOREPLACE flag is used. Signed-off-by: Jiachen Zhang Signed-off-by: Zhang Tianci Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d092c7d75929..2c4b08a6ec81 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; -- cgit From 0618021e34c6b1edc4fc754ec53ab7fdcc98aaec Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 14 Sep 2022 16:26:32 +0200 Subject: fuse: Remove user_ns check for FUSE_DEV_IOC_CLONE Commit 8ed1f0e22f49e ("fs/fuse: fix ioctl type confusion") fixed a type confusion bug by adding an ->f_op comparison. Based on some off-list discussion back then, another check was added to compare the f_cred->user_ns. This is not for security reasons, but was based on the idea that a FUSE device FD should be using the UID/GID mappings of its f_cred->user_ns, and those translations are done using fc->user_ns, which matches the f_cred->user_ns of the initial FUSE device FD thanks to the check in fuse_fill_super(). See also commit 8cb08329b0809 ("fuse: Support fuse filesystems outside of init_user_ns"). But FUSE_DEV_IOC_CLONE is, at a higher level, a *cloning* operation that copies an existing context (with a weird API that involves first opening /dev/fuse, then tying the resulting new FUSE device FD to an existing FUSE instance). So if an application is already passing FUSE FDs across userns boundaries and dealing with the resulting ID mapping complications somehow, it doesn't make much sense to block this cloning operation. I've heard that this check is an obstacle for some folks, and I don't see a good reason to keep it, so remove it. Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a845fac3ba8..c73d9c4132f6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2267,8 +2267,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, * Check against file->f_op because CUSE * uses the same ioctl handler. 
*/ - if (old->f_op == file->f_op && - old->f_cred->user_ns == file->f_cred->user_ns) + if (old->f_op == file->f_op) fud = fuse_get_dev(old); if (fud) { -- cgit From 00d369bc2de5986504fc0d60bfeae4c342b2cad5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:40:21 +0200 Subject: fuse: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. Cc: Seth Forshee (Digital Ocean) Cc: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..84c1ca4bc1dc 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -98,7 +98,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; -- cgit From e2283a736676554f72dbdcb62fdc1d23daf7044f Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Thu, 1 Sep 2022 07:42:59 +0000 Subject: fuse: remove the unneeded result variable Return the value of fuse_dev_release() directly instead of storing it in a redundant variable. Reported-by: Zeal Robot Signed-off-by: ye xingchen Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index c7d882a9fe33..a06fbb1a8a5b 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) { struct fuse_dev *fud = file->private_data; struct cuse_conn *cc = fc_to_cc(fud->fc); - int rc; /* remove from the conntbl, no more access from this point on */ mutex_lock(&cuse_lock); @@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file) cdev_del(cc->cdev); } - rc = fuse_dev_release(inode, file); /* puts the base reference */ - - return rc; + return fuse_dev_release(inode, file); } static struct file_operations cuse_channel_fops; /* initialized during init */ -- cgit From 153524053bbb0d27bb2e0be36d1b46862e9ce74c Mon Sep 17 00:00:00 2001 From: Dharmendra Singh Date: Fri, 17 Jun 2022 12:40:27 +0530 Subject: fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over the inode lock, i.e. we hold the inode lock for the full duration of the write request. I could not find a comment in the fuse code or git history that clearly explains why this exclusive lock is taken for direct writes. The following might be among the reasons for acquiring the exclusive lock, but it is not necessarily limited to them: 1) Our guess is that some userspace fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) It rules out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only.
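Distilled, the locking scheme described here (and implemented in the patch further below) looks like this; needs_exclusive() stands in for the i_size check the patch adds:

static bool needs_exclusive(struct inode *inode, loff_t pos, size_t count)
{
	/* extending writes keep the exclusive lock */
	return pos + count > i_size_read(inode);
}

static bool lock_for_dio_write(struct inode *inode, loff_t pos, size_t count)
{
	bool exclusive = needs_exclusive(inode, pos, count);

	if (exclusive) {
		inode_lock(inode);
		return true;
	}

	inode_lock_shared(inode);
	/*
	 * The decision above was made without holding the lock; i_size
	 * may have changed (e.g. a racing truncate), so recheck and
	 * upgrade to the exclusive lock if necessary.
	 */
	if (needs_exclusive(inode, pos, count)) {
		inode_unlock_shared(inode);
		inode_lock(inode);
		exclusive = true;
	}
	return exclusive;
}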
File size extending writes might not need the lock either, but we are not entirely sure whether there is a risk of introducing any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take, on file size extension, a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes is the write error case. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPC. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (the flag is passed from libfuse to the fuse kernel as part of file open/create), we no longer take the exclusive lock, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 89f4741728ba..d360d38ab616 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; + bool exclusive_lock = + !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || + iocb->ki_flags & IOCB_APPEND || + fuse_direct_write_extending_i_size(iocb, from); + + /* + * Take exclusive lock if + * - Parallel direct writes are disabled - a user space decision + * - Parallel direct writes are enabled and i_size is being extended. + * This might not be needed at all, but needs further investigation. + */ + if (exclusive_lock) + inode_lock(inode); + else { + inode_lock_shared(inode); + + /* A race with truncate might have come up as the decision for + * the lock type was done without holding the lock, check again.
+ */ + if (fuse_direct_write_extending_i_size(iocb, from)) { + inode_unlock_shared(inode); + inode_lock(inode); + exclusive_lock = true; + } + } - /* Don't allow parallel writes to the same file */ - inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - inode_unlock(inode); + if (exclusive_lock) + inode_unlock(inode); + else + inode_unlock_shared(inode); return res; } @@ -2931,6 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { fuse_write_update_attr(inode, pos, ret); + /* For extending writes we already hold exclusive lock */ if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } -- cgit From b138777786f780b9d5ce1989032608acbede0493 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 25 Oct 2022 09:10:17 -0700 Subject: fuse: Rearrange fuse_allow_current_process checks This is a followup to a previous commit of mine [0], which added the allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch rearranges the order of checks in fuse_allow_current_process without changing functionality. Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other") added the allow_sys_admin_access && capable(CAP_SYS_ADMIN) check to the beginning of the function, with the reasoning that allow_sys_admin_access should be an 'escape hatch' for users with CAP_SYS_ADMIN, allowing them to skip any subsequent checks. However, placing this new check first results in many capable() calls when allow_sys_admin_access is set, where another check would've also returned 1. This can be problematic when a BPF program is tracing capable() calls. At Meta we ran into such a scenario recently. On a host where allow_sys_admin_access is set but most of the FUSE access is from processes which would pass other checks - i.e. they don't need the CAP_SYS_ADMIN 'escape hatch' - this results in an unnecessary capable() call for each fs op. We also have a daemon tracing capable() with BPF and doing some data collection, so tracing these extraneous capable() calls has the potential to regress performance for an application doing many FUSE ops. So rearrange the order of these checks such that the CAP_SYS_ADMIN 'escape hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to make the logic easier to understand. Previously, if allow_other was set on the fuse_conn, uid/gid checking didn't happen, as the current_in_userns() result was returned. These semantics are maintained here: the fuse_permissible_uidgid() check only happens if allow_other is not set.
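The underlying principle generalizes beyond fuse: order an access-control decision so that side-effect-free predicates run first and observable ones run last. A generic illustration (not fuse code; may_access() and the choice of CAP_DAC_OVERRIDE are only for the example):

static bool may_access(kuid_t owner)
{
	/* cheap check with no observable side effects first */
	if (uid_eq(current_fsuid(), owner))
		return true;

	/*
	 * capable() goes through the LSM/audit machinery and is visible
	 * to BPF tracers, so consult it only as a last resort.
	 */
	return capable(CAP_DAC_OVERRIDE);
}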
Signed-off-by: Dave Marchevsky Suggested-by: Andrii Nakryiko Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 35 ++++++++++++++++++++--------------- fs/fuse/fuse_i.h | 2 +- 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2c4b08a6ec81..aa67869e3444 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1237,6 +1237,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, return err; } +static inline bool fuse_permissible_uidgid(struct fuse_conn *fc) +{ + const struct cred *cred = current_cred(); + + return (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)); +} + /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This @@ -1250,26 +1262,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_current_process(struct fuse_conn *fc) +bool fuse_allow_current_process(struct fuse_conn *fc) { - const struct cred *cred; - - if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) - return 1; + bool allow; if (fc->allow_other) - return current_in_userns(fc->user_ns); + allow = current_in_userns(fc->user_ns); + else + allow = fuse_permissible_uidgid(fc); - cred = current_cred(); - if (uid_eq(cred->euid, fc->user_id) && - uid_eq(cred->suid, fc->user_id) && - uid_eq(cred->uid, fc->user_id) && - gid_eq(cred->egid, fc->group_id) && - gid_eq(cred->sgid, fc->group_id) && - gid_eq(cred->gid, fc->group_id)) - return 1; + if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + allow = true; - return 0; + return allow; } static int fuse_access(struct inode *inode, int mask) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8789d94eda21..d339b1ace887 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1179,7 +1179,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr); /** * Is current process allowed to perform filesystem operation? */ -int fuse_allow_current_process(struct fuse_conn *fc); +bool fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); -- cgit From 8fe97d47b52ae1ad130470b1780f0ded4ba609a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 20 Nov 2022 13:43:03 +0100 Subject: btrfs: use kvcalloc in btrfs_get_dev_zone_info Otherwise the kernel memory allocator seems to be unhappy about failing order 6 allocations for the zones array, that cause 100% reproducible mount failures in my qemu setup: [26.078981] mount: page allocation failure: order:6, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null) [26.079741] CPU: 0 PID: 2965 Comm: mount Not tainted 6.1.0-rc5+ #185 [26.080181] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [26.080950] Call Trace: [26.081132] [26.081291] dump_stack_lvl+0x56/0x6f [26.081554] warn_alloc+0x117/0x140 [26.081808] ? __alloc_pages_direct_compact+0x1b5/0x300 [26.082174] __alloc_pages_slowpath.constprop.0+0xd0e/0xde0 [26.082569] __alloc_pages+0x32a/0x340 [26.082836] __kmalloc_large_node+0x4d/0xa0 [26.083133] ? 
trace_kmalloc+0x29/0xd0 [26.083399] kmalloc_large+0x14/0x60 [26.083654] btrfs_get_dev_zone_info+0x1b9/0xc00 [26.083980] ? _raw_spin_unlock_irqrestore+0x28/0x50 [26.084328] btrfs_get_dev_zone_info_all_devices+0x54/0x80 [26.084708] open_ctree+0xed4/0x1654 [26.084974] btrfs_mount_root.cold+0x12/0xde [26.085288] ? lock_is_held_type+0xe2/0x140 [26.085603] legacy_get_tree+0x28/0x50 [26.085876] vfs_get_tree+0x1d/0xb0 [26.086139] vfs_kern_mount.part.0+0x6c/0xb0 [26.086456] btrfs_mount+0x118/0x3a0 [26.086728] ? lock_is_held_type+0xe2/0x140 [26.087043] legacy_get_tree+0x28/0x50 [26.087323] vfs_get_tree+0x1d/0xb0 [26.087587] path_mount+0x2ba/0xbe0 [26.087850] ? _raw_spin_unlock_irqrestore+0x38/0x50 [26.088217] __x64_sys_mount+0xfe/0x140 [26.088506] do_syscall_64+0x35/0x80 [26.088776] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 44d8177eeffc..c9e2b0c85309 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -467,7 +467,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } - zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; @@ -586,7 +586,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } - kfree(zones); + kvfree(zones); switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: @@ -618,7 +618,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) return 0; out: - kfree(zones); + kvfree(zones); out_free_zone_info: btrfs_destroy_dev_zone_info(device); -- cgit From 796787c978efbbdb50e245718c784eb94f59eac4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:22 +0000 Subject: btrfs: do not modify log tree while holding a leaf from fs tree locked When logging an inode in full mode, or when logging xattrs or when logging the dir index items of a directory, we are modifying the log tree while holding a read lock on a leaf from the fs/subvolume tree. This can lead to a deadlock in rare circumstances, but it is a real possibility, and it was recently reported by syzbot with the following trace from lockdep: WARNING: possible circular locking dependency detected 6.1.0-rc5-next-20221116-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.1/16154 is trying to acquire lock: ffff88807e3084a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0xa1/0xf30 fs/btrfs/delayed-inode.c:256 but task is already holding lock: ffff88807df33078 (btrfs-log-00){++++}-{3:3}, at: __btrfs_tree_lock+0x32/0x3d0 fs/btrfs/locking.c:197 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (btrfs-log-00){++++}-{3:3}: down_read_nested+0x9e/0x450 kernel/locking/rwsem.c:1634 __btrfs_tree_read_lock+0x32/0x350 fs/btrfs/locking.c:135 btrfs_tree_read_lock fs/btrfs/locking.c:141 [inline] btrfs_read_lock_root_node+0x82/0x3a0 fs/btrfs/locking.c:280 btrfs_search_slot_get_root fs/btrfs/ctree.c:1678 [inline] btrfs_search_slot+0x3ca/0x2c70 fs/btrfs/ctree.c:1998 btrfs_lookup_csum+0x116/0x3f0 fs/btrfs/file-item.c:209 btrfs_csum_file_blocks+0x40e/0x1370 fs/btrfs/file-item.c:1021 log_csums.isra.0+0x244/0x2d0 fs/btrfs/tree-log.c:4258 copy_items.isra.0+0xbfb/0xed0 fs/btrfs/tree-log.c:4403 copy_inode_items_to_log+0x13d6/0x1d90 fs/btrfs/tree-log.c:5873 btrfs_log_inode+0xb19/0x4680 fs/btrfs/tree-log.c:6495 btrfs_log_inode_parent+0x890/0x2a20 fs/btrfs/tree-log.c:6982 btrfs_log_dentry_safe+0x59/0x80 fs/btrfs/tree-log.c:7083 btrfs_sync_file+0xa41/0x13c0 fs/btrfs/file.c:1921 vfs_fsync_range+0x13e/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2856 [inline] iomap_dio_complete+0x73a/0x920 fs/iomap/direct-io.c:128 btrfs_direct_write fs/btrfs/file.c:1536 [inline] btrfs_do_write_iter+0xba2/0x1470 fs/btrfs/file.c:1668 call_write_iter include/linux/fs.h:2160 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x259/0x2c0 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #1 (btrfs-tree-00){++++}-{3:3}: __lock_release kernel/locking/lockdep.c:5382 [inline] lock_release+0x371/0x810 kernel/locking/lockdep.c:5688 up_write+0x2a/0x520 kernel/locking/rwsem.c:1614 btrfs_tree_unlock_rw fs/btrfs/locking.h:189 [inline] btrfs_unlock_up_safe+0x1e3/0x290 fs/btrfs/locking.c:238 search_leaf fs/btrfs/ctree.c:1832 [inline] btrfs_search_slot+0x265e/0x2c70 fs/btrfs/ctree.c:2074 btrfs_insert_empty_items+0xbd/0x1c0 fs/btrfs/ctree.c:4133 btrfs_insert_delayed_item+0x826/0xfa0 fs/btrfs/delayed-inode.c:746 btrfs_insert_delayed_items fs/btrfs/delayed-inode.c:824 [inline] __btrfs_commit_inode_delayed_items fs/btrfs/delayed-inode.c:1111 [inline] __btrfs_run_delayed_items+0x280/0x590 fs/btrfs/delayed-inode.c:1153 flush_space+0x147/0xe90 fs/btrfs/space-info.c:728 btrfs_async_reclaim_metadata_space+0x541/0xc10 fs/btrfs/space-info.c:1086 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x669/0x1090 kernel/workqueue.c:2436 kthread+0x2e8/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:3097 [inline] check_prevs_add kernel/locking/lockdep.c:3216 [inline] validate_chain kernel/locking/lockdep.c:3831 [inline] __lock_acquire+0x2a43/0x56d0 kernel/locking/lockdep.c:5055 lock_acquire kernel/locking/lockdep.c:5668 [inline] lock_acquire+0x1e3/0x630 kernel/locking/lockdep.c:5633 __mutex_lock_common kernel/locking/mutex.c:603 [inline] __mutex_lock+0x12f/0x1360 kernel/locking/mutex.c:747 __btrfs_release_delayed_node.part.0+0xa1/0xf30 
fs/btrfs/delayed-inode.c:256 __btrfs_release_delayed_node fs/btrfs/delayed-inode.c:251 [inline] btrfs_release_delayed_node fs/btrfs/delayed-inode.c:281 [inline] btrfs_remove_delayed_node+0x52/0x60 fs/btrfs/delayed-inode.c:1285 btrfs_evict_inode+0x511/0xf30 fs/btrfs/inode.c:5554 evict+0x2ed/0x6b0 fs/inode.c:664 dispose_list+0x117/0x1e0 fs/inode.c:697 prune_icache_sb+0xeb/0x150 fs/inode.c:896 super_cache_scan+0x391/0x590 fs/super.c:106 do_shrink_slab+0x464/0xce0 mm/vmscan.c:843 shrink_slab_memcg mm/vmscan.c:912 [inline] shrink_slab+0x388/0x660 mm/vmscan.c:991 shrink_node_memcgs mm/vmscan.c:6088 [inline] shrink_node+0x93d/0x1f30 mm/vmscan.c:6117 shrink_zones mm/vmscan.c:6355 [inline] do_try_to_free_pages+0x3b4/0x17a0 mm/vmscan.c:6417 try_to_free_mem_cgroup_pages+0x3a4/0xa70 mm/vmscan.c:6732 reclaim_high.constprop.0+0x182/0x230 mm/memcontrol.c:2393 mem_cgroup_handle_over_high+0x190/0x520 mm/memcontrol.c:2578 try_charge_memcg+0xe0c/0x12f0 mm/memcontrol.c:2816 try_charge mm/memcontrol.c:2827 [inline] charge_memcg+0x90/0x3b0 mm/memcontrol.c:6889 __mem_cgroup_charge+0x2b/0x90 mm/memcontrol.c:6910 mem_cgroup_charge include/linux/memcontrol.h:667 [inline] __filemap_add_folio+0x615/0xf80 mm/filemap.c:852 filemap_add_folio+0xaf/0x1e0 mm/filemap.c:934 __filemap_get_folio+0x389/0xd80 mm/filemap.c:1976 pagecache_get_page+0x2e/0x280 mm/folio-compat.c:104 find_or_create_page include/linux/pagemap.h:612 [inline] alloc_extent_buffer+0x2b9/0x1580 fs/btrfs/extent_io.c:4588 btrfs_init_new_buffer fs/btrfs/extent-tree.c:4869 [inline] btrfs_alloc_tree_block+0x2e1/0x1320 fs/btrfs/extent-tree.c:4988 __btrfs_cow_block+0x3b2/0x1420 fs/btrfs/ctree.c:440 btrfs_cow_block+0x2fa/0x950 fs/btrfs/ctree.c:595 btrfs_search_slot+0x11b0/0x2c70 fs/btrfs/ctree.c:2038 btrfs_update_root+0xdb/0x630 fs/btrfs/root-tree.c:137 update_log_root fs/btrfs/tree-log.c:2841 [inline] btrfs_sync_log+0xbfb/0x2870 fs/btrfs/tree-log.c:3064 btrfs_sync_file+0xdb9/0x13c0 fs/btrfs/file.c:1947 vfs_fsync_range+0x13e/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2856 [inline] iomap_dio_complete+0x73a/0x920 fs/iomap/direct-io.c:128 btrfs_direct_write fs/btrfs/file.c:1536 [inline] btrfs_do_write_iter+0xba2/0x1470 fs/btrfs/file.c:1668 call_write_iter include/linux/fs.h:2160 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x259/0x2c0 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd other info that might help us debug this: Chain exists of: &delayed_node->mutex --> btrfs-tree-00 --> btrfs-log-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-log-00); lock(btrfs-tree-00); lock(btrfs-log-00); lock(&delayed_node->mutex); Holding a read lock on a leaf from a fs/subvolume tree creates a nasty lock dependency when we are COWing extent buffers for the log tree and we have two tasks modifying the log tree, with each one in one of the following 2 scenarios: 1) Modifying the log tree triggers an extent buffer allocation while 
holding a write lock on a parent extent buffer from the log tree. Allocating the pages for an extent buffer, or the extent buffer struct, can trigger inode eviction and finally the inode eviction will trigger a release/remove of a delayed node, which requires taking the delayed node's mutex; 2) Allocating a metadata extent for a log tree can trigger the async reclaim thread and make us wait for it to release enough space and unblock our reservation ticket. The reclaim thread can start flushing delayed items, and that in turn results in the need to lock delayed node mutexes and in the need to write lock extent buffers of a subvolume tree - all this while holding a write lock on the parent extent buffer in the log tree. So one task in scenario 1) running in parallel with another task in scenario 2) could lead to a deadlock, one wanting to lock a delayed node mutex while having a read lock on a leaf from the subvolume, while the other is holding the delayed node's mutex and wants to write lock the same subvolume leaf for flushing delayed items. Fix this by cloning the leaf of the fs/subvolume tree, release/unlock the fs/subvolume leaf and use the clone leaf instead. Reported-by: syzbot+9b7c21f486f5e7f8d029@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/000000000000ccc93c05edc4d8cf@google.com/ CC: stable@vger.kernel.org # 6.0+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 813986e38258..c3cf3dabe0b1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3694,15 +3694,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; - struct extent_buffer *src = path->nodes[0]; - const int nritems = btrfs_header_nritems(src); + struct extent_buffer *src; + const int nritems = btrfs_header_nritems(path->nodes[0]); const u64 ino = btrfs_ino(inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - for (i = path->slots[0]; i < nritems; i++) { + /* + * We need to clone the leaf, release the read lock on it, and use the + * clone before modifying the log tree. See the comment at copy_items() + * about why we need to do this. + */ + src = btrfs_clone_extent_buffer(path->nodes[0]); + if (!src) + return -ENOMEM; + + i = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = src; + path->slots[0] = i; + + for (; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4303,7 +4317,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; - struct extent_buffer *src = src_path->nodes[0]; + struct extent_buffer *src; int ret = 0; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -4314,6 +4328,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); + /* + * To keep lockdep happy and avoid deadlocks, clone the source leaf and + * use the clone. 
This is because otherwise we would be changing the log + * tree, to insert items from the subvolume tree or insert csum items, + * while holding a read lock on a leaf from the subvolume tree, which + * creates a nasty lock dependency when COWing log tree nodes/leaves: + * + * 1) Modifying the log tree triggers an extent buffer allocation while + * holding a write lock on a parent extent buffer from the log tree. + * Allocating the pages for an extent buffer, or the extent buffer + * struct, can trigger inode eviction and finally the inode eviction + * will trigger a release/remove of a delayed node, which requires + * taking the delayed node's mutex; + * + * 2) Allocating a metadata extent for a log tree can trigger the async + * reclaim thread and make us wait for it to release enough space and + * unblock our reservation ticket. The reclaim thread can start + * flushing delayed items, and that in turn results in the need to + * lock delayed node mutexes and in the need to write lock extent + * buffers of a subvolume tree - all this while holding a write lock + * on the parent extent buffer in the log tree. + * + * So one task in scenario 1) running in parallel with another task in + * scenario 2) could lead to a deadlock, one wanting to lock a delayed + * node mutex while having a read lock on a leaf from the subvolume, + * while the other is holding the delayed node's mutex and wants to + * write lock the same subvolume leaf for flushing delayed items. + */ + src = btrfs_clone_extent_buffer(src_path->nodes[0]); + if (!src) + return -ENOMEM; + + i = src_path->slots[0]; + btrfs_release_path(src_path); + src_path->nodes[0] = src; + src_path->slots[0] = i; + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); if (!ins_data) -- cgit From ffdbb44f2f23f963b8f5672e35c3a26088177a62 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 22 Nov 2022 19:50:02 +0800 Subject: btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs() Although kset_unregister() can eventually remove all attribute files, explicitly rolling back with the matching function makes the code logic look clearer. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Zhen Lei Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 699b54b3acaa..74fef1f49c35 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -2321,8 +2321,11 @@ int __init btrfs_init_sysfs(void) #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); - if (ret) - goto out2; + if (ret) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + goto out_remove_group; + } #endif return 0; -- cgit From ac8db824ead0de2e9111337c401409d010fba2f0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 23 Nov 2022 14:14:32 -0500 Subject: NFSD: Fix reads with a non-zero offset that don't end on a page boundary This was found when virtual machines with nfs-mounted qcow2 disks failed to boot properly. 
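A worked example of the arithmetic (illustrative values, not taken from the bug report): with PAGE_SIZE = 4096, a splice chunk of sd->len = 4096 bytes starting at buf->offset = 512 covers bytes 512..4607 of the (possibly compound) page, i.e. two pages, yet stepping down from sd->len in PAGE_SIZE strides visits only one. Counting from the position of the last byte, as the hunk below does, covers the full span:

	/* sketch only; offset/len stand in for buf->offset and sd->len */
	unsigned int offset = 512, len = 4096;			/* assumed example values */
	unsigned int first = offset / PAGE_SIZE;		/* 0 */
	unsigned int last = (offset + len - 1) / PAGE_SIZE;	/* 4607 / 4096 = 1 */
	unsigned int npages = last - first + 1;	/* 2 pages; the old loop replaced only 1 */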
Reported-by: Anders Blomdell
Suggested-by: Al Viro
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2142132
Fixes: bfbfb6182ad1 ("nfsd_splice_actor(): handle compound pages")
Signed-off-by: Chuck Lever
---
 fs/nfsd/vfs.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 83be89905cbf..31bc7cc82439 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -871,10 +871,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct svc_rqst *rqstp = sd->u.data;
 	struct page *page = buf->page;	// may be a compound one
 	unsigned offset = buf->offset;
+	struct page *last_page;
 
-	page += offset / PAGE_SIZE;
-	for (int i = sd->len; i > 0; i -= PAGE_SIZE)
-		svc_rqst_replace_page(rqstp, page++);
+	last_page = page + (offset + sd->len - 1) / PAGE_SIZE;
+	for (page += offset / PAGE_SIZE; page <= last_page; page++)
+		svc_rqst_replace_page(rqstp, page);
 	if (rqstp->rq_res.page_len == 0)	// first call
 		rqstp->rq_res.page_base = offset % PAGE_SIZE;
 	rqstp->rq_res.page_len += sd->len;
-- cgit

From 61d8e42667716f71f2c26e327e66f2940d809f80 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 24 Nov 2022 22:55:57 -0500
Subject: copy_mnt_ns(): handle a corner case (overmounted mntns bindings) saner

copy_mnt_ns() has the old tree copied, with mntns bindings *and* anything
bound on top of them skipped. Then it proceeds to walk both trees in
parallel. Unfortunately, it doesn't get the "skip the stuff we'd skipped
when copying" quite right.

Consequences are minor (the ->mnt_root comparison will return the
situation to sanity pretty soon and the worst we get is the unexpected
subset of opened non-directories being switched to the new namespace),
but it's confusing enough and it's not hard to get the expected
behaviour...

Signed-off-by: Al Viro
---
 fs/namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index df137ba19d37..c80f422084eb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3515,8 +3515,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		q = next_mnt(q, new);
 		if (!q)
 			break;
+		// an mntns binding we'd skipped?
 		while (p->mnt.mnt_root != q->mnt.mnt_root)
-			p = next_mnt(p, old);
+			p = next_mnt(skip_mnt_tree(p), old);
 	}
 	namespace_unlock();
-- cgit

From bdbadfcc37c5c8f9f2a401a18eae71b0c28799ee Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Sat, 3 Sep 2022 20:23:56 -0400
Subject: [elf][non-regset] uninline elf_core_copy_task_fpregs() (and lose
 pt_regs argument)

Don't bother with pointless macros - we are not sharing it with aout
coredumps anymore. Just convert the underlying functions to the same
arguments (nobody uses regs, actually) and call them
elf_core_copy_task_fpregs(). And unexport the entire bunch, while we
are at it.

[added missing includes in arch/{csky,m68k,um}/kernel/process.c to avoid
extra warnings about the lack of externs getting added to huge piles for
those files. Pointless, but...]
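The resulting signature change, shown as a sketch (the arch-side definitions live outside fs/, so the prototypes here are assumptions based on the call sites below):

	/* before: @regs was accepted and ignored by every implementation */
	int elf_core_copy_task_fpregs(struct task_struct *t, struct pt_regs *regs,
				      elf_fpregset_t *fpu);

	/* after: the pt_regs argument is gone, same name across architectures */
	int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu);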
Signed-off-by: Al Viro --- fs/binfmt_elf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e990075fb43d..c3c5bd48361e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2001,8 +2001,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) t->num_notes++; sz += notesize(&t->notes[0]); - if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, - &t->fpu))) { + if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, &t->fpu))) { fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), &(t->fpu)); t->num_notes++; @@ -2100,7 +2099,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* Try to dump the FPU. */ info->prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, task_pt_regs(current), info->fpu); + elf_core_copy_task_fpregs(current, info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); -- cgit From e961d370fc7b806a0d668413a9f1b006760eaee6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Sep 2022 17:17:26 -0400 Subject: [elf][non-regset] use elf_core_copy_task_regs() for dumper as well elf_core_copy_regs() is equivalent to elf_core_copy_task_regs() of current on all architectures. Signed-off-by: Al Viro --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c3c5bd48361e..cb95e842c50f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2072,7 +2072,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, task_pt_regs(current)); + elf_core_copy_task_regs(current, &info->prstatus->pr_reg); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); -- cgit From e92edb85d87e9f8c8bead9cdb7cb2da4b51fd7f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Sep 2022 00:41:05 -0400 Subject: [elf] unify regset and non-regset cases The only real difference is in filling per-thread notes - getting the values of registers. And this is the only part that is worth an ifdef - we don't need to duplicate the logics regarding gathering threads, filling other notes, etc. It would've been hard to do back when regset-based variant had been introduced, mostly due to sharing bits and pieces of helpers with aout coredumps. As the result, too much had been duplicated and the copies had drifted away since then. Now it can be done cleanly... Signed-off-by: Al Viro --- fs/binfmt_elf.c | 230 ++++++++++---------------------------------------------- 1 file changed, 38 insertions(+), 192 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cb95e842c50f..3758a617c9d8 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1723,7 +1723,6 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm return 0; } -#ifdef CORE_DUMP_USE_REGSET #include struct elf_thread_core_info { @@ -1744,6 +1743,7 @@ struct elf_note_info { int thread_notes; }; +#ifdef CORE_DUMP_USE_REGSET /* * When a regset has a writeback hook, we call it on each thread before * dumping user memory. 
On register window machines, this makes sure the @@ -1823,13 +1823,41 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, return 1; } +#else +static int fill_thread_core_info(struct elf_thread_core_info *t, + const struct user_regset_view *view, + long signr, struct elf_note_info *info) +{ + struct task_struct *p = t->task; + elf_fpregset_t *fpu; + + fill_prstatus(&t->prstatus.common, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &(t->prstatus)); + info->size += notesize(&t->notes[0]); + + fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL); + if (!fpu || !elf_core_copy_task_fpregs(p, fpu)) { + kfree(fpu); + return 1; + } + + t->prstatus.pr_fpvalid = 1; + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + info->size += notesize(&t->notes[1]); + + return 1; +} +#endif static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, struct coredump_params *cprm) { struct task_struct *dump_task = current; - const struct user_regset_view *view = task_user_regset_view(dump_task); + const struct user_regset_view *view; struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; @@ -1839,6 +1867,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); +#ifdef CORE_DUMP_USE_REGSET + view = task_user_regset_view(dump_task); + /* * Figure out how many notes we're going to need for each thread. */ @@ -1862,6 +1893,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, */ fill_elf_header(elf, phdrs, view->e_machine, view->e_flags); +#else + view = NULL; + info->thread_notes = 2; + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); +#endif /* * Allocate a structure for each thread. @@ -1969,196 +2005,6 @@ static void free_note_info(struct elf_note_info *info) kvfree(info->files.data); } -#else - -/* Here is the structure in which status of each thread is captured. */ -struct elf_thread_status -{ - struct list_head list; - struct elf_prstatus prstatus; /* NT_PRSTATUS */ - elf_fpregset_t fpu; /* NT_PRFPREG */ - struct task_struct *thread; - struct memelfnote notes[3]; - int num_notes; -}; - -/* - * In order to add the specific thread information for the elf file format, - * we need to keep a linked list of every threads pr_status and then create - * a single section for them in the final core file. 
- */ -static int elf_dump_thread_status(long signr, struct elf_thread_status *t) -{ - int sz = 0; - struct task_struct *p = t->thread; - t->num_notes = 0; - - fill_prstatus(&t->prstatus.common, p, signr); - elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), - &(t->prstatus)); - t->num_notes++; - sz += notesize(&t->notes[0]); - - if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, &t->fpu))) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), - &(t->fpu)); - t->num_notes++; - sz += notesize(&t->notes[1]); - } - return sz; -} - -struct elf_note_info { - struct memelfnote *notes; - struct memelfnote *notes_files; - struct elf_prstatus *prstatus; /* NT_PRSTATUS */ - struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */ - struct list_head thread_list; - elf_fpregset_t *fpu; - user_siginfo_t csigdata; - int thread_status_size; - int numnote; -}; - -static int elf_note_info_init(struct elf_note_info *info) -{ - memset(info, 0, sizeof(*info)); - INIT_LIST_HEAD(&info->thread_list); - - /* Allocate space for ELF notes */ - info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL); - if (!info->notes) - return 0; - info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); - if (!info->psinfo) - return 0; - info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); - if (!info->prstatus) - return 0; - info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); - if (!info->fpu) - return 0; - return 1; -} - -static int fill_note_info(struct elfhdr *elf, int phdrs, - struct elf_note_info *info, - struct coredump_params *cprm) -{ - struct core_thread *ct; - struct elf_thread_status *ets; - - if (!elf_note_info_init(info)) - return 0; - - for (ct = current->signal->core_state->dumper.next; - ct; ct = ct->next) { - ets = kzalloc(sizeof(*ets), GFP_KERNEL); - if (!ets) - return 0; - - ets->thread = ct->task; - list_add(&ets->list, &info->thread_list); - } - - list_for_each_entry(ets, &info->thread_list, list) { - int sz; - - sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); - info->thread_status_size += sz; - } - /* now collect the dump for the current */ - memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); - elf_core_copy_task_regs(current, &info->prstatus->pr_reg); - - /* Set up header */ - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); - - /* - * Set up the notes in similar form to SVR4 core dumps made - * with info from their /proc. - */ - - fill_note(info->notes + 0, "CORE", NT_PRSTATUS, - sizeof(*info->prstatus), info->prstatus); - fill_psinfo(info->psinfo, current->group_leader, current->mm); - fill_note(info->notes + 1, "CORE", NT_PRPSINFO, - sizeof(*info->psinfo), info->psinfo); - - fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); - fill_auxv_note(info->notes + 3, current->mm); - info->numnote = 4; - - if (fill_files_note(info->notes + info->numnote, cprm) == 0) { - info->notes_files = info->notes + info->numnote; - info->numnote++; - } - - /* Try to dump the FPU. 
*/ - info->prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, info->fpu); - if (info->prstatus->pr_fpvalid) - fill_note(info->notes + info->numnote++, - "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); - return 1; -} - -static size_t get_note_info_size(struct elf_note_info *info) -{ - int sz = 0; - int i; - - for (i = 0; i < info->numnote; i++) - sz += notesize(info->notes + i); - - sz += info->thread_status_size; - - return sz; -} - -static int write_note_info(struct elf_note_info *info, - struct coredump_params *cprm) -{ - struct elf_thread_status *ets; - int i; - - for (i = 0; i < info->numnote; i++) - if (!writenote(info->notes + i, cprm)) - return 0; - - /* write out the thread status notes section */ - list_for_each_entry(ets, &info->thread_list, list) { - for (i = 0; i < ets->num_notes; i++) - if (!writenote(&ets->notes[i], cprm)) - return 0; - } - - return 1; -} - -static void free_note_info(struct elf_note_info *info) -{ - while (!list_empty(&info->thread_list)) { - struct list_head *tmp = info->thread_list.next; - list_del(tmp); - kfree(list_entry(tmp, struct elf_thread_status, list)); - } - - /* Free data possibly allocated by fill_files_note(): */ - if (info->notes_files) - kvfree(info->notes_files->data); - - kfree(info->prstatus); - kfree(info->psinfo); - kfree(info->notes); - kfree(info->fpu); -} - -#endif - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { -- cgit From 38ba2f11d9ce0e7c9444e57cb1bb418d1979534b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Sep 2022 23:24:11 -0400 Subject: [elf] get rid of get_note_info_size() it's trivial now... Signed-off-by: Al Viro --- fs/binfmt_elf.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3758a617c9d8..65d2d2ba4d9e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1946,11 +1946,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 1; } -static size_t get_note_info_size(struct elf_note_info *info) -{ - return info->size; -} - /* * Write all the notes for each thread. When writing the first thread, the * process-wide notes are interleaved after the first thread-specific note. @@ -2068,7 +2063,7 @@ static int elf_core_dump(struct coredump_params *cprm) /* Write notes phdr entry */ { - size_t sz = get_note_info_size(&info); + size_t sz = info.size; /* For cell spufs */ sz += elf_coredump_extra_notes_size(); -- cgit From cda2ed05aade303b7d89844a0333168c3484634a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Oct 2022 13:46:26 +0100 Subject: fs: simplify vfs_get_super Remove the pointless keying argument and associated enum and pass the fill_super callback and a "bool reconf" instead. Also mark the function static given that there are no users outside of super.c. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/super.c | 60 +++++++++--------------------------------------------------- 1 file changed, 9 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 6a82660e1adb..76f477c24c3d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1111,55 +1111,14 @@ static int test_single_super(struct super_block *s, struct fs_context *fc) return 1; } -/** - * vfs_get_super - Get a superblock with a search key set in s_fs_info. 
- * @fc: The filesystem context holding the parameters
- * @keying: How to distinguish superblocks
- * @fill_super: Helper to initialise a new superblock
- *
- * Search for a superblock and create a new one if not found. The search
- * criterion is controlled by @keying. If the search fails, a new superblock
- * is created and @fill_super() is called to initialise it.
- *
- * @keying can take one of a number of values:
- *
- * (1) vfs_get_single_super - Only one superblock of this type may exist on the
- *     system. This is typically used for special system filesystems.
- *
- * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
- *     distinct keys (where the key is in s_fs_info). Searching for the same
- *     key again will turn up the superblock for that key.
- *
- * (3) vfs_get_independent_super - Multiple superblocks may exist and are
- *     unkeyed. Each call will get a new superblock.
- *
- * A permissions check is made by sget_fc() unless we're getting a superblock
- * for a kernel-internal mount or a submount.
- */
-int vfs_get_super(struct fs_context *fc,
-		  enum vfs_get_super_keying keying,
-		  int (*fill_super)(struct super_block *sb,
-				    struct fs_context *fc))
+static int vfs_get_super(struct fs_context *fc, bool reconf,
+		int (*test)(struct super_block *, struct fs_context *),
+		int (*fill_super)(struct super_block *sb,
+				  struct fs_context *fc))
 {
-	int (*test)(struct super_block *, struct fs_context *);
 	struct super_block *sb;
 	int err;
 
-	switch (keying) {
-	case vfs_get_single_super:
-	case vfs_get_single_reconf_super:
-		test = test_single_super;
-		break;
-	case vfs_get_keyed_super:
-		test = test_keyed_super;
-		break;
-	case vfs_get_independent_super:
-		test = NULL;
-		break;
-	default:
-		BUG();
-	}
-
 	sb = sget_fc(fc, test, set_anon_super_fc);
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
@@ -1173,7 +1132,7 @@ int vfs_get_super(struct fs_context *fc,
 		fc->root = dget(sb->s_root);
 	} else {
 		fc->root = dget(sb->s_root);
-		if (keying == vfs_get_single_reconf_super) {
+		if (reconf) {
 			err = reconfigure_super(fc);
 			if (err < 0) {
 				dput(fc->root);
@@ -1189,13 +1148,12 @@ error:
 	deactivate_locked_super(sb);
 	return err;
 }
-EXPORT_SYMBOL(vfs_get_super);
 
 int get_tree_nodev(struct fs_context *fc,
 		  int (*fill_super)(struct super_block *sb,
 				    struct fs_context *fc))
 {
-	return vfs_get_super(fc, vfs_get_independent_super, fill_super);
+	return vfs_get_super(fc, false, NULL, fill_super);
 }
 EXPORT_SYMBOL(get_tree_nodev);
 
@@ -1203,7 +1161,7 @@ int get_tree_single(struct fs_context *fc,
 		  int (*fill_super)(struct super_block *sb,
 				    struct fs_context *fc))
 {
-	return vfs_get_super(fc, vfs_get_single_super, fill_super);
+	return vfs_get_super(fc, false, test_single_super, fill_super);
 }
 EXPORT_SYMBOL(get_tree_single);
 
@@ -1211,7 +1169,7 @@ int get_tree_single_reconf(struct fs_context *fc,
 		  int (*fill_super)(struct super_block *sb,
 				    struct fs_context *fc))
 {
-	return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super);
+	return vfs_get_super(fc, true, test_single_super, fill_super);
 }
 EXPORT_SYMBOL(get_tree_single_reconf);
 
@@ -1221,7 +1179,7 @@ int get_tree_keyed(struct fs_context *fc,
 		void *key)
 {
 	fc->s_fs_info = key;
-	return vfs_get_super(fc, vfs_get_keyed_super, fill_super);
+	return vfs_get_super(fc, false, test_keyed_super, fill_super);
 }
 EXPORT_SYMBOL(get_tree_keyed);
-- cgit
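For reference, the four get_tree_*() helpers now spell out what the removed keying enum used to select; condensed from the hunks above (a mapping summary, not new code):

	get_tree_nodev()         -> vfs_get_super(fc, false, NULL,              fill_super)	/* was vfs_get_independent_super */
	get_tree_single()        -> vfs_get_super(fc, false, test_single_super, fill_super)	/* was vfs_get_single_super */
	get_tree_single_reconf() -> vfs_get_super(fc, true,  test_single_super, fill_super)	/* was vfs_get_single_reconf_super */
	get_tree_keyed()         -> vfs_get_super(fc, false, test_keyed_super,  fill_super)	/* was vfs_get_keyed_super */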
("vfs: fix copy_file_range() regression in cross-fs copies") removed fallback to generic_copy_file_range() for cross-fs cases inside vfs_copy_file_range(). To preserve behavior of nfsd and ksmbd server-side-copy, the fallback to generic_copy_file_range() was added in nfsd and ksmbd code, but that call is missing sb_start_write(), fsnotify hooks and more. Ideally, nfsd and ksmbd would pass a flag to vfs_copy_file_range() that will take care of the fallback, but that code would be subtle and we got vfs_copy_file_range() logic wrong too many times already. Instead, add a flag to explicitly request vfs_copy_file_range() to perform only generic_copy_file_range() and let nfsd and ksmbd use this flag only in the fallback path. This choise keeps the logic changes to minimum in the non-nfsd/ksmbd code paths to reduce the risk of further regressions. Fixes: 868f9f2f8e00 ("vfs: fix copy_file_range() regression in cross-fs copies") Tested-by: Namjae Jeon Tested-by: Luis Henriques Signed-off-by: Amir Goldstein Signed-off-by: Al Viro --- fs/ksmbd/vfs.c | 6 +++--- fs/nfsd/vfs.c | 4 ++-- fs/read_write.c | 19 +++++++++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 8de970d6146f..94b8ed4ef870 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1794,9 +1794,9 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, ret = vfs_copy_file_range(src_fp->filp, src_off, dst_fp->filp, dst_off, len, 0); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_fp->filp, src_off, - dst_fp->filp, dst_off, - len, 0); + ret = vfs_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, len, + COPY_FILE_SPLICE); if (ret < 0) return ret; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f650afedd67f..5cf11cde51f8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -596,8 +596,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src, src_pos, dst, dst_pos, - count, 0); + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, + COPY_FILE_SPLICE); return ret; } diff --git a/fs/read_write.c b/fs/read_write.c index 328ce8cf9a85..24b9668d6377 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1388,6 +1388,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { + lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); + return do_splice_direct(file_in, &pos_in, file_out, &pos_out, len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); } @@ -1424,7 +1426,9 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. 
From db58653ce0c7cf4d155727852607106f890005c0 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Mon, 21 Nov 2022 16:29:37 +0900
Subject: zonefs: Fix active zone accounting

If a file zone transitions to the offline or readonly state from an
active state, we must clear the zone active flag and decrement the
active seq file counter. Do so in zonefs_account_active() using the new
zonefs inode flags ZONEFS_ZONE_OFFLINE and ZONEFS_ZONE_READONLY. These
flags are set if necessary in zonefs_check_zone_condition() based on the
result of the report zones operation after an IO error.

Fixes: 87c9ce3ffec9 ("zonefs: Add active seq file accounting")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal
Reviewed-by: Johannes Thumshirn
---
 fs/zonefs/super.c  | 11 +++++++++++
 fs/zonefs/zonefs.h |  6 ++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index f0e8a000f073..2c53fbb8d918 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -40,6 +40,13 @@ static void zonefs_account_active(struct inode *inode)
 	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
 		return;
 
+	/*
+	 * For zones that transitioned to the offline or readonly condition,
+	 * we only need to clear the active state.
+	 */
+	if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY))
+		goto out;
+
 	/*
 	 * If the zone is active, that is, if it is explicitly open or
 	 * partially written, check if it was already accounted as active.
@@ -53,6 +60,7 @@ static void zonefs_account_active(struct inode *inode)
 		return;
 	}
 
+out:
 	/* The zone is not active.
If it was, update the active count */ if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; @@ -324,6 +332,7 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, inode->i_flags |= S_IMMUTABLE; inode->i_mode &= ~0777; zone->wp = zone->start; + zi->i_flags |= ZONEFS_ZONE_OFFLINE; return 0; case BLK_ZONE_COND_READONLY: /* @@ -342,8 +351,10 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, zone->cond = BLK_ZONE_COND_OFFLINE; inode->i_mode &= ~0777; zone->wp = zone->start; + zi->i_flags |= ZONEFS_ZONE_OFFLINE; return 0; } + zi->i_flags |= ZONEFS_ZONE_READONLY; inode->i_mode &= ~0222; return i_size_read(inode); case BLK_ZONE_COND_FULL: diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 4b3de66c3233..1dbe78119ff1 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -39,8 +39,10 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } -#define ZONEFS_ZONE_OPEN (1 << 0) -#define ZONEFS_ZONE_ACTIVE (1 << 1) +#define ZONEFS_ZONE_OPEN (1U << 0) +#define ZONEFS_ZONE_ACTIVE (1U << 1) +#define ZONEFS_ZONE_OFFLINE (1U << 2) +#define ZONEFS_ZONE_READONLY (1U << 3) /* * In-memory inode data. -- cgit From de4eda9de2d957ef2d6a8365a01e26a435e958cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Sep 2022 20:25:47 -0400 Subject: use less confusing names for iov_iter direction initializers READ/WRITE proved to be actively confusing - the meanings are "data destination, as used with read(2)" and "data source, as used with write(2)", but people keep interpreting those as "we read data from it" and "we write data to it", i.e. exactly the wrong way. Call them ITER_DEST and ITER_SOURCE - at least that is harder to misinterpret... Signed-off-by: Al Viro --- fs/9p/vfs_addr.c | 4 ++-- fs/9p/vfs_dir.c | 2 +- fs/9p/xattr.c | 4 ++-- fs/afs/cmservice.c | 2 +- fs/afs/dir.c | 2 +- fs/afs/file.c | 4 ++-- fs/afs/internal.h | 4 ++-- fs/afs/rxrpc.c | 10 +++++----- fs/afs/write.c | 4 ++-- fs/aio.c | 4 ++-- fs/btrfs/ioctl.c | 4 ++-- fs/ceph/addr.c | 4 ++-- fs/ceph/file.c | 4 ++-- fs/cifs/connect.c | 6 +++--- fs/cifs/file.c | 4 ++-- fs/cifs/fscache.c | 4 ++-- fs/cifs/smb2ops.c | 4 ++-- fs/cifs/transport.c | 6 +++--- fs/coredump.c | 2 +- fs/erofs/fscache.c | 6 +++--- fs/fscache/io.c | 2 +- fs/fuse/ioctl.c | 4 ++-- fs/netfs/io.c | 6 +++--- fs/nfs/fscache.c | 4 ++-- fs/nfsd/vfs.c | 4 ++-- fs/ocfs2/cluster/tcp.c | 2 +- fs/orangefs/inode.c | 8 ++++---- fs/proc/vmcore.c | 6 +++--- fs/read_write.c | 12 ++++++------ fs/seq_file.c | 2 +- fs/splice.c | 10 +++++----- 31 files changed, 72 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 47b9a1122f34..a19891015f19 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -40,7 +40,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) size_t len = subreq->len - subreq->transferred; int total, err; - iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len); + iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len); total = p9_client_read(fid, pos, &to, &err); @@ -172,7 +172,7 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) len = min_t(loff_t, i_size - start, len); - iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len); + iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 000fbaae9b18..3bb95adc9619 100644 --- 
a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -109,7 +109,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) struct iov_iter to; int n; - iov_iter_kvec(&to, READ, &kvec, 1, buflen); + iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); if (err) diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 1f9298a4bd42..2807bb63f780 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -24,7 +24,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, struct iov_iter to; int err; - iov_iter_kvec(&to, READ, &kvec, 1, buffer_size); + iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buffer_size); attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { @@ -109,7 +109,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, struct iov_iter from; int retval, err; - iov_iter_kvec(&from, WRITE, &kvec, 1, value_len); + iov_iter_kvec(&from, ITER_SOURCE, &kvec, 1, value_len); p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 0a090d614e76..7dcd59693a0c 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -298,7 +298,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (call->count2 != call->count && call->count2 != 0) return afs_protocol_error(call, afs_eproto_cb_count); call->iter = &call->def_iter; - iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4); + iov_iter_discard(&call->def_iter, ITER_DEST, call->count2 * 3 * 4); call->unmarshall++; fallthrough; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 230c2d19116d..104df2964225 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -305,7 +305,7 @@ expand: req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages, + iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages, 0, i_size); req->iter = &req->def_iter; diff --git a/fs/afs/file.c b/fs/afs/file.c index d1cfb235c4b9..2eeab57df133 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -324,7 +324,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, + iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &fsreq->vnode->netfs.inode.i_mapping->i_pages, fsreq->pos, fsreq->len); @@ -346,7 +346,7 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) fsreq->len = folio_size(folio); fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages, + iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages, fsreq->pos, fsreq->len); ret = afs_fetch_data(fsreq->vnode, fsreq); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 723d162078a3..9ba7b68375c9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1301,7 +1301,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; - iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size); + iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) @@ -1319,7 +1319,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call) static inline 
void afs_extract_discard(struct afs_call *call, size_t size) { call->iov_len = size; - iov_iter_discard(&call->def_iter, READ, size); + iov_iter_discard(&call->def_iter, ITER_DEST, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index eccc3cd0cb70..c62939e5ea1f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -359,7 +359,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0); @@ -400,7 +400,7 @@ error_do_abort: RX_USER_ABORT, ret, "KSD"); } else { len = 0; - iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); @@ -485,7 +485,7 @@ static void afs_deliver_to_call(struct afs_call *call) ) { if (state == AFS_CALL_SV_AWAIT_ACK) { len = 0; - iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); + iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, &len, false, &remote_abort, @@ -822,7 +822,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -862,7 +862,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/fs/afs/write.c b/fs/afs/write.c index 9ebdd36eaf2f..08fd456dde67 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -609,7 +609,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, */ afs_write_to_cache(vnode, start, len, i_size, caching); - iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); ret = afs_store_data(vnode, &iter, start, false); } else { _debug("write discard %x @%llx [%llx]", len, start, i_size); @@ -1000,7 +1000,7 @@ int afs_launder_folio(struct folio *folio) bv[0].bv_page = &folio->page; bv[0].bv_offset = f; bv[0].bv_len = t - f; - iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); + iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len); trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); diff --git a/fs/aio.c b/fs/aio.c index 5b2ff20ad322..562916d85cba 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1552,7 +1552,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, if (unlikely(!file->f_op->read_iter)) return -EINVAL; - ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); + ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); @@ -1580,7 +1580,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, if 
(unlikely(!file->f_op->write_iter)) return -EINVAL; - ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); + ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d5dd8bed1488..a59c884c2cb0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5283,7 +5283,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } - ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; @@ -5382,7 +5382,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (args.len > args.unencoded_len - args.unencoded_offset) goto out_acct; - ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dcf701b05cc1..61f47debec5a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -288,7 +288,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); if (err == 0) err = -EFAULT; @@ -327,7 +327,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); if (err < 0) { dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 04fd34557de8..6f9580defb2b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1161,7 +1161,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, + iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs, osd_data->num_bvecs, len); iov_iter_advance(&i, rc); iov_iter_zero(zlen, &i); @@ -1400,7 +1400,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int zlen = min_t(size_t, len - ret, size - pos - ret); - iov_iter_bvec(&i, READ, bvecs, num_pages, len); + iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len); iov_iter_advance(&i, ret); iov_iter_zero(zlen, &i); ret += zlen; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9db9527c61cf..e80252a83225 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -759,7 +759,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg = {}; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; - iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); + iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -774,7 +774,7 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) * and cifs_readv_from_socket sets msg_control and msg_controllen * so little to initialize in 
struct msghdr */ - iov_iter_discard(&smb_msg.msg_iter, READ, to_read); + iov_iter_discard(&smb_msg.msg_iter, ITER_DEST, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -786,7 +786,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, struct msghdr smb_msg = {}; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; - iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); + iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd9698209930..209dfc06fd6d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3532,7 +3532,7 @@ static ssize_t __cifs_writev( ctx->iter = *from; ctx->len = len; } else { - rc = setup_aio_ctx_iter(ctx, from, WRITE); + rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE); if (rc) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; @@ -4276,7 +4276,7 @@ static ssize_t __cifs_readv( ctx->iter = *to; ctx->len = len; } else { - rc = setup_aio_ctx_iter(ctx, to, READ); + rc = setup_aio_ctx_iter(ctx, to, ITER_DEST); if (rc) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a1751b956318..f6f3a6b75601 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -150,7 +150,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -180,7 +180,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bfaafd02fb1f..32b3877b538a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4723,13 +4723,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - iov_iter_bvec(&iter, WRITE, bvec, npages, data_len); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); iov.iov_base = buf + data_offset; iov.iov_len = data_len; - iov_iter_kvec(&iter, WRITE, &iov, 1, data_len); + iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len); } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 575fa8f58342..3851d0aaa288 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -347,7 +347,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, .iov_base = &rfc1002_marker, .iov_len = 4 }; - iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); + iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) goto unmask; @@ -368,7 +368,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size += iov[i].iov_len; } - iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size); + 
iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, iov, n_vec, size); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -384,7 +384,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rqst_page_get_length(&rqst[j], i, &bvec.bv_len, &bvec.bv_offset); - iov_iter_bvec(&smb_msg.msg_iter, WRITE, + iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE, &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..095ed821c8ac 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -853,7 +853,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) if (dump_interrupted()) return 0; pos = file->f_pos; - iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) return 0; diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index af5ed6b9c54d..4c837be3b6e3 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -194,7 +194,7 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, atomic_inc(&rreq->nr_outstanding); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, start + done, subreq->len); ret = fscache_read(cres, subreq->start, &iter, @@ -290,7 +290,7 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (IS_ERR(src)) return PTR_ERR(src); - iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE); + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE); if (copy_to_iter(src + offset, size, &iter) != size) { erofs_put_metabuf(&buf); return -EFAULT; @@ -302,7 +302,7 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (!(map.m_flags & EROFS_MAP_MAPPED)) { count = len; - iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); return count; } diff --git a/fs/fscache/io.c b/fs/fscache/io.c index 3af3b08a9bb3..0d2b8dec8f82 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -286,7 +286,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, * taken into account. 
*/ - iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); fscache_write(cres, start, &iter, fscache_wreq_done, wreq); return; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 61d8afcb10a3..fcce94ace2c2 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -255,7 +255,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.in_pages = true; err = -EFAULT; - iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) @@ -324,7 +324,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; err = -EFAULT; - iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index e374767d1b68..7f753380e047 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -23,7 +23,7 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) { struct iov_iter iter; - iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); iov_iter_zero(iov_iter_count(&iter), &iter); @@ -49,7 +49,7 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq, struct iov_iter iter; netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); @@ -208,7 +208,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) continue; } - iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, subreq->start, subreq->len); atomic_inc(&rreq->nr_copy_ops); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e861d7bae305..e731c00a9fcb 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -252,7 +252,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -282,7 +282,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f650afedd67f..51f453baa952 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -942,7 +942,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, READ, vec, vlen, *count); + iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count); host_err = 
vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1032,7 +1032,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && !use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); + iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index f660c0dbdb63..785cabd71d67 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -900,7 +900,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { struct kvec vec = { .iov_len = len, .iov_base = data, }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len); return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7a8c0c6e698d..b3bbb5a5787a 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -53,7 +53,7 @@ static int orangefs_writepage_locked(struct page *page, bv.bv_len = wlen; bv.bv_offset = off % PAGE_SIZE; WARN_ON(wlen == 0); - iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); + iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, len, wr, NULL, NULL); @@ -112,7 +112,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, else ow->bv[i].bv_offset = 0; } - iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len); + iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); WARN_ON(ow->off >= len); if (ow->off + ow->len > len) @@ -270,7 +270,7 @@ static void orangefs_readahead(struct readahead_control *rac) offset = readahead_pos(rac); i_pages = &rac->mapping->i_pages; - iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac)); + iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac)); /* read in the pages. 
*/ if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, @@ -303,7 +303,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) bv.bv_page = &folio->page; bv.bv_len = folio_size(folio); bv.bv_offset = 0; - iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio)); + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio)); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, folio_size(folio), inode->i_size, NULL, NULL, file); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f2aa86c421f2..5aa527ca6dbe 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -199,7 +199,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, false); } @@ -212,7 +212,7 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); @@ -437,7 +437,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) offset = (loff_t) index << PAGE_SHIFT; kvec.iov_base = page_address(page); kvec.iov_len = PAGE_SIZE; - iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE); rc = __read_vmcore(&iter, &offset); if (rc < 0) { diff --git a/fs/read_write.c b/fs/read_write.c index 328ce8cf9a85..37c2f28b51e8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -384,7 +384,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); - iov_iter_ubuf(&iter, READ, buf, len); + iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -424,7 +424,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; - iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len); + iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) @@ -486,7 +486,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? 
*ppos : 0); - iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len); + iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -533,7 +533,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* @@ -911,7 +911,7 @@ static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, struct iov_iter iter; ssize_t ret; - ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { ret = do_iter_read(file, &iter, pos, flags); kfree(iov); @@ -928,7 +928,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, struct iov_iter iter; ssize_t ret; - ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { file_start_write(file); ret = do_iter_write(file, &iter, pos, flags); diff --git a/fs/seq_file.c b/fs/seq_file.c index 9456a2032224..f5fdaf3b1572 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -156,7 +156,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) ssize_t ret; init_sync_kiocb(&kiocb, file); - iov_iter_init(&iter, READ, &iov, 1, size); + iov_iter_init(&iter, ITER_DEST, &iov, 1, size); kiocb.ki_pos = *ppos; ret = seq_read_iter(&kiocb, &iter); diff --git a/fs/splice.c b/fs/splice.c index 0878b852b355..5969b7a1d353 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -303,7 +303,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; int ret; - iov_iter_pipe(&to, READ, pipe, len); + iov_iter_pipe(&to, ITER_DEST, pipe, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); @@ -682,7 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, n++; } - iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); + iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; @@ -1263,9 +1263,9 @@ static int vmsplice_type(struct fd f, int *type) if (!f.file) return -EBADF; if (f.file->f_mode & FMODE_WRITE) { - *type = WRITE; + *type = ITER_SOURCE; } else if (f.file->f_mode & FMODE_READ) { - *type = READ; + *type = ITER_DEST; } else { fdput(f); return -EBADF; @@ -1314,7 +1314,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, if (!iov_iter_count(&iter)) error = 0; - else if (iov_iter_rw(&iter) == WRITE) + else if (type == ITER_SOURCE) error = vmsplice_to_pipe(f.file, &iter, flags); else error = vmsplice_to_user(f.file, &iter, flags); -- cgit From aa997990080877c3a6dab9f25609073816378b43 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Nov 2022 11:20:47 -0800 Subject: fscrypt: add comment for fscrypt_valid_enc_modes_v1() Make it clear that nothing new should be added to this function. 
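For context, fscrypt_valid_enc_modes_v2() is where new mode combinations are expected to land; it layers on top of the v1 helper, roughly like this (a sketch of code outside this diff, shown only for orientation):

	static bool fscrypt_valid_enc_modes_v2(u32 contents_mode,
					       u32 filenames_mode)
	{
		/* New combinations are added here... */
		if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
		    filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
			return true;
		/* ...while every v1 combination stays valid for v2 policies. */
		return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
	}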
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221125192047.18916-1-ebiggers@kernel.org --- fs/crypto/policy.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 46757c3052ef..84fa51604b15 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -61,6 +61,13 @@ fscrypt_get_dummy_policy(struct super_block *sb) return sb->s_cop->get_dummy_policy(sb); } +/* + * Return %true if the given combination of encryption modes is supported for v1 + * (and later) encryption policies. + * + * Do *not* add anything new here, since v1 encryption policies are deprecated. + * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only. + */ static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && -- cgit From cf260db405b1a159e9076220b40f5f02ac480525 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 25 Nov 2022 17:13:57 +0800 Subject: btrfs: replace INT_LIMIT(loff_t) with OFFSET_MAX OFFSET_MAX is self-explanatory and more readable. Signed-off-by: Zhen Lei Acked-by: David Sterba Reviewed-by: Eric Biggers Signed-off-by: Al Viro --- fs/btrfs/ordered-data.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e54f8280031f..100d9f4836b1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -761,11 +761,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *ordered; if (start + len < start) { - orig_end = INT_LIMIT(loff_t); + orig_end = OFFSET_MAX; } else { orig_end = start + len - 1; - if (orig_end > INT_LIMIT(loff_t)) - orig_end = INT_LIMIT(loff_t); + if (orig_end > OFFSET_MAX) + orig_end = OFFSET_MAX; } /* start IO across the range first to instantiate any delalloc -- cgit From 0eb43812c0270ee3d005ff32f91f7d0a6c4943af Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 26 Aug 2022 19:44:44 -0400 Subject: NFS: Clear the file access cache upon login POSIX typically only refreshes the user's supplementary group information upon login. Since NFS servers may often refresh their concept of the user's supplementary group membership at their own cadence, it is possible for the NFS client's access cache to become stale due to the user's group membership changing on the server after the user has already logged in on the client. While it is reasonable to expect that such group membership changes are rare, and that we do not want to optimise the cache to accommodate them, it is also not unreasonable for the user to expect that if they log out and log back in again, the staleness will clear up.
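The patch below approximates "login time" as the start time of the oldest ancestor process that still runs with the user's credentials, and treats any access cache entry stamped before that time as expired. The expiry test relies on the standard wraparound-safe comparison for 64-bit monotonic nanosecond timestamps; a minimal sketch of the idiom (the helper name here is invented for illustration):

	/* true iff timestamp @a is later than @b, even across wraparound */
	static inline bool ns_after(u64 a, u64 b)
	{
		return (s64)(a - b) > 0;
	}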
Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f594dac436a7..bccb04af38ae 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2948,9 +2948,28 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co return NULL; } +static u64 nfs_access_login_time(const struct task_struct *task, + const struct cred *cred) +{ + const struct task_struct *parent; + u64 ret; + + rcu_read_lock(); + for (;;) { + parent = rcu_dereference(task->real_parent); + if (parent == task || cred_fscmp(parent->cred, cred) != 0) + break; + task = parent; + } + ret = task->start_time; + rcu_read_unlock(); + return ret; +} + static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); + u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; bool retry = true; int err; @@ -2978,6 +2997,9 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred * spin_lock(&inode->i_lock); retry = false; } + err = -ENOENT; + if ((s64)(login_time - cache->timestamp) > 0) + goto out; *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; @@ -3057,6 +3079,7 @@ static void nfs_access_add_rbtree(struct inode *inode, else goto found; } + set->timestamp = ktime_get_ns(); rb_link_node(&set->rb_node, parent, p); rb_insert_color(&set->rb_node, root_node); list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); -- cgit From eef7314caf2d73a94b68ba293cd105154d3a664e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2022 16:44:47 -0400 Subject: NFSv4.2: Clear FATTR4_WORD2_SECURITY_LABEL when done decoding We need to clear the FATTR4_WORD2_SECURITY_LABEL bitmap flag irrespective of whether or not the label is too long. Fixes: aa9c2669626c ("NFS: Client implementation of Labeled-NFS") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index acfe5f4bda48..8c5298e37f0f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4234,6 +4234,7 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) return -EIO; + bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; if (len < NFS4_MAXLABELLEN) { if (label) { if (label->len) { @@ -4246,7 +4247,6 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, label->lfs = lfs; status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; } - bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); -- cgit From c8a62f440229ae7a10874776344dfcc17d860336 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2022 17:41:30 -0400 Subject: NFSv4.2: Always decode the security label If the server returns a reply that includes a security label, then we must decode it whether or not we can store the results. 
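The general rule at work here: an XDR decoder must consume an attribute's bytes from the stream even when it has nowhere to store the result, otherwise every attribute that follows is parsed at the wrong offset. A schematic of the pattern (illustrative only, not taken from this patch):

	p = xdr_inline_decode(xdr, len);	/* always advance the stream */
	if (unlikely(!p))
		return -EIO;
	if (label)				/* storing the result is optional, */
		memcpy(label->label, p, len);	/* consuming the bytes is not */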
Fixes: 1e2f67da8931 ("NFS: Remove the nfs4_label argument from decode_getattr_*() functions") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8c5298e37f0f..9103e022376a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4755,12 +4755,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, if (status < 0) goto xdr_error; - if (fattr->label) { - status = decode_attr_security_label(xdr, bitmap, fattr->label); - if (status < 0) - goto xdr_error; - fattr->valid |= status; - } + status = decode_attr_security_label(xdr, bitmap, fattr->label); + if (status < 0) + goto xdr_error; + fattr->valid |= status; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); -- cgit From 43c1031f7110967c240cb6e922adcfc4b8899183 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2022 18:21:14 -0400 Subject: NFSv4.2: Fix a memory stomp in decode_attr_security_label We must not change the value of label->len if it is zero, since that indicates we stored a label. Fixes: b4487b935452 ("nfs: Fix getxattr kernel panic and memory overflow") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9103e022376a..deec76cf5afe 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4236,12 +4236,10 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, return -EIO; bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; if (len < NFS4_MAXLABELLEN) { - if (label) { - if (label->len) { - if (label->len < len) - return -ERANGE; - memcpy(label->label, p, len); - } + if (label && label->len) { + if (label->len < len) + return -ERANGE; + memcpy(label->label, p, len); label->len = len; label->pi = pi; label->lfs = lfs; -- cgit From c528f70f504434eaff993a5ddd52203a2010d51f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2022 13:12:11 -0400 Subject: NFSv4.2: Fix initialisation of struct nfs4_label The call to nfs4_label_init_security() should return a fully initialised label. 
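Without this, the helper can hand back a pointer to an on-stack label whose fields were never written, and the rest of the open/create path then encodes stack garbage. Schematically (illustrative only):

	struct nfs4_label l;	/* four uninitialised fields */

	label = nfs4_label_init_security(dir, dentry, sattr, &l);
	/* if the helper returned &l without filling every field, the
	 * caller would go on to encode whatever was on the stack */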
Fixes: aa9c2669626c ("NFS: Client implementation of Labeled-NFS") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 86ed5c0142c3..98a867092039 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -122,6 +122,11 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) return NULL; + label->lfs = 0; + label->pi = 0; + label->len = 0; + label->label = NULL; + err = security_dentry_init_security(dentry, sattr->ia_mode, &dentry->d_name, NULL, (void **)&label->label, &label->len); @@ -3796,7 +3801,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr, int *opened) { struct nfs4_state *state; - struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; + struct nfs4_label l, *label; label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); @@ -4682,7 +4687,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_label l, *ilabel = NULL; + struct nfs4_label l, *ilabel; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; @@ -5033,7 +5038,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5074,7 +5079,7 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5193,7 +5198,7 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); -- cgit From 85aa8ddc3818718208c3cfdfda9c8c908c9dead1 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 20 Sep 2022 13:00:21 -0400 Subject: NFS: Trigger the "ls -l" readdir heuristic sooner Since commit 1a34c8c9a49e ("NFS: Support larger readdir buffers") updated dtsize, and with recent improvements to the READDIRPLUS helper heuristic, the heuristic may not trigger until many dentries are emitted to userspace. This will cause many thousands of GETATTR calls for "ls -l" when the directory's pagecache has already been populated. This manifests as poor performance for long directory listings after an initially fast "ls -l". Fix this by emitting only 17 entries for any first pass through the NFS directory's ->iterate_shared(), which allows userspace to prime the counters for the heuristic. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bccb04af38ae..ea1ceffa1d3a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1074,6 +1074,8 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) return res; } +#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) + /* * Once we've found the start of the dirent within a page: fill 'er up...
*/ @@ -1083,6 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, struct file *file = desc->file; struct nfs_cache_array *array; unsigned int i; + bool first_emit = !desc->dir_cookie; array = kmap_local_page(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -1106,6 +1109,10 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) { + desc->eob = true; + break; + } } if (array->page_is_eof) desc->eof = !desc->eob; @@ -1187,8 +1194,6 @@ out: return status; } -#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) - static bool nfs_readdir_handle_cache_misses(struct inode *inode, struct nfs_readdir_descriptor *desc, unsigned int cache_misses, -- cgit From e83458fce080dc23c25353a1af90bfecf79c7369 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2022 16:50:12 -0400 Subject: NFSv4: Fix a credential leak in _nfs4_discover_trunking() Fixes: 4f40a5b55446 ("NFSv4: Add an fattr allocation to _nfs4_discover_trunking()") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 98a867092039..bd89c7f06952 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4018,7 +4018,7 @@ static int _nfs4_discover_trunking(struct nfs_server *server, page = alloc_page(GFP_KERNEL); if (!page) - return -ENOMEM; + goto out_put_cred; locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (!locations) goto out_free; @@ -4040,6 +4040,8 @@ out_free_2: kfree(locations); out_free: __free_page(page); +out_put_cred: + put_cred(cred); return status; } -- cgit From 51069e4aef6257b0454057359faed0ab0c9af083 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Nov 2022 13:20:01 -0400 Subject: NFSv4: Fix a deadlock between nfs4_open_recover_helper() and delegreturn If we're asked to recover open state while a delegation return is outstanding, then the state manager thread cannot use a cached open, so if the server returns a delegation, we can end up deadlocked behind the pending delegreturn. To avoid this problem, let's just ask the server not to give us a delegation unless we're explicitly reclaiming one. 
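The "ask for no delegation" part works because the share access WANT bits are derived from the open flags: nfs4_map_atomic_open_share() translates O_DIRECT into NFS4_SHARE_WANT_NO_DELEG on servers that support it, roughly (an excerpt-style sketch, not part of this diff):

	/* Want no delegation if we're using O_DIRECT */
	if (openflags & O_DIRECT)
		res |= NFS4_SHARE_WANT_NO_DELEG;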
Fixes: be36e185bd26 ("NFSv4: nfs4_open_recover_helper() must set share access") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bd89c7f06952..e51044a5f550 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2131,18 +2131,18 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context } static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, - fmode_t fmode) + fmode_t fmode) { struct nfs4_state *newstate; + struct nfs_server *server = NFS_SB(opendata->dentry->d_sb); + int openflags = opendata->o_arg.open_flags; int ret; if (!nfs4_mode_match_open_stateid(opendata->state, fmode)) return 0; - opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; - opendata->o_arg.share_access = nfs4_map_atomic_open_share( - NFS_SB(opendata->dentry->d_sb), - fmode, 0); + opendata->o_arg.share_access = + nfs4_map_atomic_open_share(server, fmode, openflags); memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); @@ -2724,10 +2724,15 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s struct nfs4_opendata *opendata; int ret; - opendata = nfs4_open_recoverdata_alloc(ctx, state, - NFS4_OPEN_CLAIM_FH); + opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); + /* + * We're not recovering a delegation, so ask for no delegation. + * Otherwise the recovery thread could deadlock with an outstanding + * delegation return. + */ + opendata->o_arg.open_flags = O_DIRECT; ret = nfs4_open_recover(opendata, state); if (ret == -ESTALE) d_drop(ctx->dentry); -- cgit From 35e3b6ae84935d0d7ff76cbdaa83411b0ad5e471 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Nov 2022 17:30:39 -0500 Subject: NFS: Fix an Oops in nfs_d_automount() When mounting from a NFSv4 referral, path->dentry can end up being a negative dentry, so derive the struct nfs_server from the dentry itself instead. Fixes: 2b0143b5c986 ("VFS: normal filesystems (and lustre): d_inode() annotations") Signed-off-by: Trond Myklebust --- fs/nfs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2f336ace7555..88a23af2bd5c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -147,7 +147,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fs_context *ctx; struct fs_context *fc; struct vfsmount *mnt = ERR_PTR(-ENOMEM); - struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); + struct nfs_server *server = NFS_SB(path->dentry->d_sb); struct nfs_client *client = server->nfs_client; int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); int ret; -- cgit From 5776a9cd2a66cdc037c8e6ee34cde40ee768893d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Nov 2022 11:11:53 -0500 Subject: NFS: Fix a race in nfs_call_unlink() We should check that the filehandles match before transferring the sillyrename data to the newly looked-up dentry in case the name was reused on the server. 
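For reference, nfs_compare_fh() follows memcmp() conventions and returns 0 only when the two filehandles are identical, so the added check lets the transfer go ahead only while the alias still names the same file object on the server. A minimal sketch of the guard (the helper below is hypothetical, standing in for the transfer logic):

	/* 0 means the filehandles are identical */
	if (nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) == 0)
		transfer_sillyrename_data(alias);	/* hypothetical helper */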
Signed-off-by: Trond Myklebust --- fs/nfs/unlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 9697cd5d2561..150a953a8be9 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -139,6 +139,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf */ spin_lock(&alias->d_lock); if (d_really_is_positive(alias) && + !nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; -- cgit From 672e4268b2863d7e4978dfed29552b31c2f9bd4e Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 28 Nov 2022 11:33:05 +0100 Subject: ovl: fix use inode directly in rcu-walk mode ovl_dentry_revalidate_common() can be called in rcu-walk mode. As the documentation says, "in rcu-walk mode, d_parent and d_inode should not be used without care". Check the inode here to protect access under rcu-walk mode. Fixes: bccece1ead36 ("ovl: allow remote upper") Reported-and-tested-by: syzbot+a4055c78774bbf3498bb@syzkaller.appspotmail.com Signed-off-by: Chen Zhongjin Cc: # v5.7 Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a29a8afe9b26..3d14a3f1465d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -139,11 +139,16 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { struct ovl_entry *oe = dentry->d_fsdata; + struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; int ret = 1; - upper = ovl_dentry_upper(dentry); + /* Careful in RCU mode */ + if (!inode) + return -ECHILD; + + upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); -- cgit From 27e714c007e4ad01837bf0fac5c11913a38d7695 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Nov 2022 03:17:17 +0000 Subject: ext2: unbugger ext2_empty_dir() In 27cfa258951a "ext2: fix fs corruption when trying to remove a non-empty directory with IO error" a funny thing has happened: - page = ext2_get_page(inode, i, dir_has_error, &page_addr); + page = ext2_get_page(inode, i, 0, &page_addr); - if (IS_ERR(page)) { - dir_has_error = 1; - continue; - } + if (IS_ERR(page)) + goto not_empty; And at not_empty: we hit ext2_put_page(page, page_addr), which does put_page(page). Which, unless I'm very mistaken, should oops immediately when given ERR_PTR(-E...) as page. OK, shit happens, insufficiently tested patches included. But when the commit in question describes the fault-injection test that exercised that particular failure exit... Ow.
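The oops is the generic ERR_PTR pitfall: an error pointer is a small negative value cast to a pointer, not a real struct page address, so the reference-count access inside put_page() dereferences unmapped memory. Schematically:

	struct page *page = ERR_PTR(-EIO);	/* (void *)-5, not a page */

	if (!IS_ERR(page))			/* the guard this path lacked */
		put_page(page);			/* put_page(ERR_PTR(...)) oopses */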
CC: stable@vger.kernel.org Fixes: 27cfa258951a ("ext2: fix fs corruption when trying to remove a non-empty directory with IO error") Signed-off-by: Al Viro Signed-off-by: Jan Kara --- fs/ext2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 6fa714dbee84..e5cbc27ba459 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -684,7 +684,7 @@ int ext2_empty_dir (struct inode * inode) page = ext2_get_page(inode, i, 0, &page_addr); if (IS_ERR(page)) - goto not_empty; + return 0; kaddr = page_addr; de = (ext2_dirent *)kaddr; -- cgit From 456b59e757b0c558df550764a4fd5ae6877e93f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2022 17:03:11 +0000 Subject: ovl: update ->f_iocb_flags when ovl_change_flags() modifies ->f_flags ovl_change_flags() is an open-coded variant of fs/fcntl.c:setfl() and it got missed by commit 164f4064ca81 ("keep iocb_flags() result cached in struct file"); the same change applies there. Reported-by: Pierre Labastie Fixes: 164f4064ca81 ("keep iocb_flags() result cached in struct file") Cc: # v6.0 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216738 Signed-off-by: Al Viro Signed-off-by: Miklos Szeredi --- fs/overlayfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a1a22f58ba18..dd688a842b0b 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -96,6 +96,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags) spin_lock(&file->f_lock); file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags; + file->f_iocb_flags = iocb_flags(file); spin_unlock(&file->f_lock); return 0; -- cgit From b41b98e12a954c306e8eb9527ada0a70db339683 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 28 Nov 2022 20:05:49 +0800 Subject: fs/ext2: Fix code indentation Code indented with spaces instead of tabs is easily misread when viewed with ts=4. It is better to replace 8 spaces with one tab.
Signed-off-by: Rong Tao Signed-off-by: Jan Kara --- fs/ext2/balloc.c | 12 ++++++------ fs/ext2/super.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 5dc0a31f4a08..eca60b747c6b 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -667,7 +667,7 @@ ext2_try_to_allocate(struct super_block *sb, int group, { ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group); ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group); - ext2_grpblk_t start, end; + ext2_grpblk_t start, end; unsigned long num = 0; start = 0; @@ -1481,11 +1481,11 @@ unsigned long ext2_count_free_blocks (struct super_block * sb) desc_count, bitmap_count); return bitmap_count; #else - for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { - desc = ext2_get_group_desc (sb, i, NULL); - if (!desc) - continue; - desc_count += le16_to_cpu(desc->bg_free_blocks_count); + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc(sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); } return desc_count; #endif diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 03f2af98b1b4..69c88facfe90 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1648,7 +1648,7 @@ static int __init init_ext2_fs(void) err = init_inodecache(); if (err) return err; - err = register_filesystem(&ext2_fs_type); + err = register_filesystem(&ext2_fs_type); if (err) goto out; return 0; -- cgit From eeadcb75794516839078c28b3730132aeb700ce6 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 13 Sep 2022 14:01:51 -0400 Subject: NFSD: Simplify READ_PLUS Chuck had suggested reverting READ_PLUS so it returns a single DATA segment covering the requested read range. This prepares the server for a future "sparse read" function so support can easily be added without needing to rip out the old READ_PLUS code at the same time. Signed-off-by: Anna Schumaker Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 139 +++++++++++++----------------------------------------- 1 file changed, 32 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index bcfeb1a922c0..1e4990fb1692 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4767,79 +4767,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof, - loff_t *pos) + struct nfsd4_read *read) { - struct xdr_stream *xdr = resp->xdr; + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); struct file *file = read->rd_nf->nf_file; - int starting_len = xdr->buf->len; - loff_t hole_pos; - __be32 nfserr; - __be32 *p, tmp; - __be64 tmp64; - - hole_pos = pos ? 
*pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); - if (hole_pos > read->rd_offset) - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); + struct xdr_stream *xdr = resp->xdr; + unsigned long maxcount; + __be32 nfserr, *p; /* Content type, offset, byte count */ p = xdr_reserve_space(xdr, 4 + 8 + 4); if (!p) - return nfserr_resource; + return nfserr_io; + if (resp->xdr->buf->page_len && splice_ok) { + WARN_ON_ONCE(splice_ok); + return nfserr_serverfault; + } - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); - if (read->rd_vlen < 0) - return nfserr_resource; + maxcount = min_t(unsigned long, read->rd_length, + (xdr->buf->buflen - xdr->buf->len)); - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); + if (file->f_op->splice_read && splice_ok) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); - - tmp = htonl(NFS4_CONTENT_DATA); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp64 = cpu_to_be64(read->rd_offset); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); - tmp = htonl(*maxcount); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); - - tmp = xdr_zero; - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, - xdr_pad_size(*maxcount)); - return nfs_ok; -} - -static __be32 -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof) -{ - struct file *file = read->rd_nf->nf_file; - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); - loff_t f_size = i_size_read(file_inode(file)); - unsigned long count; - __be32 *p; - - if (data_pos == -ENXIO) - data_pos = f_size; - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); - count = data_pos - read->rd_offset; - /* Content type, offset, byte count */ - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); - if (!p) - return nfserr_resource; - - *p++ = htonl(NFS4_CONTENT_HOLE); + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); p = xdr_encode_hyper(p, read->rd_offset); - p = xdr_encode_hyper(p, count); + *p = cpu_to_be32(read->rd_length); - *eof = (read->rd_offset + count) >= f_size; - *maxcount = min_t(unsigned long, count, *maxcount); return nfs_ok; } @@ -4847,69 +4805,36 @@ static __be32 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { - unsigned long maxcount, count; + struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; - struct file *file; int starting_len = xdr->buf->len; - int last_segment = xdr->buf->len; - int segments = 0; - __be32 *p, tmp; - bool is_data; - loff_t pos; - u32 eof; + u32 segments = 0; + __be32 *p; if (nfserr) return nfserr; - file = read->rd_nf->nf_file; /* eof flag, segment count */ p = xdr_reserve_space(xdr, 4 + 4); if (!p) - return nfserr_resource; + return nfserr_io; xdr_commit_encode(xdr); - maxcount = min_t(unsigned long, read->rd_length, - (xdr->buf->buflen - xdr->buf->len)); - count = maxcount; - - eof = read->rd_offset >= i_size_read(file_inode(file)); - if (eof) + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); + if 
(read->rd_eof) goto out; - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); - is_data = pos > read->rd_offset; - - while (count > 0 && !eof) { - maxcount = count; - if (is_data) - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, - segments == 0 ? &pos : NULL); - else - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); - if (nfserr) - goto out; - count -= maxcount; - read->rd_offset += maxcount; - is_data = !is_data; - last_segment = xdr->buf->len; - segments++; - } - -out: - if (nfserr && segments == 0) + nfserr = nfsd4_encode_read_plus_data(resp, read); + if (nfserr) { xdr_truncate_encode(xdr, starting_len); - else { - if (nfserr) { - xdr_truncate_encode(xdr, last_segment); - nfserr = nfs_ok; - eof = 0; - } - tmp = htonl(eof); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp = htonl(segments); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + return nfserr; } + segments++; + +out: + p = xdr_encode_bool(p, read->rd_eof); + *p = cpu_to_be32(segments); return nfserr; } -- cgit From 69eed23baf877bbb1f14d7f4df54f89807c9ee2a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 10 Oct 2022 21:24:23 +0100 Subject: NFSD: Remove redundant assignment to variable host_err Variable host_err is assigned a value that is never read; it is re-assigned in every execution path of the following switch statement. The assignment is redundant and can be removed. Cleans up clang-scan warning: warning: Value stored to 'host_err' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 849a720ab43f..d6d90d63485d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1305,7 +1305,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode &= ~current_umask(); err = 0; - host_err = 0; switch (type) { case S_IFREG: host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); -- cgit From ea5021e911d3479346a75ac9b7d9dcd751b0fb99 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 16 Oct 2022 11:47:02 -0400 Subject: NFSD: Finish converting the NFSv2 GETACL result encoder The xdr_stream conversion inadvertently left some code that set the page_len of the send buffer. The XDR stream encoders should handle this automatically now. This oversight adds garbage past the end of the Reply message. Clients typically ignore the garbage, but NFSD does not need to send it, as it leaks stale memory contents onto the wire. Fixes: f8cba47344f7 ("NFSD: Update the NFSv2 GETACL result encoder to use struct xdr_stream") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs2acl.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 13e6e6897f6c..65d4511b7af0 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -246,7 +246,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; struct inode *inode; - int w; if (!svcxdr_encode_stat(xdr, resp->status)) return false; @@ -260,15 +259,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return false; - rqstp->rq_res.page_len = w = nfsacl_size( - (resp->mask & NFS_ACL) ?
resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); - while (w > 0) { - if (!*(rqstp->rq_next_page++)) - return true; - w -= PAGE_SIZE; - } - if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, resp->mask & NFS_ACL, 0)) return false; -- cgit From 841fd0a3cb490eae5dfd262eccb8c8b11d57f8b8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 16 Oct 2022 11:47:08 -0400 Subject: NFSD: Finish converting the NFSv3 GETACL result encoder For some reason, the NFSv2 GETACL result encoder was fully converted to use the new nfs_stream_encode_acl(), but the NFSv3 equivalent was not similarly converted. Fixes: 20798dfe249a ("NFSD: Update the NFSv3 GETACL result encoder to use struct xdr_stream") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs3acl.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 2fb9ee356455..a34a22e272ad 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -171,11 +171,7 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; - struct kvec *head = rqstp->rq_res.head; struct inode *inode; - unsigned int base; - int n; - int w; if (!svcxdr_encode_nfsstat3(xdr, resp->status)) return false; @@ -187,26 +183,12 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return false; - base = (char *)xdr->p - (char *)head->iov_base; - - rqstp->rq_res.page_len = w = nfsacl_size( - (resp->mask & NFS_ACL) ? resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); - while (w > 0) { - if (!*(rqstp->rq_next_page++)) - return false; - w -= PAGE_SIZE; - } - - n = nfsacl_encode(&rqstp->rq_res, base, inode, - resp->acl_access, - resp->mask & NFS_ACL, 0); - if (n > 0) - n = nfsacl_encode(&rqstp->rq_res, base + n, inode, - resp->acl_default, - resp->mask & NFS_DFACL, - NFS_ACL_DEFAULT); - if (n <= 0) + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, + resp->mask & NFS_ACL, 0)) + return false; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT)) return false; break; default: -- cgit From 8e823bafff2308753d430566256c83d8085952da Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:54 -0400 Subject: nfsd: ignore requests to disable unsupported versions The kernel currently errors out if you attempt to enable or disable a version that it doesn't recognize. Change it to ignore attempts to disable an unrecognized version. If we don't support it, then there is no harm in doing so. 
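With the change, the default arm of the version switch distinguishes the two directions; schematically (cmd is NFSD_SET for a "+N" token and NFSD_CLEAR for a "-N" token, per the cmd assignment visible in __write_versions()):

	default:
		/* enabling an unknown version still fails... */
		if (cmd == NFSD_SET)
			return -EINVAL;
		/* ...while disabling one now falls through and is ignored */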
Signed-off-by: Jeff Layton Reviewed-by: Tom Talpey Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index dc74a947a440..68ed42fd29fc 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -601,7 +601,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) } break; default: - return -EINVAL; + /* Ignore requests to disable non-existent versions */ + if (cmd == NFSD_SET) + return -EINVAL; } vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); -- cgit From cb12fae1c34b1fa7eaae92c5aadc72d86d7fae19 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:55 -0400 Subject: nfsd: move nfserrno() to vfs.c nfserrno() is common to all nfs versions, but nfsproc.c is specifically for NFSv2. Move it to vfs.c, and the prototype to vfs.h. While we're in here, remove the #ifdef EDQUOT check in this function. It's apparently a holdover from the initial merge of the nfsd code in 1997. No other place in the kernel checks that that symbol is defined before using it, so I think we can dispense with it here. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 1 + fs/nfsd/blocklayoutxdr.c | 1 + fs/nfsd/export.h | 1 - fs/nfsd/flexfilelayout.c | 1 + fs/nfsd/nfs4idmap.c | 1 + fs/nfsd/nfsproc.c | 62 ----------------------------------------------- fs/nfsd/vfs.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/vfs.h | 1 + 8 files changed, 68 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index b6d01d51a746..04697f8dc37d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -12,6 +12,7 @@ #include "blocklayoutxdr.h" #include "pnfs.h" #include "filecache.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 442543304930..8e9c1a0f8d38 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -9,6 +9,7 @@ #include "nfsd.h" #include "blocklayoutxdr.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index ee0e3aba4a6e..d03f7f6a8642 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -115,7 +115,6 @@ struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); int exp_rootfh(struct net *, struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -__be32 nfserrno(int errno); static inline void exp_put(struct svc_export *exp) { diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index 070f90ed09b6..3ca5304440ff 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -15,6 +15,7 @@ #include "flexfilelayoutxdr.h" #include "pnfs.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index e70a1a2999b7..5e9809aff37e 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -41,6 +41,7 @@ #include "idmap.h" #include "nfsd.h" #include "netns.h" +#include "vfs.h" /* * Turn off idmapping when using AUTH_SYS. diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 82b3ddeacc33..52fc222c34f2 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -848,65 +848,3 @@ const struct svc_version nfsd_version2 = { .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS2_SVC_XDRSIZE, }; - -/* - * Map errnos to NFS errnos. 
- */ -__be32 -nfserrno (int errno) -{ - static struct { - __be32 nfserr; - int syserr; - } nfs_errtbl[] = { - { nfs_ok, 0 }, - { nfserr_perm, -EPERM }, - { nfserr_noent, -ENOENT }, - { nfserr_io, -EIO }, - { nfserr_nxio, -ENXIO }, - { nfserr_fbig, -E2BIG }, - { nfserr_stale, -EBADF }, - { nfserr_acces, -EACCES }, - { nfserr_exist, -EEXIST }, - { nfserr_xdev, -EXDEV }, - { nfserr_mlink, -EMLINK }, - { nfserr_nodev, -ENODEV }, - { nfserr_notdir, -ENOTDIR }, - { nfserr_isdir, -EISDIR }, - { nfserr_inval, -EINVAL }, - { nfserr_fbig, -EFBIG }, - { nfserr_nospc, -ENOSPC }, - { nfserr_rofs, -EROFS }, - { nfserr_mlink, -EMLINK }, - { nfserr_nametoolong, -ENAMETOOLONG }, - { nfserr_notempty, -ENOTEMPTY }, -#ifdef EDQUOT - { nfserr_dquot, -EDQUOT }, -#endif - { nfserr_stale, -ESTALE }, - { nfserr_jukebox, -ETIMEDOUT }, - { nfserr_jukebox, -ERESTARTSYS }, - { nfserr_jukebox, -EAGAIN }, - { nfserr_jukebox, -EWOULDBLOCK }, - { nfserr_jukebox, -ENOMEM }, - { nfserr_io, -ETXTBSY }, - { nfserr_notsupp, -EOPNOTSUPP }, - { nfserr_toosmall, -ETOOSMALL }, - { nfserr_serverfault, -ESERVERFAULT }, - { nfserr_serverfault, -ENFILE }, - { nfserr_io, -EREMOTEIO }, - { nfserr_stale, -EOPENSTALE }, - { nfserr_io, -EUCLEAN }, - { nfserr_perm, -ENOKEY }, - { nfserr_no_grace, -ENOGRACE}, - }; - int i; - - for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { - if (nfs_errtbl[i].syserr == errno) - return nfs_errtbl[i].nfserr; - } - WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); - return nfserr_io; -} - diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d6d90d63485d..278ebc677061 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,6 +49,69 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP +/** + * nfserrno - Map Linux errnos to NFS errnos + * @errno: POSIX(-ish) error code to be mapped + * + * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If + * it's an error we don't expect, log it once and return nfserr_io. + */ +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, + { nfserr_stale, -EBADF }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_mlink, -EMLINK }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, + { nfserr_dquot, -EDQUOT }, + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EREMOTEIO }, + { nfserr_stale, -EOPENSTALE }, + { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} + /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. 
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 120521bc7b24..8ddd687f8359 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -60,6 +60,7 @@ static inline void nfsd_attrs_free(struct nfsd_attrs *attrs) posix_acl_release(attrs->na_dpacl); } +__be32 nfserrno (int errno); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, -- cgit From 2f3a4b2ac2f28b9be78ad21f401f31e263845214 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:56 -0400 Subject: nfsd: allow disabling NFSv2 at compile time rpc.nfsd stopped supporting NFSv2 a year ago. Take the next logical step toward deprecating it and allow NFSv2 support to be compiled out. Add a new CONFIG_NFSD_V2 option that can be turned off and rework the CONFIG_NFSD_V?_ACL option dependencies. Add a description that discourages enabling it. Also, change the description of CONFIG_NFSD to state that the always-on version is now 3 instead of 2. Finally, add an #ifdef around "case 2:" in __write_versions. When NFSv2 is disabled at compile time, this should make the kernel ignore attempts to disable it at runtime, but still error out when trying to enable it. Signed-off-by: Jeff Layton Reviewed-by: Tom Talpey Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 19 +++++++++++++++---- fs/nfsd/Makefile | 5 +++-- fs/nfsd/nfsctl.c | 2 ++ fs/nfsd/nfsd.h | 3 +-- fs/nfsd/nfssvc.c | 6 ++++++ 5 files changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f6a2fd3015e7..7c441f2bd444 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -8,6 +8,7 @@ config NFSD select SUNRPC select EXPORTFS select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFS_ACL_SUPPORT if NFSD_V3_ACL depends on MULTIUSER help Choose Y here if you want to allow other computers to access @@ -26,19 +27,29 @@ config NFSD Below you can choose which versions of the NFS protocol are available to clients mounting the NFS server on this system. - Support for NFS version 2 (RFC 1094) is always available when + Support for NFS version 3 (RFC 1813) is always available when CONFIG_NFSD is selected. If unsure, say N. -config NFSD_V2_ACL - bool +config NFSD_V2 + bool "NFS server support for NFS version 2 (DEPRECATED)" depends on NFSD + default n + help + NFSv2 (RFC 1094) was the first publicly-released version of NFS. + Unless you are hosting ancient (1990's era) NFS clients, you don't + need this. + + If unsure, say N. + +config NFSD_V2_ACL + bool "NFS server support for the NFSv2 ACL protocol extension" + depends on NFSD_V2 config NFSD_V3_ACL bool "NFS server support for the NFSv3 ACL protocol extension" depends on NFSD - select NFSD_V2_ACL help Solaris NFS servers support an auxiliary NFSv3 ACL protocol that never became an official part of the NFS version 3 protocol. 
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 805c06d5f1b4..6fffc8f03f74 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -10,9 +10,10 @@ obj-$(CONFIG_NFSD) += nfsd.o # this one should be compiled first, as the tracing macros can easily blow up nfsd-y += trace.o -nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o \ +nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o \ stats.o filecache.o nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 68ed42fd29fc..d1e581a60480 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -581,7 +581,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { +#ifdef CONFIG_NFSD_V2 case 2: +#endif case 3: nfsd_vers(nn, num, cmd); break; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 09726c5b9a31..93b42ef9ed91 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -64,8 +64,7 @@ struct readdir_cd { extern struct svc_program nfsd_program; -extern const struct svc_version nfsd_version2, nfsd_version3, - nfsd_version4; +extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index bfbd9f672f59..62e473b0ca52 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -91,8 +91,12 @@ unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { +# if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, +# endif +# if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, +# endif }; #define NFSD_ACL_MINVERS 2 @@ -116,7 +120,9 @@ static struct svc_stat nfsd_acl_svcstats = { #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[] = { +#if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, +#endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, -- cgit From 427505ffeaa464f683faba945a88d3e3248f6979 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 21 Oct 2022 14:24:14 +0200 Subject: exportfs: use pr_debug for unreachable debug statements expfs.c has a bunch of dprintk statements which are unusable due to: #define dprintk(fmt, args...) do{}while(0) Use pr_debug so that they can be enabled dynamically. Also make some minor changes to the debug statements to fix some incorrect types, and remove __func__ which can be handled by dynamic debug separately. Signed-off-by: David Disseldorp Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/exportfs/expfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index c648a493faf2..3204bd33e4e8 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -18,7 +18,7 @@ #include #include -#define dprintk(fmt, args...) do{}while(0) +#define dprintk(fmt, args...) 
pr_debug(fmt, ##args) static int get_name(const struct path *path, char *name, struct dentry *child); @@ -132,8 +132,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { - dprintk("%s: get_parent of %ld failed, err %d\n", - __func__, dentry->d_inode->i_ino, PTR_ERR(parent)); + dprintk("get_parent of %lu failed, err %ld\n", + dentry->d_inode->i_ino, PTR_ERR(parent)); return parent; } @@ -147,7 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, dprintk("%s: found name: %s\n", __func__, nbuf); tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { - dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); + dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); goto out_err; } -- cgit From e0aa651068bfd520afcd357af8ecd2de005fc83d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Oct 2022 08:13:53 -0400 Subject: nfsd: don't call nfsd_file_put from client states seqfile display We had a report of this: BUG: sleeping function called from invalid context at fs/nfsd/filecache.c:440 ...with a stack trace showing nfsd_file_put being called from nfs4_show_open. This code has always tried to call fput while holding a spinlock, but we recently changed this to use the filecache, and that started triggering the might_sleep() in nfsd_file_put. states_start takes and holds the cl_lock while iterating over the client's states, and we can't sleep with that held. Have the various nfs4_show_* functions hold the fi_lock instead of taking an nfsd_file reference. Fixes: 78599c42ae3c ("nfsd4: add file to display list of client's opens") Link: https://bugzilla.redhat.com/show_bug.cgi?id=2138357 Reported-by: Zhi Li Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 836bd825ca4a..52b5552d0d70 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -675,15 +675,26 @@ find_any_file(struct nfs4_file *f) return ret; } -static struct nfsd_file *find_deleg_file(struct nfs4_file *f) +static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) { - struct nfsd_file *ret = NULL; + lockdep_assert_held(&f->fi_lock); + + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return NULL; +} + +static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f) +{ + lockdep_assert_held(&f->fi_lock); - spin_lock(&f->fi_lock); if (f->fi_deleg_file) - ret = nfsd_file_get(f->fi_deleg_file); - spin_unlock(&f->fi_lock); - return ret; + return f->fi_deleg_file; + return NULL; } static atomic_long_t num_delegations; @@ -2613,9 +2624,11 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - file = find_any_file(nf); + + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2637,8 +2650,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } @@ -2652,9 +2665,10 @@ static int nfs4_show_lock(struct seq_file *s, struct
nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - file = find_any_file(nf); + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2674,8 +2688,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } @@ -2687,9 +2701,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; - file = find_deleg_file(nf); + spin_lock(&nf->fi_lock); + file = find_deleg_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2705,8 +2720,8 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_fname(s, file); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } -- cgit From c252849082ff525af18b4f253b3c9ece94e951ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:38 -0400 Subject: NFSD: Pass the target nfsd_file to nfsd_commit() In a moment I'm going to introduce separate nfsd_file types, one of which is garbage-collected; the other, not. The garbage-collected variety is to be used by NFSv2 and v3, and the non-garbage-collected variety is to be used by NFSv4. nfsd_commit() is invoked by both NFSv3 and NFSv4 consumers. We want nfsd_commit() to find and use the correct variety of cached nfsd_file object for the NFS version that is in use. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Reviewed-by: NeilBrown --- fs/nfsd/nfs3proc.c | 10 +++++++++- fs/nfsd/nfs4proc.c | 11 ++++++++++- fs/nfsd/vfs.c | 15 ++++----------- fs/nfsd/vfs.h | 3 ++- 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 923d9a80df92..ff2920546333 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -13,6 +13,7 @@ #include "cache.h" #include "xdr3.h" #include "vfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -763,6 +764,7 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) { struct nfsd3_commitargs *argp = rqstp->rq_argp; struct nfsd3_commitres *resp = rqstp->rq_resp; + struct nfsd_file *nf; dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", SVCFH_fmt(&argp->fh), @@ -770,8 +772,14 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) (unsigned long long) argp->offset); fh_copy(&resp->fh, &argp->fh); - resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset, + resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (resp->status) + goto out; + resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset, argp->count, resp->verf); + nfsd_file_put(nf); +out: return rpc_success; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8beb2bc4c328..42db62413890 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -731,10 +731,19 @@ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_commit *commit = &u->commit; + struct nfsd_file *nf; + __be32 status; - return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + status = nfsd_file_acquire(rqstp, &cstate->current_fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (status != nfs_ok) + return status; + + status = nfsd_commit(rqstp, 
&cstate->current_fh, nf, commit->co_offset, commit->co_count, (__be32 *)commit->co_verf.data); + nfsd_file_put(nf); + return status; } static __be32 diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 278ebc677061..6f39d6c38116 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1196,6 +1196,7 @@ out: * nfsd_commit - Commit pending writes to stable storage * @rqstp: RPC request being processed * @fhp: NFS filehandle + * @nf: target file * @offset: raw offset from beginning of file * @count: raw count of bytes to sync * @verf: filled in with the server's current write verifier @@ -1212,19 +1213,13 @@ out: * An nfsstat value in network byte order. */ __be32 -nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, - u32 count, __be32 *verf) +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, + u64 offset, u32 count, __be32 *verf) { + __be32 err = nfs_ok; u64 maxbytes; loff_t start, end; struct nfsd_net *nn; - struct nfsd_file *nf; - __be32 err; - - err = nfsd_file_acquire(rqstp, fhp, - NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); - if (err) - goto out; /* * Convert the client-provided (offset, count) range to a @@ -1265,8 +1260,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, } else nfsd_copy_write_verifier(verf, nn); - nfsd_file_put(nf); -out: return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 8ddd687f8359..dbdfef7ae85b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -89,7 +89,8 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct svc_fh *resfhp, struct nfsd_attrs *iap); __be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, - u64 offset, u32 count, __be32 *verf); + struct nfsd_file *nf, u64 offset, u32 count, + __be32 *verf); #ifdef CONFIG_NFSD_V4 __be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, void **bufp, int *lenp); -- cgit From dcf3f80965ca787c70def402cdf1553c93c75529 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:44 -0400 Subject: NFSD: Revert "NFSD: NFSv4 CLOSE should release an nfsd_file immediately" This reverts commit 5e138c4a750dc140d881dab4a8804b094bbc08d2. That commit attempted to make files available to other users as soon as all NFSv4 clients were done with them, rather than waiting until the filecache LRU had garbage collected them. It gets the reference counting wrong, for one thing. But it also misses that DELEGRETURN should release a file in the same fashion. In fact, any nfsd_file_put() on a file held open by an NFSv4 client potentially needs to release the file immediately... Clear the way for implementing that idea. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Reviewed-by: NeilBrown --- fs/nfsd/filecache.c | 18 ------------------ fs/nfsd/filecache.h | 1 - fs/nfsd/nfs4state.c | 4 ++-- 3 files changed, 2 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ec3fceb92236..babea79d3f6f 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -444,24 +444,6 @@ nfsd_file_put(struct nfsd_file *nf) nfsd_file_put_noref(nf); } -/** - * nfsd_file_close - Close an nfsd_file - * @nf: nfsd_file to close - * - * If this is the final reference for @nf, free it immediately. - * This reflects an on-the-wire CLOSE or DELEGRETURN into the - * VFS and exported filesystem.
- */ -void nfsd_file_close(struct nfsd_file *nf) -{ - nfsd_file_put(nf); - if (refcount_dec_if_one(&nf->nf_ref)) { - nfsd_file_unhash(nf); - nfsd_file_lru_remove(nf); - nfsd_file_free(nf); - } -} - struct nfsd_file * nfsd_file_get(struct nfsd_file *nf) { diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 357832bac736..6b012ea4bd9d 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -52,7 +52,6 @@ void nfsd_file_cache_shutdown(void); int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); -void nfsd_file_close(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 52b5552d0d70..16c3e991ddcc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -842,9 +842,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - nfsd_file_close(f1); + nfsd_file_put(f1); if (f2) - nfsd_file_close(f2); + nfsd_file_put(f2); } } -- cgit From 4d1ea8455716ca070e3cd85767e6f6a562a58b1b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:51 -0400 Subject: NFSD: Add an NFSD_FILE_GC flag to enable nfsd_file garbage collection NFSv4 operations manage the lifetime of nfsd_file items they use by means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be garbage collected. Introduce a mechanism to enable garbage collection for nfsd_file items used only by NFSv2/3 callers. Note that the change in nfsd_file_put() ensures that both CLOSE and DELEGRETURN will actually close out and free an nfsd_file on last reference of a non-garbage-collected file. 
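To make the two calling conventions concrete, here is a minimal sketch (the acquire/put helpers are the ones introduced in this patch; the surrounding caller, flags, and error handling are illustrative only):

	struct nfsd_file *nf;
	__be32 status;

	/* NFSv2/v3 style: the entry lingers on the LRU after the
	 * final put, so a quick re-access can hit the cache. */
	status = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
	if (status != nfs_ok)
		return status;
	/* ... I/O against nf->nf_file ... */
	nfsd_file_put(nf);	/* nf may stay cached for re-use */

	/* NFSv4 style: lifetime is driven by OPEN/CLOSE or delegation
	 * state, so the final put releases the open file right away. */
	status = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
	if (status != nfs_ok)
		return status;
	/* ... */
	nfsd_file_put(nf);	/* final reference closes the file */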
Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394 Suggested-by: Trond Myklebust Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/filecache.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/nfsd/filecache.h | 3 +++ fs/nfsd/nfs3proc.c | 4 ++-- fs/nfsd/trace.h | 3 ++- fs/nfsd/vfs.c | 4 ++-- 5 files changed, 64 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index babea79d3f6f..cee44405cf7d 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key { struct net *net; const struct cred *cred; unsigned char need; + bool gc; enum nfsd_file_lookup_type type; }; @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, return 1; if (!nfsd_match_cred(nf->nf_cred, key->cred)) return 1; + if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) + return 1; if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) return 1; break; @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) nf->nf_flags = 0; __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + if (key->gc) + __set_bit(NFSD_FILE_GC, &nf->nf_flags); nf->nf_inode = key->inode; /* nf_ref is pre-incremented for hash table */ refcount_set(&nf->nf_ref, 2); @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf) } } +static void +nfsd_file_unhash_and_put(struct nfsd_file *nf) +{ + if (nfsd_file_unhash(nf)) + nfsd_file_put_noref(nf); +} + void nfsd_file_put(struct nfsd_file *nf) { might_sleep(); - nfsd_file_lru_add(nf); - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) + nfsd_file_lru_add(nf); + else if (refcount_read(&nf->nf_ref) == 2) + nfsd_file_unhash_and_put(nf); + + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { nfsd_file_flush(nf); nfsd_file_put_noref(nf); - } else if (nf->nf_file) { + } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) { nfsd_file_put_noref(nf); nfsd_file_schedule_laundrette(); } else @@ -1016,12 +1032,14 @@ nfsd_file_is_cached(struct inode *inode) static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf, bool open) + unsigned int may_flags, struct nfsd_file **pnf, + bool open, bool want_gc) { struct nfsd_file_lookup_key key = { .type = NFSD_FILE_KEY_FULL, .need = may_flags & NFSD_FILE_MAY_MASK, .net = SVC_NET(rqstp), + .gc = want_gc, }; bool open_retry = true; struct nfsd_file *nf; @@ -1117,14 +1135,35 @@ open_file: * then unhash. */ if (status != nfs_ok || key.inode->i_nlink == 0) - if (nfsd_file_unhash(nf)) - nfsd_file_put_noref(nf); + nfsd_file_unhash_and_put(nf); clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); smp_mb__after_atomic(); wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); goto out; } +/** + * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * The nfsd_file object returned by this API is reference-counted + * and garbage-collected. The object is retained for a few + * seconds after the final nfsd_file_put() in case the caller + * wants to re-use it. + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. 
+ */ +__be32 +nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, true); +} + /** * nfsd_file_acquire - Get a struct nfsd_file with an open file * @rqstp: the RPC transaction being executed @@ -1132,6 +1171,10 @@ open_file: * @may_flags: NFSD_MAY_ settings for the file * @pnf: OUT: new or found "struct nfsd_file" object * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). + * * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in * network byte order is returned. */ @@ -1139,7 +1182,7 @@ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, false); } /** @@ -1149,6 +1192,10 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, * @may_flags: NFSD_MAY_ settings for the file * @pnf: OUT: new or found "struct nfsd_file" object * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is released immediately + * one RCU grace period after the final nfsd_file_put(). + * * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in * network byte order is returned. */ @@ -1156,7 +1203,7 @@ __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, false); } /* diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 6b012ea4bd9d..b7efb2c3ddb1 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -38,6 +38,7 @@ struct nfsd_file { #define NFSD_FILE_HASHED (0) #define NFSD_FILE_PENDING (1) #define NFSD_FILE_REFERENCED (2) +#define NFSD_FILE_GC (3) unsigned long nf_flags; struct inode *nf_inode; /* don't deref */ refcount_t nf_ref; @@ -55,6 +56,8 @@ void nfsd_file_put(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index ff2920546333..d01b29aba662 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -772,8 +772,8 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) (unsigned long long) argp->offset); fh_copy(&resp->fh, &argp->fh); - resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE | - NFSD_MAY_NOT_BREAK_LEASE, &nf); + resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); if (resp->status) goto out; resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset, diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d4b6839bb459..3fcfeb7b560f 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -817,7 +817,8 @@ DEFINE_CLID_EVENT(confirmed_r); __print_flags(val, "|", \ { 1 << NFSD_FILE_HASHED, "HASHED" }, \ { 1 << NFSD_FILE_PENDING, "PENDING" }, \ - { 1 
<< NFSD_FILE_REFERENCED, "REFERENCED"}) + { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}, \ + { 1 << NFSD_FILE_GC, "GC"}) DECLARE_EVENT_CLASS(nfsd_file_class, TP_PROTO(struct nfsd_file *nf), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6f39d6c38116..961e411080f9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1148,7 +1148,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err; trace_nfsd_read_start(rqstp, fhp, offset, *count); - err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); + err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; @@ -1180,7 +1180,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, trace_nfsd_write_start(rqstp, fhp, offset, *cnt); - err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf); + err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf); if (err) goto out; -- cgit From b3276c1f5b268ff56622e9e125b792b4c3dc03ac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Nov 2022 13:30:46 -0400 Subject: NFSD: Flesh out a documenting comment for filecache.c Record what we've learned recently about the NFSD filecache in a documenting comment so our future selves don't forget what all this is for. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/filecache.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index cee44405cf7d..897dd47ffb0a 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -2,6 +2,30 @@ * Open file cache. * * (c) 2015 - Jeff Layton + * + * An nfsd_file object is a per-file collection of open state that binds + * together: + * - a struct file * + * - a user credential + * - a network namespace + * - a read-ahead context + * - monitoring for writeback errors + * + * nfsd_file objects are reference-counted. Consumers acquire a new + * object via the nfsd_file_acquire API. They manage their interest in + * the acquired object, and hence the object's reference count, via + * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file + * object: + * + * * non-garbage-collected: When a consumer wants to precisely control + * the lifetime of a file's open state, it acquires a non-garbage- + * collected nfsd_file. The final nfsd_file_put releases the open + * state immediately. + * + * * garbage-collected: When a consumer does not control the lifetime + * of open state, it acquires a garbage-collected nfsd_file. The + * final nfsd_file_put allows the open state to linger for a period + * during which it may be re-used. */ #include -- cgit From eeff73f7c1c583f79a401284f46c619294859310 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:57 -0400 Subject: NFSD: Clean up nfs4_preprocess_stateid_op() call sites Remove the lame-duck dprintk()s around nfs4_preprocess_stateid_op() call sites. 
Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Reviewed-by: NeilBrown --- fs/nfsd/nfs4proc.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 42db62413890..6f7e0c5e62d2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -943,12 +943,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, &read->rd_nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); - goto out; - } - status = nfs_ok; -out: + read->rd_rqstp = rqstp; read->rd_fhp = &cstate->current_fh; return status; @@ -1117,10 +1112,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, WR_STATE, NULL, NULL); - if (status) { - dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + if (status) return status; - } } err = fh_want_write(&cstate->current_fh); if (err) @@ -1168,10 +1161,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, stateid, WR_STATE, &nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + if (status) return status; - } write->wr_how_written = write->wr_stable_how; @@ -1202,17 +1193,13 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, src_stateid, RD_STATE, src, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + if (status) goto out; - } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, dst_stateid, WR_STATE, dst, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + if (status) goto out_put_src; - } /* fix up for NFS-specific error code */ if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || @@ -1957,10 +1944,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, WR_STATE, &nf, NULL); - if (status != nfs_ok) { - dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + if (status != nfs_ok) return status; - } status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, @@ -2016,10 +2001,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, RD_STATE, &nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + if (status) return status; - } switch (seek->seek_whence) { case NFS4_CONTENT_DATA: -- cgit From 20eee313ff4b8a7e71ae9560f5c4ba27cd763005 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:03 -0400 Subject: NFSD: Trace stateids returned via DELEGRETURN Handing out a delegation stateid is recorded with the nfsd_deleg_read tracepoint, but there isn't a matching tracepoint for recording when the stateid is returned. 
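Since this reuses the existing stateid event class, the new record should look much like the other stateid events; an example with invented values for illustration: nfsd-1104 [005] 1912.002591: nfsd_deleg_return: client 633c9343:4e82788d stateid 00000003:00000001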
Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 1 + fs/nfsd/trace.h | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 16c3e991ddcc..9ec66611a458 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6917,6 +6917,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto put_stateid; + trace_nfsd_deleg_return(stateid); wake_up_var(d_inode(cstate->current_fh.fh_dentry)); destroy_delegation(dp); put_stateid: diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 3fcfeb7b560f..00908f75e0a6 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -604,6 +604,7 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_return); DEFINE_STATEID_EVENT(deleg_recall); DECLARE_EVENT_CLASS(nfsd_stateseqid_class, -- cgit From a1c74569bbde91299f24535abf711be5c84df9de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:09 -0400 Subject: NFSD: Trace delegation revocations Delegation revocation is an exceptional event that is not otherwise visible externally (eg, no network traffic is emitted). Generate a trace record when it occurs so that revocation can be observed or other activity can be triggered. Example: nfsd-1104 [005] 1912.002544: nfsd_stid_revoke: client 633c9343:4e82788d stateid 00000003:00000001 ref=2 type=DELEG Trace infrastructure is provided for subsequent additional tracing related to nfs4_stid activity. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 2 ++ fs/nfsd/trace.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9ec66611a458..7c131cac2705 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1366,6 +1366,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + trace_nfsd_stid_revoke(&dp->dl_stid); + if (clp->cl_minorversion) { dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 00908f75e0a6..b648db4d0346 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -637,6 +637,61 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ DEFINE_STATESEQID_EVENT(preprocess); DEFINE_STATESEQID_EVENT(open_confirm); +TRACE_DEFINE_ENUM(NFS4_OPEN_STID); +TRACE_DEFINE_ENUM(NFS4_LOCK_STID); +TRACE_DEFINE_ENUM(NFS4_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_CLOSED_STID); +TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID); + +#define show_stid_type(x) \ + __print_flags(x, "|", \ + { NFS4_OPEN_STID, "OPEN" }, \ + { NFS4_LOCK_STID, "LOCK" }, \ + { NFS4_DELEG_STID, "DELEG" }, \ + { NFS4_CLOSED_STID, "CLOSED" }, \ + { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \ + { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \ + { NFS4_LAYOUT_STID, "LAYOUT" }) + +DECLARE_EVENT_CLASS(nfsd_stid_class, + TP_PROTO( + const struct nfs4_stid *stid + ), + TP_ARGS(stid), + TP_STRUCT__entry( + __field(unsigned long, sc_type) + __field(int, sc_count) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + const stateid_t *stp = &stid->sc_stateid; + + __entry->sc_type = stid->sc_type; + __entry->sc_count = refcount_read(&stid->sc_count); + __entry->cl_boot = 
stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s", + __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation, + __entry->sc_count, show_stid_type(__entry->sc_type) + ) +); + +#define DEFINE_STID_EVENT(name) \ +DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ + TP_PROTO(const struct nfs4_stid *stid), \ + TP_ARGS(stid)) + +DEFINE_STID_EVENT(revoke); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), -- cgit From b48f8056c034f28dd54668399f1d22be421b0bef Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:16 -0400 Subject: NFSD: Use const pointers as parameters to fh_ helpers Enable callers to use const pointers where they are able to. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Reviewed-by: NeilBrown --- fs/nfsd/nfsfh.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index c3ae6414fc5c..513e028b0bbe 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -220,7 +220,7 @@ __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); static __inline__ struct svc_fh * -fh_copy(struct svc_fh *dst, struct svc_fh *src) +fh_copy(struct svc_fh *dst, const struct svc_fh *src) { WARN_ON(src->fh_dentry); @@ -229,7 +229,7 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src) } static inline void -fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src) { dst->fh_size = src->fh_size; memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size); @@ -243,7 +243,8 @@ fh_init(struct svc_fh *fhp, int maxsize) return fhp; } -static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_size != fh2->fh_size) return false; @@ -252,7 +253,8 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) return true; } -static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_fsid_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_fsid_type != fh2->fh_fsid_type) return false; -- cgit From 3fe828caddd81e68e9d29353c6e9285a658ca056 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:22 -0400 Subject: NFSD: Update file_hashtbl() helpers Enable callers to use const pointers for type safety. 
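Together with the fh_ helper constification in the previous patch, this lets read-only consumers keep their pointers const end to end. A hypothetical caller (not part of the patch) that now compiles cleanly:

	static bool same_filehandle(const struct svc_fh *a,
				    const struct svc_fh *b)
	{
		/* legal only now that fh_match() accepts const arguments */
		return fh_match(&a->fh_handle, &b->fh_handle);
	}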
Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7c131cac2705..baf03a31203c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -721,7 +721,7 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -static unsigned int file_hashval(struct svc_fh *fh) +static unsigned int file_hashval(const struct svc_fh *fh) { struct inode *inode = d_inode(fh->fh_dentry); @@ -4686,7 +4686,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) /* search file_hashtbl[] for file */ static struct nfs4_file * -find_file_locked(struct svc_fh *fh, unsigned int hashval) +find_file_locked(const struct svc_fh *fh, unsigned int hashval) { struct nfs4_file *fp; -- cgit From 81a21fa3e7fdecb3c5b97014f0fc5a17d5806cae Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:28 -0400 Subject: NFSD: Clean up nfsd4_init_file() Name this function more consistently. I'm going to use nfsd4_file_ and nfsd4_file_hash_ for these helpers. Change the @fh parameter to be const pointer for better type safety. Finally, move the hash insertion operation to the caller. This is typical for most other "init_object" type helpers, and it is where most of the other nfs4_file hash table operations are located. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index baf03a31203c..d63a4e5439c2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4277,11 +4277,9 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, - struct nfs4_file *fp) -{ - lockdep_assert_held(&state_lock); +static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp) +{ refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); @@ -4299,7 +4297,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, INIT_LIST_HEAD(&fp->fi_lo_states); atomic_set(&fp->fi_lo_recalls, 0); #endif - hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } void @@ -4717,7 +4714,8 @@ static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, fp->fi_aliased = alias_found = true; } if (likely(ret == NULL)) { - nfsd4_init_file(fh, hashval, new); + nfsd4_file_init(fh, new); + hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); new->fi_aliased = alias_found; ret = new; } -- cgit From 3341678f2fd6106055cead09e513fad6950a0d19 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:34 -0400 Subject: NFSD: Add a nfsd4_file_hash_remove() helper Refactor to relocate hash deletion operation to a helper function that is close to most other nfs4_file data structure operations. The "noinline" annotation will become useful in a moment when the hlist_del_rcu() is replaced with a more complex rhash remove operation. It also guarantees that hash remove operations can be traced with "-p function -l remove_nfs4_file_locked". This also simplifies the organization of forward declarations: the to-be-added rhashtable and its param structure will be defined /after/ put_nfs4_file(). 
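For example, assuming a standard trace-cmd setup (and using the helper name as introduced by this patch), the function tracing mentioned above could presumably be enabled with: trace-cmd record -p function -l nfsd4_file_hash_remove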
Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d63a4e5439c2..fb70cab33887 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -84,6 +84,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) static void nfs4_free_ol_stateid(struct nfs4_stid *stid); void nfsd4_end_grace(struct nfsd_net *nn); static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); +static void nfsd4_file_hash_remove(struct nfs4_file *fi); /* Locking: */ @@ -591,7 +592,7 @@ put_nfs4_file(struct nfs4_file *fi) might_lock(&state_lock); if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { - hlist_del_rcu(&fi->fi_hash); + nfsd4_file_hash_remove(fi); spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); @@ -4749,6 +4750,11 @@ find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) return insert_file(new, fh, hashval); } +static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) +{ + hlist_del_rcu(&fi->fi_hash); +} + /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid -- cgit From 9270fc514ba7d415636b23bcb937573a1ce54f6a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:41 -0400 Subject: NFSD: Clean up find_or_add_file() Remove the call to find_file_locked() in insert_nfs4_file(). Tracing shows that over 99% of these calls return NULL. Thus it is not worth the expense of the extra bucket list traversal. insert_file() already deals correctly with the case where the item is already in the hash bucket. Since nfsd4_file_hash_insert() is now just a wrapper around insert_file(), move the meat of insert_file() into nfsd4_file_hash_insert() and get rid of it. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown --- fs/nfsd/nfs4state.c | 64 +++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fb70cab33887..d63ce2118042 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4698,24 +4698,42 @@ find_file_locked(const struct svc_fh *fh, unsigned int hashval) return NULL; } -static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, - unsigned int hashval) +static struct nfs4_file * find_file(struct svc_fh *fh) { struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + return fp; +} + +/* + * On hash insertion, identify entries with the same inode but + * distinct filehandles. They will all be in the same hash bucket + * because nfs4_file's are hashed by the address in the fi_inode + * field. 
+ */ +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) +{ + unsigned int hashval = file_hashval(fhp); struct nfs4_file *ret = NULL; bool alias_found = false; + struct nfs4_file *fi; spin_lock(&state_lock); - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, + hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, lockdep_is_held(&state_lock)) { - if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - ret = fp; - } else if (d_inode(fh->fh_dentry) == fp->fi_inode) - fp->fi_aliased = alias_found = true; + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) + ret = fi; + } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) + fi->fi_aliased = alias_found = true; } if (likely(ret == NULL)) { - nfsd4_file_init(fh, new); + nfsd4_file_init(fhp, new); hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); new->fi_aliased = alias_found; ret = new; @@ -4724,32 +4742,6 @@ static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, return ret; } -static struct nfs4_file * find_file(struct svc_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); - - rcu_read_lock(); - fp = find_file_locked(fh, hashval); - rcu_read_unlock(); - return fp; -} - -static struct nfs4_file * -find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); - - rcu_read_lock(); - fp = find_file_locked(fh, hashval); - rcu_read_unlock(); - if (fp) - return fp; - - return insert_file(new, fh, hashval); -} - static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { hlist_del_rcu(&fi->fi_hash); @@ -5641,7 +5633,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * and check for delegations in the process of being recalled. * If not found, create the nfs4_file struct */ - fp = find_or_add_file(open->op_file, current_fh); + fp = nfsd4_file_hash_insert(open->op_file, current_fh); if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) -- cgit From 15424748001a9b5ea62b3e6ad45f0a8b27f01df9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:47 -0400 Subject: NFSD: Refactor find_file() find_file() is now the only caller of find_file_locked(), so just fold these two together. 
Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d63ce2118042..a611e7b0f013 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4682,31 +4682,24 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) nfs4_put_stid(&last->st_stid); } -/* search file_hashtbl[] for file */ -static struct nfs4_file * -find_file_locked(const struct svc_fh *fh, unsigned int hashval) +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - struct nfs4_file *fp; + unsigned int hashval = file_hashval(fhp); + struct nfs4_file *fi; - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { - if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - return fp; + rcu_read_lock(); + hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, + lockdep_is_held(&state_lock)) { + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) { + rcu_read_unlock(); + return fi; + } } } - return NULL; -} - -static struct nfs4_file * find_file(struct svc_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); - - rcu_read_lock(); - fp = find_file_locked(fh, hashval); rcu_read_unlock(); - return fp; + return NULL; } /* @@ -4757,9 +4750,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) struct nfs4_file *fp; __be32 ret = nfs_ok; - fp = find_file(current_fh); + fp = nfsd4_file_hash_lookup(current_fh); if (!fp) return ret; + /* Check for conflicting share reservations */ spin_lock(&fp->fi_lock); if (fp->fi_share_deny & deny_type) -- cgit From d47b295e8d76a4d69f0e2ea0cd8a79c9d3488280 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:53 -0400 Subject: NFSD: Use rhashtable for managing nfs4_file objects fh_match() is costly, especially when filehandles are large (as is the case for NFSv4). It needs to be used sparingly when searching data structures. Unfortunately, with common workloads, I see multiple thousands of objects stored in file_hashtbl[], which has just 256 buckets, making its bucket hash chains quite lengthy. Walking long hash chains with the state_lock held blocks other activity that needs that lock. Sizable hash chains are a common occurrence once the server has handed out some delegations, for example -- IIUC, each delegated file is held open on the server by an nfs4_file object. To help mitigate the cost of searching with fh_match(), replace the nfs4_file hash table with an rhashtable, which can dynamically resize its bucket array to minimize hash chain length. The result of this modification is an improvement in the latency of NFSv4 operations, and the reduction of nfsd CPU utilization due to eliminating the cost of multiple calls to fh_match() and reducing the CPU cache misses incurred while walking long hash chains in the nfs4_file hash table.
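In outline, the conversion keys the table on the inode pointer and uses the list-valued rhltable flavor, because several nfs4_file objects (one per distinct filehandle) can share an inode. A simplified sketch of the lookup idiom, condensed from the diff below:

	struct rhlist_head *tmp, *list;
	struct nfs4_file *fi, *found = NULL;

	rcu_read_lock();
	list = rhltable_lookup(&nfs4_file_rhltable, &inode,
			       nfs4_file_rhash_params);
	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
		/* entries sharing the inode are filtered by filehandle */
		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle) &&
		    refcount_inc_not_zero(&fi->fi_ref)) {
			found = fi;
			break;
		}
	}
	rcu_read_unlock();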
Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 97 ++++++++++++++++++++++++++++++++++------------------- fs/nfsd/state.h | 5 +-- 2 files changed, 63 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a611e7b0f013..2ec981fd2985 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -44,7 +44,9 @@ #include #include #include +#include #include + #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -589,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) void put_nfs4_file(struct nfs4_file *fi) { - might_lock(&state_lock); - - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { + if (refcount_dec_and_test(&fi->fi_ref)) { nfsd4_file_hash_remove(fi); - spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); @@ -718,19 +717,20 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) return ret & OWNER_HASH_MASK; } -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) - -static unsigned int file_hashval(const struct svc_fh *fh) -{ - struct inode *inode = d_inode(fh->fh_dentry); +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; - /* XXX: why not (here & in file cache) use inode? */ - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); -} +static const struct rhashtable_params nfs4_file_rhash_params = { + .key_len = sizeof_field(struct nfs4_file, fi_inode), + .key_offset = offsetof(struct nfs4_file, fi_inode), + .head_offset = offsetof(struct nfs4_file, fi_rlist), -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; + /* + * Start with a single page hash table to reduce resizing churn + * on light workloads. + */ + .min_size = 256, + .automatic_shrinking = true, +}; /* * Check if courtesy clients have conflicting access and resolve it if possible @@ -4685,12 +4685,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) static noinline_for_stack struct nfs4_file * nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - unsigned int hashval = file_hashval(fhp); + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *fi; rcu_read_lock(); - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) { rcu_read_unlock(); @@ -4704,40 +4706,56 @@ nfsd4_file_hash_lookup(const struct svc_fh *fhp) /* * On hash insertion, identify entries with the same inode but - * distinct filehandles. They will all be in the same hash bucket - * because nfs4_file's are hashed by the address in the fi_inode - * field. + * distinct filehandles. They will all be on the list returned + * by rhltable_lookup(). + * + * inode->i_lock prevents racing insertions from adding an entry + * for the same inode/fhp pair twice. 
*/ static noinline_for_stack struct nfs4_file * nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) { - unsigned int hashval = file_hashval(fhp); + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *ret = NULL; bool alias_found = false; struct nfs4_file *fi; + int err; - spin_lock(&state_lock); - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { + rcu_read_lock(); + spin_lock(&inode->i_lock); + + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) ret = fi; - } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) + } else fi->fi_aliased = alias_found = true; } - if (likely(ret == NULL)) { - nfsd4_file_init(fhp, new); - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); - new->fi_aliased = alias_found; - ret = new; - } - spin_unlock(&state_lock); + if (ret) + goto out_unlock; + + nfsd4_file_init(fhp, new); + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, + nfs4_file_rhash_params); + if (err) + goto out_unlock; + + new->fi_aliased = alias_found; + ret = new; + +out_unlock: + spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ret; } static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { - hlist_del_rcu(&fi->fi_hash); + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, + nfs4_file_rhash_params); } /* @@ -5628,6 +5646,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * If not found, create the nfs4_file struct */ fp = nfsd4_file_hash_insert(open->op_file, current_fh); + if (unlikely(!fp)) + return nfserr_jukebox; if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) @@ -8042,10 +8062,16 @@ nfs4_state_start(void) { int ret; - ret = nfsd4_create_callback_queue(); + ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params); if (ret) return ret; + ret = nfsd4_create_callback_queue(); + if (ret) { + rhltable_destroy(&nfs4_file_rhltable); + return ret; + } + set_max_delegations(); return 0; } @@ -8076,6 +8102,7 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); + rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e2daef3cc003..eadd7f465bf5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -536,16 +536,13 @@ struct nfs4_clnt_odstate { * inode can have multiple filehandles associated with it, so there is * (potentially) a many to one relationship between this struct and struct * inode. - * - * These are hashed by filehandle in the file_hashtbl, which is protected by - * the global state_lock spinlock. */ struct nfs4_file { refcount_t fi_ref; struct inode * fi_inode; bool fi_aliased; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash on fi_fhandle */ + struct rhlist_head fi_rlist; struct list_head fi_stateids; union { struct list_head fi_delegations; -- cgit From 3f054211b29c0fa06dfdcab402c795fd7e906be1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 31 Oct 2022 09:53:26 -0400 Subject: NFSD: Fix licensing header in filecache.c Add a missing SPDX header. 
Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/filecache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 897dd47ffb0a..deab9b765c88 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * Open file cache. + * The NFSD open file cache. * * (c) 2015 - Jeff Layton * -- cgit From 1f696e230ea5198e393368b319eb55651828d687 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Nov 2022 14:44:47 -0400 Subject: nfsd: remove the pages_flushed statistic from filecache We're counting mapping->nrpages, but not all of those are necessarily dirty. We don't really have a simple way to count just the dirty pages, so just remove this stat since it's not accurate. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index deab9b765c88..2d543a452858 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -58,7 +58,6 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); -static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed); static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); struct nfsd_fcache_disposal { @@ -396,7 +395,6 @@ nfsd_file_flush(struct nfsd_file *nf) if (!file || !(file->f_mode & FMODE_WRITE)) return; - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages); if (vfs_fsync(file, 1) != 0) nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } @@ -1023,7 +1021,6 @@ nfsd_file_cache_shutdown(void) per_cpu(nfsd_file_acquisitions, i) = 0; per_cpu(nfsd_file_releases, i) = 0; per_cpu(nfsd_file_total_age, i) = 0; - per_cpu(nfsd_file_pages_flushed, i) = 0; per_cpu(nfsd_file_evictions, i) = 0; } } @@ -1238,7 +1235,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, */ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { - unsigned long releases = 0, pages_flushed = 0, evictions = 0; + unsigned long releases = 0, evictions = 0; unsigned long hits = 0, acquisitions = 0; unsigned int i, count = 0, buckets = 0; unsigned long lru = 0, total_age = 0; @@ -1266,7 +1263,6 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) releases += per_cpu(nfsd_file_releases, i); total_age += per_cpu(nfsd_file_total_age, i); evictions += per_cpu(nfsd_file_evictions, i); - pages_flushed += per_cpu(nfsd_file_pages_flushed, i); } seq_printf(m, "total entries: %u\n", count); @@ -1280,6 +1276,5 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "mean age (ms): %ld\n", total_age / releases); else seq_printf(m, "mean age (ms): -\n"); - seq_printf(m, "pages flushed: %lu\n", pages_flushed); return 0; } -- cgit From 8214118589881b2d390284410c5ff275e7a5e03c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Nov 2022 14:44:48 -0400 Subject: nfsd: reorganize filecache.c In a coming patch, we're going to rework how the filecache refcounting works. Move some code around in the function to reduce the churn in the later patches, and rename some of the functions with (hopefully) clearer names: nfsd_file_flush becomes nfsd_file_fsync, and nfsd_file_unhash_and_dispose is renamed to nfsd_file_unhash_and_queue. 
Also, the nfsd_file_put_final tracepoint is renamed to nfsd_file_free, to better match the name of the function from which it's called. Signed-off-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 111 ++++++++++++++++++++++++++-------------------------- fs/nfsd/trace.h | 4 +- 2 files changed, 58 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2d543a452858..1418bc889aa0 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -335,16 +335,59 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) return nf; } +static void +nfsd_file_fsync(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return; + if (vfs_fsync(file, 1) != 0) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); +} + +static int +nfsd_file_check_write_error(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return 0; + return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); +} + +static void +nfsd_file_hash_remove(struct nfsd_file *nf) +{ + trace_nfsd_file_unhash(nf); + + if (nfsd_file_check_write_error(nf)) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); + rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, + nfsd_file_rhash_params); +} + +static bool +nfsd_file_unhash(struct nfsd_file *nf) +{ + if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_hash_remove(nf); + return true; + } + return false; +} + static bool nfsd_file_free(struct nfsd_file *nf) { s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime)); bool flush = false; + trace_nfsd_file_free(nf); + this_cpu_inc(nfsd_file_releases); this_cpu_add(nfsd_file_total_age, age); - trace_nfsd_file_put_final(nf); if (nf->nf_mark) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { @@ -378,27 +421,6 @@ nfsd_file_check_writeback(struct nfsd_file *nf) mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); } -static int -nfsd_file_check_write_error(struct nfsd_file *nf) -{ - struct file *file = nf->nf_file; - - if (!file || !(file->f_mode & FMODE_WRITE)) - return 0; - return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); -} - -static void -nfsd_file_flush(struct nfsd_file *nf) -{ - struct file *file = nf->nf_file; - - if (!file || !(file->f_mode & FMODE_WRITE)) - return; - if (vfs_fsync(file, 1) != 0) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); -} - static void nfsd_file_lru_add(struct nfsd_file *nf) { set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); @@ -412,31 +434,18 @@ static void nfsd_file_lru_remove(struct nfsd_file *nf) trace_nfsd_file_lru_del(nf); } -static void -nfsd_file_hash_remove(struct nfsd_file *nf) -{ - trace_nfsd_file_unhash(nf); - - if (nfsd_file_check_write_error(nf)) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, - nfsd_file_rhash_params); -} - -static bool -nfsd_file_unhash(struct nfsd_file *nf) +struct nfsd_file * +nfsd_file_get(struct nfsd_file *nf) { - if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - nfsd_file_hash_remove(nf); - return true; - } - return false; + if (likely(refcount_inc_not_zero(&nf->nf_ref))) + return nf; + return NULL; } static void -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose) +nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose) { - 
trace_nfsd_file_unhash_and_dispose(nf); + trace_nfsd_file_unhash_and_queue(nf); if (nfsd_file_unhash(nf)) { /* caller must call nfsd_file_dispose_list() later */ nfsd_file_lru_remove(nf); @@ -474,7 +483,7 @@ nfsd_file_put(struct nfsd_file *nf) nfsd_file_unhash_and_put(nf); if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - nfsd_file_flush(nf); + nfsd_file_fsync(nf); nfsd_file_put_noref(nf); } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) { nfsd_file_put_noref(nf); @@ -483,14 +492,6 @@ nfsd_file_put(struct nfsd_file *nf) nfsd_file_put_noref(nf); } -struct nfsd_file * -nfsd_file_get(struct nfsd_file *nf) -{ - if (likely(refcount_inc_not_zero(&nf->nf_ref))) - return nf; - return NULL; -} - static void nfsd_file_dispose_list(struct list_head *dispose) { @@ -499,7 +500,7 @@ nfsd_file_dispose_list(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del_init(&nf->nf_lru); - nfsd_file_flush(nf); + nfsd_file_fsync(nf); nfsd_file_put_noref(nf); } } @@ -513,7 +514,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del_init(&nf->nf_lru); - nfsd_file_flush(nf); + nfsd_file_fsync(nf); if (!refcount_dec_and_test(&nf->nf_ref)) continue; if (nfsd_file_free(nf)) @@ -713,7 +714,7 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose) nfsd_file_rhash_params); if (!nf) break; - nfsd_file_unhash_and_dispose(nf, dispose); + nfsd_file_unhash_and_queue(nf, dispose); count++; } while (1); rcu_read_unlock(); @@ -915,7 +916,7 @@ __nfsd_file_cache_purge(struct net *net) nf = rhashtable_walk_next(&iter); while (!IS_ERR_OR_NULL(nf)) { if (!net || nf->nf_net == net) - nfsd_file_unhash_and_dispose(nf, &dispose); + nfsd_file_unhash_and_queue(nf, &dispose); nf = rhashtable_walk_next(&iter); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index b648db4d0346..0177cfa50556 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -906,10 +906,10 @@ DEFINE_EVENT(nfsd_file_class, name, \ TP_PROTO(struct nfsd_file *nf), \ TP_ARGS(nf)) -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); +DEFINE_NFSD_FILE_EVENT(nfsd_file_free); DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); DEFINE_NFSD_FILE_EVENT(nfsd_file_put); -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue); TRACE_EVENT(nfsd_file_alloc, TP_PROTO( -- cgit From 41e8f85a75fc60e1543e4903428a1b481b672a17 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 11 Nov 2022 09:04:06 -0800 Subject: f2fs: introduce F2FS_IOC_START_ATOMIC_REPLACE introduce a new ioctl to replace the whole content of a file atomically, which means it induces truncate and content update at the same time. We can start it with F2FS_IOC_START_ATOMIC_REPLACE and complete it with F2FS_IOC_COMMIT_ATOMIC_WRITE. Or abort it with F2FS_IOC_ABORT_ATOMIC_WRITE. 
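A minimal userspace sketch of the intended call sequence may help here. It assumes a kernel carrying this patch, with the F2FS_IOC_* definitions visible to userspace (e.g. via <linux/f2fs.h>); the helper name and error handling are illustrative only, not part of this patch:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/f2fs.h>

/* Illustrative helper (hypothetical): atomically replace a file's contents. */
static int replace_file_contents(const char *path, const void *buf, size_t len)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	/* truncate and content update begin together; old data stays until commit */
	if (ioctl(fd, F2FS_IOC_START_ATOMIC_REPLACE) < 0)
		goto out_close;
	if (write(fd, buf, len) != (ssize_t)len)
		goto out_abort;
	if (ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) < 0)
		goto out_abort;
	close(fd);
	return 0;

out_abort:
	/* drop the pending replacement; the original contents survive */
	ioctl(fd, F2FS_IOC_ABORT_ATOMIC_WRITE);
out_close:
	close(fd);
	return -1;
}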
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 21 +++++++++++++++------ fs/f2fs/segment.c | 13 ++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9b47ded653d1..560fa80590e9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3466,6 +3466,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, else if (*blk_addr != NULL_ADDR) return 0; + if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) + goto reserve_block; + /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6a8cbf5bb187..b89b5d755ce0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -769,6 +769,7 @@ enum { FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_MAX, /* max flag, never be used */ }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 28f586e77999..ab0a0d3730f6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2034,7 +2034,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) return put_user(inode->i_generation, (int __user *)arg); } -static int f2fs_ioc_start_atomic_write(struct file *filp) +static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); struct user_namespace *mnt_userns = file_mnt_user_ns(filp); @@ -2103,15 +2103,22 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_write_inode(inode, NULL); - isize = i_size_read(inode); - fi->original_i_size = isize; - f2fs_i_size_write(fi->cow_inode, isize); - stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + + isize = i_size_read(inode); + fi->original_i_size = isize; + if (truncate) { + set_inode_flag(inode, FI_ATOMIC_REPLACE); + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, 0); + isize = 0; + } + f2fs_i_size_write(fi->cow_inode, isize); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); f2fs_update_time(sbi, REQ_TIME); @@ -4139,7 +4146,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: - return f2fs_ioc_start_atomic_write(filp); + return f2fs_ioc_start_atomic_write(filp, false); + case F2FS_IOC_START_ATOMIC_REPLACE: + return f2fs_ioc_start_atomic_write(filp, true); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_ABORT_ATOMIC_WRITE: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8aa81238c770..5ac026a57228 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -197,6 +197,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); @@ -261,14 +262,24 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; + pgoff_t start_index = 0; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, 
head, list) { - if (revoke) + if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + } else if (truncate) { + f2fs_truncate_hole(inode, start_index, cur->index); + start_index = cur->index + 1; + } + list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) -- cgit From d3b7b4afd6b2c344eabf9cc26b8bfa903c164c7c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 15 Nov 2022 00:08:47 +0800 Subject: f2fs: fix to do sanity check on i_extra_isize in is_alive() syzbot found an f2fs bug: BUG: KASAN: slab-out-of-bounds in data_blkaddr fs/f2fs/f2fs.h:2891 [inline] BUG: KASAN: slab-out-of-bounds in is_alive fs/f2fs/gc.c:1117 [inline] BUG: KASAN: slab-out-of-bounds in gc_data_segment fs/f2fs/gc.c:1520 [inline] BUG: KASAN: slab-out-of-bounds in do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 Read of size 4 at addr ffff888076557568 by task kworker/u4:3/52 CPU: 1 PID: 52 Comm: kworker/u4:3 Not tainted 6.1.0-rc4-syzkaller-00362-gfef7fd48922d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: writeback wb_workfn (flush-7:0) Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x15e/0x45d mm/kasan/report.c:395 kasan_report+0xbb/0x1f0 mm/kasan/report.c:495 data_blkaddr fs/f2fs/f2fs.h:2891 [inline] is_alive fs/f2fs/gc.c:1117 [inline] gc_data_segment fs/f2fs/gc.c:1520 [inline] do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 f2fs_gc+0x88c/0x20a0 fs/f2fs/gc.c:1831 f2fs_balance_fs+0x544/0x6b0 fs/f2fs/segment.c:410 f2fs_write_inode+0x57e/0xe20 fs/f2fs/inode.c:753 write_inode fs/fs-writeback.c:1440 [inline] __writeback_single_inode+0xcfc/0x1440 fs/fs-writeback.c:1652 writeback_sb_inodes+0x54d/0xf90 fs/fs-writeback.c:1870 wb_writeback+0x2c5/0xd70 fs/fs-writeback.c:2044 wb_do_writeback fs/fs-writeback.c:2187 [inline] wb_workfn+0x2dc/0x12f0 fs/fs-writeback.c:2227 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 The root cause is that we forgot to do a sanity check on .i_extra_isize in the below path, resulting in an invalid address being accessed later; fix it. - gc_data_segment - is_alive - data_blkaddr - offset_in_addr Reported-by: syzbot+f8f3dfa4abc489e768a1@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-f2fs-devel/0000000000003cb3c405ed5c17f9@google.com/T/#u Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f1a46519a5fe..0f967b1e98f2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1077,7 +1077,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node, max_addrs; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1103,11 +1103,17 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - max_addrs = IS_INODE(node_page) ?
DEF_ADDRS_PER_INODE : - DEF_ADDRS_PER_BLOCK; - if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", - ofs_in_node, dni->ino, dni->nid, max_addrs); + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); f2fs_put_page(node_page, 1); return false; } -- cgit From 5b7b74b71c7fefbaa3e0ccc120c3cbd50b3fad86 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 12 Nov 2022 00:13:49 +0800 Subject: f2fs: remove submit label in __submit_discard_cmd() Complaint from Matthew Wilcox in another similar place: "submit? You don't submit anything at the 'submit' label. it should be called 'skip' or something. But I think this is just badly written and you don't need a goto at all." Let's remove the submit label for readability. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5ac026a57228..8b0b76550578 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1143,13 +1143,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; - goto submit; - } - err = __blkdev_issue_discard(bdev, + } else { + err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); -submit: + } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) -- cgit From b7ad23cec26a91a4f7c45ff7ff8e915f21ac5127 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:35 +0800 Subject: f2fs: fix to alloc_mode changed after remount on a small volume device Commit 84b89e5d943d8 ("f2fs: add auto tuning for small devices") added tuning for small volume devices, setting alloc_mode to 'reuse' when the volume is small. But alloc_mode changes back to 'default' on remount of such a small device. This patch fixes alloc_mode being changed on remount for a small volume device.
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 75027ff85cd9..96cfe626a670 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2059,7 +2059,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= + SMALL_VOLUME_SEGMENTS) + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + else + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); @@ -4077,7 +4081,6 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; if (f2fs_block_unit_discard(sbi)) sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | -- cgit From 777cd95b8066ced4e9a2534941b81f8ad98e74fb Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:36 +0800 Subject: f2fs: cleanup for 'f2fs_tuning_parameters' function A cleanup patch for the 'f2fs_tuning_parameters' function. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96cfe626a670..05101cd4140b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4077,13 +4077,11 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_i = SM_I(sbi); - /* adjust parameters according to the volume size */ - if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } -- cgit From 66aee5aaa237e5a3475581b462b8b22e0944d264 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:37 +0800 Subject: f2fs: change type for 'sbi->readdir_ra' Before this patch, the variable 'readdir_ra' only takes effect based on whether or not it is equal to '1', so we can change its type from 'int' to 'bool'.
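To make the semantic change concrete, here is a small illustrative snippet (plain C with hypothetical values, not kernel code): the old int field was compared against the exact value 1, so writing 2 via sysfs disabled readahead, while the bool field together with the new '!!t' sysfs handler treats any non-zero write as enabled.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long t = 2;		/* e.g. "echo 2" into the sysfs node */
	bool ra_old = (t == 1);		/* old test: the value 2 meant "off" */
	bool ra_new = !!t;		/* new handler: any non-zero means "on" */

	printf("old=%d new=%d\n", ra_old, ra_new);
	return 0;
}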
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 5 +++++ 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 030b7fd4142f..8e025157f35c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1010,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; - bool readdir_ra = sbi->readdir_ra == 1; + bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b89b5d755ce0..96bd3461c0bb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1698,7 +1698,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ - int readdir_ra; /* readahead inode in readdir */ + bool readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 05101cd4140b..31435c8645c8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4085,7 +4085,7 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) 1 << F2FS_IPU_HONOR_OPU_WRITE; } - sbi->readdir_ra = 1; + sbi->readdir_ra = true; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 97bf0dbb0974..33ec467b3772 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -656,6 +656,11 @@ out: return count; } + if (!strcmp(a->attr.name, "readdir_ra")) { + sbi->readdir_ra = !!t; + return count; + } + *ui = (unsigned int)t; return count; -- cgit From 4ff23a6547b81ca22adb852dfe93ee5fc45328ac Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 17 Nov 2022 23:10:54 +0800 Subject: f2fs: set zstd compress level correctly Fixes: cf30f6a5f0c6 ("lib: zstd: Add kernel-specific API") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Reviewed-by: Nick Terrell Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d315c2de136f..74d3f2d2271f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -346,7 +346,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; - params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen); + params = zstd_get_params(level, cc->rlen); workspace_size = zstd_cstream_workspace_bound(&params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), -- cgit From bc12ac98ea2e1b70adc6478c8b473a0003b659d3 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Jun 2022 19:26:46 +0800 Subject: ext4: silence the warning when evicting inode with dioread_nolock When evicting an inode with the default dioread_nolock, it can be raced by the unwritten-extent-converting kworker after writing back some newly allocated dirty blocks. The kworker converts unwritten extents to written; the extents can be merged to the upper level, freeing extent blocks, so it can mark the inode dirty again even though this inode has already been marked I_FREEING. But the inode->i_io_list check and warning in ext4_evict_inode() miss this corner case.
Fortunately, ext4_evict_inode() waits for all extent conversions to finish before this check, so it will not lead to an inode use-after-free problem; everything is OK besides this warning. The WARN_ON_ONCE was originally designed for finding inode use-after-free issues in advance, but if we add the current dioread_nolock case in, it becomes much less useful, so fix this warning by simply removing this check. ====== WARNING: CPU: 7 PID: 1092 at fs/ext4/inode.c:227 ext4_evict_inode+0x875/0xc60 ... RIP: 0010:ext4_evict_inode+0x875/0xc60 ... Call Trace: evict+0x11c/0x2b0 iput+0x236/0x3a0 do_unlinkat+0x1b4/0x490 __x64_sys_unlinkat+0x4c/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fa933c1115b ====== rm kworker ext4_end_io_end() vfs_unlink() ext4_unlink() ext4_convert_unwritten_io_end_vec() ext4_convert_unwritten_extents() ext4_map_blocks() ext4_ext_map_blocks() ext4_ext_try_to_merge_up() __mark_inode_dirty() check !I_FREEING locked_inode_to_wb_and_lock_list() iput() iput_final() evict() ext4_evict_inode() truncate_inode_pages_final() //wait release io_end inode_io_list_move_locked() ext4_release_io_end() trigger WARN_ON_ONCE() ====== Cc: stable@kernel.org Fixes: ceff86fddae8 ("ext4: Avoid freeing inodes on dirty list") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220629112647.4141034-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b5ef1b64249..7c5f5dabe0fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -222,13 +222,13 @@ void ext4_evict_inode(struct inode *inode) /* * For inodes with journalled data, transaction commit could have - * dirtied the inode. Flush worker is ignoring it because of I_FREEING - * flag but we still need to remove the inode from the writeback lists. + * dirtied the inode. And for inodes with dioread_nolock, unwritten + * extents converting worker could merge extents and also have dirtied + * the inode. Flush worker is ignoring it because of I_FREEING flag but + * we still need to remove the inode from the writeback lists. */ - if (!list_empty_careful(&inode->i_io_list)) { - WARN_ON_ONCE(!ext4_should_journal_data(inode)); + if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); - } /* * Protect us against freezing - iput() caller didn't have to have any -- cgit From 787caf1bdcd9f04058e4e8d8ed56db1dbafea0b7 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:29 +0800 Subject: f2fs: fix to enable compress for newly created file if extension matches If compress_extension is set, and a newly created file matches the extension, the file can be marked as a compressed file. However, if inline_data is also enabled, there is no chance to check its extension since f2fs_should_compress() always returns false. This patch moves set_compress_inode(), which does the extension check, so that extensions are checked before the inline data flag is set.
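The ordering established by this patch can be summarized with a simplified sketch (the wrapper name here is hypothetical; the two calls and their order follow the diff below):

/* Sketch of the new flag-setup order in f2fs_new_inode(): the
 * extension-based compression decision runs first, so it can no longer
 * be masked by inline_data. */
static void init_new_inode_flags(struct f2fs_sb_info *sbi, struct inode *dir,
				 struct inode *inode, const char *name)
{
	/* 1. decide compression from the file name (and inheritance) */
	set_compress_new_inode(sbi, dir, inode, name);

	/* 2. only then consider inline_data, if it is still allowed */
	if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
		set_inode_flag(inode, FI_INLINE_DATA);
}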
Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/namei.c | 329 ++++++++++++++++++++++++++++---------------------------- 2 files changed, 164 insertions(+), 167 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 96bd3461c0bb..f0833638f59e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2980,7 +2980,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) + F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e104409c3a0e..54448dccbb6a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,8 +22,163 @@ #include "acl.h" #include +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + int i; + + if (sublen == 1 && *sub == '*') + return 1; + + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension + (optional: '.' + temp extension)". + */ + if (slen < sublen + 2) + return 0; + + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } + + return 0; +} + +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + memcpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + memcpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; +} + +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, 
const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi)) + return; + + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + + /* This name comes only from normal files. */ + if (!name) + return; + + /* Don't compress hot files. */ + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], false)) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) + return; + + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_extension_exist(name, noext[i], false)) + return; + + /* Compress wanting extension. */ + for (i = 0; i < ext_cnt; i++) { + if (is_extension_exist(name, ext[i], false)) { + set_compress_context(inode); + return; + } + } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, - struct inode *dir, umode_t mode) + struct inode *dir, umode_t mode, + const char *name) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; @@ -114,12 +269,8 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); - if (f2fs_sb_has_compression(sbi)) { - /* Inherit the compression flag in directory */ - if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && - f2fs_may_compress(inode)) - set_compress_context(inode); - } + /* Check compression first. */ + set_compress_new_inode(sbi, dir, inode, name); /* Should enable inline_data after compression set */ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -153,40 +304,6 @@ fail_drop: return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub, - bool tmp_ext) -{ - size_t slen = strlen(s); - size_t sublen = strlen(sub); - int i; - - if (sublen == 1 && *sub == '*') - return 1; - - /* - * filename format of multimedia file should be defined as: - * "filename + '.' + extension + (optional: '.' + temp extension)". 
- */ - if (slen < sublen + 2) - return 0; - - if (!tmp_ext) { - /* file has no temp extension */ - if (s[slen - sublen - 1] != '.') - return 0; - return !strncasecmp(s + slen - sublen, sub, sublen); - } - - for (i = 1; i < slen - sublen; i++) { - if (s[i] != '.') - continue; - if (!strncasecmp(s + i + 1, sub, sublen)) - return 1; - } - - return 0; -} - /* * Set file's temperature for hot/cold data separation */ @@ -217,124 +334,6 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * file_set_hot(inode); } -int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, - bool hot, bool set) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int cold_count = le32_to_cpu(sbi->raw_super->extension_count); - int hot_count = sbi->raw_super->hot_ext_count; - int total_count = cold_count + hot_count; - int start, count; - int i; - - if (set) { - if (total_count == F2FS_MAX_EXTENSION) - return -EINVAL; - } else { - if (!hot && !cold_count) - return -EINVAL; - if (hot && !hot_count) - return -EINVAL; - } - - if (hot) { - start = cold_count; - count = total_count; - } else { - start = 0; - count = cold_count; - } - - for (i = start; i < count; i++) { - if (strcmp(name, extlist[i])) - continue; - - if (set) - return -EINVAL; - - memcpy(extlist[i], extlist[i + 1], - F2FS_EXTENSION_LEN * (total_count - i - 1)); - memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); - if (hot) - sbi->raw_super->hot_ext_count = hot_count - 1; - else - sbi->raw_super->extension_count = - cpu_to_le32(cold_count - 1); - return 0; - } - - if (!set) - return -EINVAL; - - if (hot) { - memcpy(extlist[count], name, strlen(name)); - sbi->raw_super->hot_ext_count = hot_count + 1; - } else { - char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; - - memcpy(buf, &extlist[cold_count], - F2FS_EXTENSION_LEN * hot_count); - memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); - memcpy(extlist[cold_count], name, strlen(name)); - memcpy(&extlist[cold_count + 1], buf, - F2FS_EXTENSION_LEN * hot_count); - sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); - } - return 0; -} - -static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; - unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; - unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; - unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; - int i, cold_count, hot_count; - - if (!f2fs_sb_has_compression(sbi) || - F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode) || - (!ext_cnt && !noext_cnt)) - return; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], false)) { - f2fs_up_read(&sbi->sb_lock); - return; - } - } - - f2fs_up_read(&sbi->sb_lock); - - for (i = 0; i < noext_cnt; i++) { - if (is_extension_exist(name, noext[i], false)) { - f2fs_disable_compressed_file(inode); - return; - } - } - - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return; - - for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i], false)) - continue; - - /* Do not use inline_data with compression */ - stat_dec_inline_inode(inode); - clear_inode_flag(inode, 
FI_INLINE_DATA); - set_compress_context(inode); - return; - } -} - static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -352,15 +351,13 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) set_file_temperature(sbi, inode, dentry->d_name.name); - set_compress_inode(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -689,7 +686,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -760,7 +757,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode); + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -817,7 +814,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -856,7 +853,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); -- cgit From b16bcaaf7a325f90967259a0b7cfcce4ff8c56ba Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:30 +0800 Subject: f2fs: move set_file_temperature into f2fs_new_inode Since the file name has already been passed to f2fs_new_inode(), let's move set_file_temperature() into f2fs_new_inode().
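For reference, the following hypothetical calls illustrate how the tmp_ext flag of is_extension_exist(), which set_file_temperature() uses with tmp_ext == true, changes the matching (derived from the implementation shown earlier in this series):

/* Hypothetical inputs and results for is_extension_exist():
 *
 *   is_extension_exist("movie.mp4",     "mp4", true)  -> 1  (plain match)
 *   is_extension_exist("movie.mp4.tmp", "mp4", true)  -> 1  (temp extension tolerated)
 *   is_extension_exist("movie.mp4.tmp", "mp4", false) -> 0  (name must end in ".mp4")
 *   is_extension_exist("anything",      "*",   false) -> 1  (wildcard always matches)
 */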
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 62 +++++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 54448dccbb6a..58a91ce8fe08 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -176,6 +176,32 @@ inherit_comp: } } +/* + * Set file's temperature for hot/cold data separation + */ +static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = 0; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], true)) + break; + f2fs_up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, struct inode *dir, umode_t mode, const char *name) @@ -276,6 +302,9 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); + if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, name); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -304,36 +333,6 @@ fail_drop: return ERR_PTR(err); } -/* - * Set file's temperature for hot/cold data separation - */ -static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, cold_count, hot_count; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], true)) - break; - } - - f2fs_up_read(&sbi->sb_lock); - - if (i == cold_count + hot_count) - return; - - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); -} - static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -355,9 +354,6 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_file_temperature(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; -- cgit From 620816393239890feff8608251e2746b1cc2cfa0 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 17 Nov 2022 01:10:45 +0800 Subject: f2fs: make __queue_discard_cmd() return void Since __queue_discard_cmd() never returns an error, let's make it return void. 
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8b0b76550578..14ece4bf7c7e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1358,13 +1358,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } -static int __queue_discard_cmd(struct f2fs_sb_info *sbi, +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) - return 0; + return; trace_f2fs_queue_discard(bdev, blkstart, blklen); @@ -1376,7 +1376,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); - return 0; } static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, @@ -1776,7 +1775,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } /* For conventional zones, use regular discard if supported */ - return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; } #endif @@ -1787,7 +1787,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __queue_discard_cmd(sbi, bdev, blkstart, blklen); + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, -- cgit From 78a99fe6254cad4be310cd84af39f6c46b668c72 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Nov 2022 06:42:52 +0800 Subject: f2fs: truncate blocks in batch in __complete_revoke_list() Use f2fs_do_truncate_blocks() to truncate all blocks in-batch in __complete_revoke_list(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 14ece4bf7c7e..37c721e1eb03 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -262,24 +262,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; - pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) { + if (revoke) __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); - } else if (truncate) { - f2fs_truncate_hole(inode, start_index, cur->index); - start_index = cur->index + 1; - } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) - f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); + f2fs_do_truncate_blocks(inode, 0, false); } static int __f2fs_commit_atomic_write(struct inode *inode) -- cgit From e219aecfd4b766c4e878a3769057e9809f7fcadc Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Tue, 22 Nov 2022 18:03:20 +0900 Subject: f2fs: avoid victim selection from previous victim section When f2fs chooses GC victim in large section & LFS mode, next_victim_seg[gc_type] is referenced first. After segment is freed, next_victim_seg[gc_type] has the next segment number. 
However, next_victim_seg[gc_type] still holds the last segment number even after the last segment of the section is freed. In this case, when f2fs chooses a victim for the next GC round, the last segment of the previous victim section is chosen as the victim. Initialize next_victim_seg[gc_type] to NULL_SEGNO for the last segment in a large section. Fixes: e3080b0120a1 ("f2fs: support subsectional garbage collection") Signed-off-by: Yonggil Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0f967b1e98f2..f1b68eda2235 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1749,8 +1749,9 @@ freed: get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - if (__is_large_section(sbi) && segno + 1 < end_segno) - sbi->next_victim_seg[gc_type] = segno + 1; + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO; skip: f2fs_put_page(sum_page, 0); } -- cgit From 48c08c51f938d955dd8f5b8972bc29faa4c9556f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 18 Nov 2022 11:46:00 +0800 Subject: f2fs: init discard policy after thread wakeup Under the current logic, after the discard thread wakes up, it does not run according to the newly expected policy, but keeps using the policy that was selected before it went to sleep. Move the strategy selection to after the thread wakes up, so that the running state of the thread meets expectations. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 37c721e1eb03..73ad8dc9a4d3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1679,6 +1679,11 @@ static int issue_discard_thread(void *data) set_freezable(); do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); @@ -1686,14 +1691,6 @@ static int issue_discard_thread(void *data) __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); - if (!atomic_read(&dcc->discard_cmd_cnt)) - wait_ms = dpolicy.max_interval; - - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, - msecs_to_jiffies(wait_ms)); - if (dcc->discard_wake) dcc->discard_wake = 0; @@ -1707,12 +1704,11 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } - if (!atomic_read(&dcc->discard_cmd_cnt)) - continue; sb_start_intwrite(sbi->sb); @@ -1727,6 +1723,8 @@ static int issue_discard_thread(void *data) } else { wait_ms = dpolicy.max_interval; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); -- cgit From 1cd2e6d544359ae13e6fd9029b6018b957cf08c3 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:01 +0800 Subject: f2fs: define MIN_DISCARD_GRANULARITY macro Do a cleanup in f2fs_tuning_parameters() and __init_discard_policy(): let's use a macro instead of a bare number.
Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f0833638f59e..4694b55b6df4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -329,6 +329,8 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* minimum discard granularity, unit: block count */ +#define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 /* default maximum discard granularity of ordered discard, unit: block count */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 73ad8dc9a4d3..c7afcc6cd75b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1065,7 +1065,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; @@ -1080,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 31435c8645c8..daf14b55a972 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4080,7 +4080,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->dcc_info->discard_granularity = + MIN_DISCARD_GRANULARITY; SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } -- cgit From 8a47d228de6a4fd4c751142fb27d56f385b3fe41 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:02 +0800 Subject: f2fs: introduce discard_urgent_util sysfs node Through this node, you can control whether background discard runs more or less aggressively once the utilization of the space reaches the configured threshold. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4694b55b6df4..296683648d4f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -413,6 +413,7 @@ struct discard_cmd_control { unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max.
interval between discard issue */ + unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c7afcc6cd75b..0ff451ea18f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1064,7 +1064,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = @@ -2079,6 +2079,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 33ec467b3772..a4745d596310 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -493,6 +493,13 @@ out: return count; } + if (!strcmp(a->attr.name, "discard_urgent_util")) { + if (t > 100) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -800,6 +807,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -930,6 +938,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), -- cgit From f43dc4dc3eff028b5ddddd99f3a66c5a6bdd4e78 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:11 +1100 Subject: iomap: buffered write failure should not truncate the page cache iomap_file_buffered_write_punch_delalloc() currently invalidates the page cache over the unused range of the delalloc extent that was allocated. While the write allocated the delalloc extent, it does not own it exclusively as the write does not hold any locks that prevent either writeback or mmap page faults from changing the state of either the page cache or the extent state backing this range. Whilst xfs_bmap_punch_delalloc_range() already handles races in extent conversion - it will only punch out delalloc extents and it ignores any other type of extent - the page cache truncate does not discriminate between data written by this write or some other task. 
As a result, truncating the page cache can result in data corruption if the write races with mmap modifications to the file over the same range. generic/346 exercises this workload, and if we randomly fail writes (as will happen when iomap gets stale iomap detection later in the patchset), it will randomly corrupt the file data because it removes data written by mmap() in the same page as the write() that failed. Hence we do not want to punch out the page cache over the range of the extent we failed to write to - what we actually need to do is detect the ranges that have dirty data in cache over them and *not punch them out*. To do this, we have to walk the page cache over the range of the delalloc extent we want to remove. This is made complex by the fact we have to handle partially up-to-date folios correctly and this can happen even when the FSB size == PAGE_SIZE because we now support multi-page folios in the page cache. Because we are only interested in discovering the edges of data ranges in the page cache (i.e. hole-data boundaries) we can make use of mapping_seek_hole_data() to find those transitions in the page cache. As we hold the invalidate_lock, we know that the boundaries are not going to change while we walk the range. This interface is also byte-based and is sub-page block aware, so we can find the data ranges in the cache based on byte offsets rather than page, folio or fs block sized chunks. This greatly simplifies the logic of finding dirty cached ranges in the page cache. Once we've identified a range that contains cached data, we can then iterate the range folio by folio. This allows us to determine if the data is dirty and hence perform the correct delalloc extent punching operations. The seek interface we use to iterate data ranges will give us sub-folio start/end granularity, so we may end up looking up the same folio multiple times as the seek interface iterates across each discontiguous data region in the folio. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 195 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 180 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 734b761a1e4a..dca9ec9dc4a8 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -832,6 +832,165 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +/* + * Scan the data range passed to us for dirty page cache folios. If we find a + * dirty folio, punch out the preceding range and update the offset + * from which the next punch will start. + * + * We can punch out storage reservations under clean pages because they either + * contain data that has been written back - in which case the delalloc punch + * over that range is a no-op - or they have been read faults in which case they + * contain zeroes and we can remove the delalloc backing range and any new + * writes to those pages will do the normal hole filling operation... + * + * This makes the logic simple: we need to keep the delalloc extents + * only over the dirty ranges of the page cache. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations.
+ */ +static int iomap_write_delalloc_scan(struct inode *inode, + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t offset, loff_t length)) +{ + while (start_byte < end_byte) { + struct folio *folio; + + /* grab locked page */ + folio = filemap_lock_folio(inode->i_mapping, + start_byte >> PAGE_SHIFT); + if (!folio) { + start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + + PAGE_SIZE; + continue; + } + + /* if dirty, punch up to offset */ + if (folio_test_dirty(folio)) { + if (start_byte > *punch_start_byte) { + int error; + + error = punch(inode, *punch_start_byte, + start_byte - *punch_start_byte); + if (error) { + folio_unlock(folio); + folio_put(folio); + return error; + } + } + + /* + * Make sure the next punch start is correctly bound to + * the end of this data range, not the end of the folio. + */ + *punch_start_byte = min_t(loff_t, end_byte, + folio_next_index(folio) << PAGE_SHIFT); + } + + /* move offset to start of next folio in range */ + start_byte = folio_next_index(folio) << PAGE_SHIFT; + folio_unlock(folio); + folio_put(folio); + } + return 0; +} + +/* + * Punch out all the delalloc blocks in the range given except for those that + * have dirty data still pending in the page cache - those are going to be + * written and so must still retain the delalloc backing for writeback. + * + * As we are scanning the page cache for data, we don't need to reimplement the + * wheel - mapping_seek_hole_data() does exactly what we need to identify the + * start and end of data ranges correctly even for sub-folio block sizes. This + * byte range based iteration is especially convenient because it means we + * don't have to care about variable size folios, nor where the start or end of + * the data range lies within a folio, if they lie within the same folio or even + * if there are multiple discontiguous data ranges within the folio. + * + * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so + * can return data ranges that exist in the cache beyond EOF. e.g. a page fault + * spanning EOF will initialise the post-EOF data to zeroes and mark it up to + * date. A write page fault can then mark it dirty. If we then fail a write() + * beyond EOF into that up to date cached range, we allocate a delalloc block + * beyond EOF and then have to punch it out. Because the range is up to date, + * mapping_seek_hole_data() will return it, and we will skip the punch because + * the folio is dirty. This is incorrect - we always need to punch out delalloc + * beyond EOF in this case as writeback will never write back and convert that + * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, + * resulting in always punching out the range from the EOF to the end of the + * range the iomap spans. + * + * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it + * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA + * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) + * returns the end of the data range (data_end). Using closed intervals would + * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose + * the code to subtle off-by-one bugs....
+ */ +static int iomap_write_delalloc_release(struct inode *inode, + loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t punch_start_byte = start_byte; + loff_t scan_end_byte = min(i_size_read(inode), end_byte); + int error = 0; + + /* + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite whilst we walk the + * cache and perform delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); + while (start_byte < scan_end_byte) { + loff_t data_end; + + start_byte = mapping_seek_hole_data(inode->i_mapping, + start_byte, scan_end_byte, SEEK_DATA); + /* + * If there is no more data to scan, all that is left is to + * punch out the remaining range. + */ + if (start_byte == -ENXIO || start_byte == scan_end_byte) + break; + if (start_byte < 0) { + error = start_byte; + goto out_unlock; + } + WARN_ON_ONCE(start_byte < punch_start_byte); + WARN_ON_ONCE(start_byte > scan_end_byte); + + /* + * We find the end of this contiguous cached data range by + * seeking from start_byte to the beginning of the next hole. + */ + data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, + scan_end_byte, SEEK_HOLE); + if (data_end < 0) { + error = data_end; + goto out_unlock; + } + WARN_ON_ONCE(data_end <= start_byte); + WARN_ON_ONCE(data_end > scan_end_byte); + + error = iomap_write_delalloc_scan(inode, &punch_start_byte, + start_byte, data_end, punch); + if (error) + goto out_unlock; + + /* The next data search starts at the end of this one. */ + start_byte = data_end; + } + + if (punch_start_byte < end_byte) + error = punch(inode, punch_start_byte, + end_byte - punch_start_byte); +out_unlock: + filemap_invalidate_unlock(inode->i_mapping); + return error; +} + /* * When a short write occurs, the filesystem may need to remove reserved space * that was allocated in ->iomap_begin from it's ->iomap_end method. For @@ -842,8 +1001,25 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write); * allocated for this iomap. * * This function uses [start_byte, end_byte) intervals (i.e. open ended) to - * simplify range iterations, but converts them back to {offset,len} tuples for - * the punch callback. + * simplify range iterations. + * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. 
+ * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock */ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, @@ -853,7 +1029,6 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t start_byte; loff_t end_byte; int blocksize = i_blocksize(inode); - int error = 0; if (iomap->type != IOMAP_DELALLOC) return 0; @@ -877,18 +1052,8 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, if (start_byte >= end_byte) return 0; - /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite between the page cache - * truncation and the delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. - */ - filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(inode, start_byte, end_byte - 1); - error = punch(inode, start_byte, end_byte - start_byte); - filemap_invalidate_unlock(inode->i_mapping); - - return error; + return iomap_write_delalloc_release(inode, start_byte, end_byte, + punch); } EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); -- cgit From 7348b322332d8602a4133f0b861334ea021b134a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: xfs: xfs_bmap_punch_delalloc_range() should take a byte range All the callers of xfs_bmap_punch_delalloc_range() jump through hoops to convert a byte range to filesystem blocks before calling xfs_bmap_punch_delalloc_range(). Instead, pass the byte range to xfs_bmap_punch_delalloc_range() and have it do the conversion to filesystem blocks internally. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong --- fs/xfs/xfs_aops.c | 16 ++++++---------- fs/xfs/xfs_bmap_util.c | 10 ++++++---- fs/xfs/xfs_bmap_util.h | 2 +- fs/xfs/xfs_iomap.c | 8 ++------ 4 files changed, 15 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5d1a995b15f8..6aadc5815068 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -114,9 +114,8 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, - XFS_B_TO_FSBT(mp, offset), - XFS_B_TO_FSB(mp, size)); + xfs_bmap_punch_delalloc_range(ip, offset, + offset + size); } goto done; } @@ -455,12 +454,8 @@ xfs_discard_folio( struct folio *folio, loff_t pos) { - struct inode *inode = folio->mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(folio->mapping->host); struct xfs_mount *mp = ip->i_mount; - size_t offset = offset_in_folio(folio, pos); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) @@ -470,8 +465,9 @@ xfs_discard_folio( "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", folio, ip->i_ino, pos); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_folio(inode, folio) - pageoff_fsb); + error = xfs_bmap_punch_delalloc_range(ip, pos, + round_up(pos, folio_size(folio))); + if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 04d0c2bff67c..867645b74d88 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -590,11 +590,13 @@ out_unlock_iolock: int xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length) + xfs_off_t start_byte, + xfs_off_t end_byte) { + struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = &ip->i_df; - xfs_fileoff_t end_fsb = start_fsb + length; + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range( while (got.br_startoff + got.br_blockcount > start_fsb) { del = got; - xfs_trim_extent(&del, start_fsb, length); + xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); /* * A delete can push the cursor forward. 
Step back to the diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 24b37d211f1d..6888078f5c31 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t length); + xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ea96e8a34868..09676ff6940e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1126,12 +1126,8 @@ xfs_buffered_write_delalloc_punch( loff_t offset, loff_t length) { - struct xfs_mount *mp = XFS_M(inode->i_sb); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); - - return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, - end_fsb - start_fsb); + return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, + offset + length); } static int -- cgit From d7b64041164ca177170191d2ad775da074ab2926 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: iomap: write iomap validity checks A recent multithreaded write data corruption has been uncovered in the iomap write code. The core of the problem is that partial folio writes can be flushed to disk while a new racing write can map it and fill the rest of the page:

	writeback			new write

	allocate blocks
	  blocks are unwritten
	submit IO
	.....
					map blocks
					iomap indicates UNWRITTEN range
					loop {
					  lock folio
					  copyin data
	.....
	IO completes
	  runs unwritten extent conv
	    blocks are marked written
					  get next folio
					}

Now add memory pressure such that memory reclaim evicts the partially written folio that has already been written to disk. When the new write finally gets to the last partial page of the new write, it does not find it in cache, so it instantiates a new page, sees the iomap is unwritten, and zeros the part of the page that it does not have data from. This overwrites the data on disk that was originally written. The full description of the corruption mechanism can be found here: https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ To solve this problem, we need to check whether the iomap is still valid after we lock each folio during the write. We have to do it after we lock the page so that we don't end up with state changes occurring while we wait for the folio to be locked. Hence we need a mechanism to be able to check that the cached iomap is still valid (similar to what we already do in buffered writeback), and we need a way for ->begin_write to back out and tell the high level iomap iterator that we need to remap the remaining write range. The iomap needs to grow some storage for the validity cookie that the filesystem provides to travel with the iomap. XFS, in particular, also needs to know some more information about what the iomap maps (attribute extents rather than file data extents) in order for the validity cookie to cover all the types of iomaps we might need to validate. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J.
Wong --- fs/iomap/buffered-io.c | 29 ++++++++++++++++++++++++++++- fs/iomap/iter.c | 19 ++++++++++++++++++- 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index dca9ec9dc4a8..356193e44cf0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); } -static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, +static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; @@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } + + /* + * Now we have a locked folio, before we do anything with it we need to + * check that the iomap we have cached is not stale. The inode extent + * mapping can change due to concurrent IO in flight (e.g. + * IOMAP_UNWRITTEN state can change and memory reclaim could have + * reclaimed a previously partially written page at this index after IO + * completion before this write reaches this file offset) and hence we + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. + */ + if (page_ops && page_ops->iomap_valid) { + bool iomap_valid = page_ops->iomap_valid(iter->inode, + &iter->iomap); + if (!iomap_valid) { + iter->iomap.flags |= IOMAP_F_STALE; + status = 0; + goto out_unlock; + } + } + if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; @@ -773,6 +794,8 @@ again: status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) break; + if (iter->iomap.flags & IOMAP_F_STALE) + break; page = folio_file_page(folio, pos >> PAGE_SHIFT); if (mapping_writably_mapped(mapping)) @@ -1081,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; status = iomap_write_end(iter, pos, bytes, bytes, folio); if (WARN_ON_ONCE(status == 0)) @@ -1136,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index a1c7592d2ade..79a0614eaab7 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,12 +7,28 @@ #include #include "trace.h" +/* + * Advance to the next range we need to map. + * + * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully + * processed - it was aborted because the extent the iomap spanned may have been + * changed during the operation. In this case, the iteration behaviour is to + * remap the unprocessed range of the iter, and that means we may need to remap + * even when we've made no progress (i.e. iter->processed = 0). Hence the + * "finished iterating" case needs to distinguish between + * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we + * need to remap the entire remaining range. 
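+ *
+ * A summary sketch of the cases handled below:
+ *
+ *	processed < 0			propagate the error
+ *	processed == 0 && !stale	iteration is complete
+ *	processed == 0 && stale		remap the same, unprocessed range
+ *	processed > 0			advance past the processed bytes;
+ *					any stale remainder is then remapped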
+ */ static inline int iomap_iter_advance(struct iomap_iter *iter) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + /* handle the previous iteration (if any) */ if (iter->iomap.length) { - if (iter->processed <= 0) + if (iter->processed < 0) return iter->processed; + if (!iter->processed && !stale) + return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; @@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) { WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) -- cgit From 304a68b9c63bbfc1f6e159d68e8892fc54a06067 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: xfs: use iomap_valid method to detect stale cached iomaps Now that iomap supports a mechanism to validate cached iomaps for buffered write operations, hook it up to the XFS buffered write ops so that we can avoid data corruptions that result from stale cached iomaps. See: https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ or the ->iomap_valid() introduction commit for exact details of the corruption vector. The validity cookie we store in the iomap is based on the type of iomap we return. It is expected that the iomap->flags we set in xfs_bmbt_to_iomap() are not perturbed by the iomap core and are returned to us in the iomap passed via the .iomap_valid() callback. This ensures that the validity cookie is always checking the correct inode fork sequence numbers to detect potential changes that affect the extent cached by the iomap. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 6 ++- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_iomap.c | 95 ++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_iomap.h | 5 ++- fs/xfs/xfs_pnfs.c | 6 ++- 5 files changed, 87 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 49d0d4ea63fc..56b9b7db38bb 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset.
*/ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6aadc5815068..a22d90af40c8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -372,7 +372,7 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 09676ff6940e..26ca3cc1a048 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -48,13 +48,45 @@ xfs_alert_fsblock_zero( return -EFSCORRUPTED; } +u64 +xfs_iomap_inode_sequence( + struct xfs_inode *ip, + u16 iomap_flags) +{ + u64 cookie = 0; + + if (iomap_flags & IOMAP_F_XATTR) + return READ_ONCE(ip->i_af.if_seq); + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; + return cookie | READ_ONCE(ip->i_df.if_seq); +} + +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. + */ +static bool +xfs_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + return iomap->validity_cookie == + xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); +} + +const struct iomap_page_ops xfs_iomap_page_ops = { + .iomap_valid = xfs_iomap_valid, +}; + int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags) + u16 iomap_flags, + u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -91,6 +123,9 @@ xfs_bmbt_to_iomap( if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; + iomap->page_ops = &xfs_iomap_page_ops; return 0; } @@ -195,7 +230,8 @@ xfs_iomap_write_direct( xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + u64 *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -285,6 +321,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: + *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -743,6 +780,7 @@ xfs_direct_write_iomap_begin( bool shared = false; u16 iomap_flags = 0; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -811,9 +849,10 @@ xfs_direct_write_iomap_begin( goto out_unlock; } + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); allocate_blocks: error = -EAGAIN; @@ -839,24 +878,26 @@ allocate_blocks: xfs_iunlock(ip, lockmode); error = 
xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - flags, &imap); + flags, &imap, &seq); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - iomap_flags | IOMAP_F_NEW); + iomap_flags | IOMAP_F_NEW, seq); out_found_cow: - xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + seq = xfs_iomap_inode_sequence(ip, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); out_unlock: if (lockmode) @@ -915,6 +956,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1094,26 +1136,31 @@ retry: * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1193,6 +1240,7 @@ xfs_read_iomap_begin( int nimaps = 1, error = 0; bool shared = false; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1206,13 +1254,14 @@ xfs_read_iomap_begin( &nimaps, 0); if (!error && (flags & IOMAP_REPORT)) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); xfs_iunlock(ip, lockmode); if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - shared ? IOMAP_F_SHARED : 0); + shared ? 
IOMAP_F_SHARED : 0, seq); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1237,6 +1286,7 @@ xfs_seek_iomap_begin( struct xfs_bmbt_irec imap, cmap; int error = 0; unsigned lockmode; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1271,8 +1321,9 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1293,8 +1344,9 @@ xfs_seek_iomap_begin( imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; done: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1320,6 +1372,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1336,12 +1389,14 @@ xfs_xattr_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: + + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); xfs_iunlock(ip, lockmode); if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 0f62ab633040..4da13440bae9 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,14 +13,15 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap); + struct xfs_bmbt_irec *imap, u64 *sequence); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); +u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags); + u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, bool *did_zero); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 37a24f0f7cd4..38d23f0e703a 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -125,6 +125,7 @@ xfs_fs_map_blocks( int nimaps = 1; uint lock_flags; int error = 0; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -176,6 +177,7 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); + seq = xfs_iomap_inode_sequence(ip, 0); ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); @@ -189,7 +191,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, 0, &imap); + end_fsb - offset_fsb, 0, &imap, &seq); if (error) goto out_unlock; @@ -209,7 +211,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); *device_generation = mp->m_generation; return error; 
out_unlock: -- cgit From 6e8af15ccdc4e138a5b529c1901a0013e1dcaa09 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: xfs: drop write error injection is unfixable, remove it With the changes to scan the page cache for dirty data to avoid data corruptions from partial write cleanup racing with other page cache operations, the drop writes error injection no longer works the same way it used to and causes xfs/196 to fail. This is because xfs/196 writes to the file and populates the page cache before it turns on the error injection and starts failing -overwrites-. The result is that the original drop-writes code failed writes only -after- overwriting the data in the cache, followed by invalidating the cached data and then punching out the delalloc extent from under that data. On the surface, this looks fine. The problem is that page cache invalidation *doesn't guarantee that it removes anything from the page cache* and it doesn't change the dirty state of the folio. When block size == page size and we do page aligned IO (as xfs/196 does) everything happens to align perfectly and page cache invalidation removes the single page folios that span the written data. Hence the followup delalloc punch pass does not find cached data over that range and it can punch the extent out. IOWs, xfs/196 "works" for block size == page size with the new code. I say "works", because it actually only works for the case where IO is page aligned, and no data was read from disk before writes occur. Because the moment we actually read data first, the readahead code allocates multipage folios and suddenly the invalidate code goes back to zeroing subfolio ranges without changing dirty state. Hence, with multipage folios in play, block size == page size is functionally identical to block size < page size behaviour, and drop-writes is manifestly broken w.r.t. this case. Invalidation of a subfolio range doesn't result in the folio being removed from the cache, just the range gets zeroed. Hence after we've sequentially walked over a folio that we've dirtied (via write data) and then invalidated, we end up with a dirty folio full of zeroed data. And because the new code skips punching ranges that have dirty folios covering them, we end up leaving the delalloc range intact after failing all the writes. Hence failed writes now end up writing zeroes to disk in the cases where invalidation zeroes folios rather than removing them from cache. This is a fundamental change of behaviour that is needed to avoid the data corruption vectors that exist in the old write fail path, and it renders the drop-writes injection non-functional and unworkable as it stands. As it is, I think the error injection is also now unnecessary, as partial writes that need delalloc extent removal are going to be a lot more common with stale iomap detection in place. Hence this patch removes the drop-writes error injection completely. xfs/196 can remain for testing kernels that don't have this data corruption fix, but those that do will report: xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel. Signed-off-by: Dave Chinner Reviewed-by: Darrick J.
Wong --- fs/xfs/libxfs/xfs_errortag.h | 12 +++++------- fs/xfs/xfs_error.c | 27 ++++++++++++++++++++------- fs/xfs/xfs_iomap.c | 9 --------- 3 files changed, 25 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 5362908164b0..580ccbd5aadc 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -40,13 +40,12 @@ #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 + /* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. + * Drop-writes support removed because write error handling cannot trash + * pre-existing delalloc extents in any useful way anymore. We retain the + * definition so that we can reject it as an invalid value in + * xfs_errortag_valid(). */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 @@ -95,7 +94,6 @@ #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 #define XFS_RANDOM_BMAP_FINISH_ONE 1 #define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c6b2aabd6f18..dea3c0649d2f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REFCOUNT_FINISH_ONE, XFS_RANDOM_BMAP_FINISH_ONE, XFS_RANDOM_AG_RESV_CRITICAL, - XFS_RANDOM_DROP_WRITES, + 0, /* XFS_RANDOM_DROP_WRITES has been removed */ XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, @@ -162,7 +162,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); @@ -206,7 +205,6 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), @@ -256,6 +254,19 @@ xfs_errortag_del( kmem_free(mp->m_errortag); } +static bool +xfs_errortag_valid( + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return false; + + /* Error out removed injection types */ + if (error_tag == XFS_ERRTAG_DROP_WRITES) + return false; + return true; +} + bool xfs_errortag_test( struct xfs_mount *mp, @@ -277,7 +288,9 @@ xfs_errortag_test( if (!mp->m_errortag) return false; - ASSERT(error_tag < XFS_ERRTAG_MAX); + if (!xfs_errortag_valid(error_tag)) + return false; + randfactor = mp->m_errortag[error_tag]; if (!randfactor || prandom_u32_max(randfactor)) return false; @@ -293,7 +306,7 @@ xfs_errortag_get( struct xfs_mount *mp, unsigned int error_tag) { - 
if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return mp->m_errortag[error_tag]; @@ -305,7 +318,7 @@ xfs_errortag_set( unsigned int error_tag, unsigned int tag_value) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; mp->m_errortag[error_tag] = tag_value; @@ -319,7 +332,7 @@ xfs_errortag_add( { BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return xfs_errortag_set(mp, error_tag, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 26ca3cc1a048..1bdd7afc1010 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1190,15 +1190,6 @@ xfs_buffered_write_iomap_end( struct xfs_mount *mp = XFS_M(inode->i_sb); int error; - /* - * Behave as if the write failed if drop writes is enabled. Set the NEW - * flag to force delalloc cleanup. - */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { - iomap->flags |= IOMAP_F_NEW; - written = 0; - } - error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, length, written, &xfs_buffered_write_delalloc_punch); if (error && !xfs_is_shutdown(mp)) { -- cgit From c2beff99eb03866df6fdbd3a93b08fd27eb8bf5c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:35 -0800 Subject: xfs: add debug knob to slow down writeback for fun Add a new error injection knob so that we can arbitrarily slow down writeback to test for race conditions and aberrant reclaim behavior if the writeback mechanisms are slow to issue writeback. This will enable functional testing for the ifork sequence counters introduced in commit 745b3f76d1c8 ("xfs: maintain a sequence count for inode fork manipulations"). Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_aops.c | 14 ++++++++++++-- fs/xfs/xfs_error.c | 16 ++++++++++++++++ fs/xfs/xfs_error.h | 13 +++++++++++++ fs/xfs/xfs_trace.c | 2 ++ fs/xfs/xfs_trace.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 90 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 580ccbd5aadc..f5f629174eca 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -61,7 +61,8 @@ #define XFS_ERRTAG_LARP 39 #define XFS_ERRTAG_DA_LEAF_SPLIT 40 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 -#define XFS_ERRTAG_MAX 42 +#define XFS_ERRTAG_WB_DELAY_MS 42 +#define XFS_ERRTAG_MAX 43 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -107,5 +108,6 @@ #define XFS_RANDOM_LARP 1 #define XFS_RANDOM_DA_LEAF_SPLIT 1 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 +#define XFS_RANDOM_WB_DELAY_MS 3000 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a22d90af40c8..41734202796f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -17,6 +17,8 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_reflink.h" +#include "xfs_errortag.h" +#include "xfs_error.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -217,11 +219,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. 
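 *
 * As a concrete illustration (the sequence numbers are made up): if this
 * mapping was cached while the data fork was at sequence 5 and a concurrent
 * write has since bumped ip->i_df.if_seq to 6, the cached iomap may no
 * longer describe the blocks under this folio, so it must be thrown away
 * and the blocks remapped.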
*/ - if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) { + trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->data_seq, XFS_DATA_FORK); return false; + } if (xfs_inode_has_cow_data(ip) && - XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) { + trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->cow_seq, XFS_COW_FORK); return false; + } return true; } @@ -285,6 +293,8 @@ xfs_map_blocks( if (xfs_is_shutdown(mp)) return -EIO; + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + /* * COW fork blocks can overlap data fork blocks even if the blocks * aren't shared. COW I/O always takes precedent, so we must always diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dea3c0649d2f..2d6e3c718e03 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -60,6 +60,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_LARP, XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, + XFS_RANDOM_WB_DELAY_MS, }; struct xfs_errortag_attr { @@ -175,6 +176,7 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); +XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -218,6 +220,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(larp), XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), + XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); @@ -267,6 +270,19 @@ xfs_errortag_valid( return true; } +bool +xfs_errortag_enabled( + struct xfs_mount *mp, + unsigned int tag) +{ + if (!mp->m_errortag) + return false; + if (!xfs_errortag_valid(tag)) + return false; + + return mp->m_errortag[tag] != 0; +} + bool xfs_errortag_test( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5191e9145e55..dbe6c37dc697 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -45,6 +45,18 @@ extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, const char *file, int line, unsigned int error_tag); #define XFS_TEST_ERROR(expr, mp, tag) \ ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); +#define XFS_ERRORTAG_DELAY(mp, tag) \ + do { \ + might_sleep(); \ + if (!xfs_errortag_enabled((mp), (tag))) \ + break; \ + xfs_warn_ratelimited((mp), \ +"Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \ + (mp)->m_errortag[(tag)], __FILE__, __LINE__, \ + (mp)->m_super->s_id); \ + mdelay((mp)->m_errortag[(tag)]); \ + } while (0) extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, @@ -55,6 +67,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) #define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) #define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index d269ef57ff01..8a5dc1538aa8 100644 --- a/fs/xfs/xfs_trace.c +++ 
b/fs/xfs/xfs_trace.c @@ -34,6 +34,8 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_error.h" +#include +#include "xfs_iomap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 372d871bccc5..c9ada9577a4a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3352,6 +3352,50 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ TP_ARGS(ip, irec)) +/* inode iomap invalidation events */ +DECLARE_EVENT_CLASS(xfs_wb_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), + TP_ARGS(ip, iomap, wpcseq, whichfork), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u16, type) + __field(u16, flags) + __field(u32, wpcseq) + __field(u32, forkseq) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->wpcseq = wpcseq; + __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->wpcseq, + __entry->forkseq) +); +#define DEFINE_WB_INVALID_EVENT(name) \ +DEFINE_EVENT(xfs_wb_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \ + TP_ARGS(ip, iomap, wpcseq, whichfork)) +DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); +DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); + /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ -- cgit From 254e3459285cbf2174350bbc0051e475e1bc5196 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:36 -0800 Subject: xfs: add debug knob to slow down write for fun Add a new error injection knob so that we can arbitrarily slow down pagecache writes to test for race conditions and aberrant reclaim behavior if the writeback mechanisms are slow to issue writeback. This will enable functional testing for the ifork sequence counters introduced in commit 304a68b9c63b ("xfs: use iomap_valid method to detect stale cached iomaps") that fixes write racing with reclaim writeback. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_error.c | 3 +++ fs/xfs/xfs_iomap.c | 14 ++++++++++++-- fs/xfs/xfs_trace.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index f5f629174eca..01a9e86b3037 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -62,7 +62,8 @@ #define XFS_ERRTAG_DA_LEAF_SPLIT 40 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 #define XFS_ERRTAG_WB_DELAY_MS 42 -#define XFS_ERRTAG_MAX 43 +#define XFS_ERRTAG_WRITE_DELAY_MS 43 +#define XFS_ERRTAG_MAX 44 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
@@ -109,5 +110,6 @@ #define XFS_RANDOM_DA_LEAF_SPLIT 1 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #define XFS_RANDOM_WB_DELAY_MS 3000 +#define XFS_RANDOM_WRITE_DELAY_MS 3000 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 2d6e3c718e03..713341d246d1 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -61,6 +61,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, XFS_RANDOM_WB_DELAY_MS, + XFS_RANDOM_WRITE_DELAY_MS, }; struct xfs_errortag_attr { @@ -177,6 +178,7 @@ XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); +XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -221,6 +223,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), + XFS_ERRORTAG_ATTR_LIST(write_delay_ms), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1bdd7afc1010..1005f1e36545 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,6 +27,8 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_error.h" +#include "xfs_errortag.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -71,8 +73,16 @@ xfs_iomap_valid( struct inode *inode, const struct iomap *iomap) { - return iomap->validity_cookie == - xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); + struct xfs_inode *ip = XFS_I(inode); + + if (iomap->validity_cookie != + xfs_iomap_inode_sequence(ip, iomap->flags)) { + trace_xfs_iomap_invalid(ip, iomap); + return false; + } + + XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS); + return true; } const struct iomap_page_ops xfs_iomap_page_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c9ada9577a4a..421d1e504ac4 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3396,6 +3396,48 @@ DEFINE_EVENT(xfs_wb_invalid_class, name, \ DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); +DECLARE_EVENT_CLASS(xfs_iomap_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), + TP_ARGS(ip, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u64, validity_cookie) + __field(u64, inodeseq) + __field(u16, type) + __field(u16, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->validity_cookie = iomap->validity_cookie; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->validity_cookie, + __entry->inodeseq) +); +#define DEFINE_IOMAP_INVALID_EVENT(name) \ 
+DEFINE_EVENT(xfs_iomap_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \ + TP_ARGS(ip, iomap)) +DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid); + /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ -- cgit From ca57f02295f188d6c65ec02202402979880fa6d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 28 Nov 2022 22:02:56 +0000 Subject: afs: Fix fileserver probe RTT handling The fileserver probing code attempts to work out the best fileserver to use for a volume by retrieving the RTT calculated by AF_RXRPC for the probe call sent to each server and comparing them. Sometimes, however, no RTT estimate is available and rxrpc_kernel_get_srtt() returns false, leading good fileservers to be given an RTT of UINT_MAX and thus causing the rotation algorithm to ignore them. Fix afs_select_fileserver() to ignore rxrpc_kernel_get_srtt()'s return value and just take the estimated RTT it provides - which will be capped at 1 second. Fixes: 1d4adfaf6574 ("rxrpc: Make rxrpc_kernel_get_srtt() indicate validity") Signed-off-by: David Howells Reviewed-by: Marc Dionne Tested-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/166965503999.3392585.13954054113218099395.stgit@warthog.procyon.org.uk/ Signed-off-by: Linus Torvalds --- fs/afs/fs_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index c0031a3ab42f..3ac5fcf98d0d 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -167,8 +167,8 @@ responded: clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); } - if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && - rtt_us < server->probe.rtt) { + rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); + if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; -- cgit From 98dc08bae6780bb950b5c0cdefeb662b22482655 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Nov 2022 23:04:01 -0800 Subject: fsverity: stop using PG_error to track error status As a step towards freeing the PG_error flag for other uses, change ext4 and f2fs to stop using PG_error to track verity errors. Instead, if a verity error occurs, just mark the whole bio as failed. The coarser granularity isn't really a problem since it isn't any worse than what the block layer provides, and errors from a multi-page readahead aren't reported to applications unless a single-page read fails too. f2fs supports compression, which makes the f2fs changes a bit more complicated than desired, but the basic premise still works. Note: there are still a few uses of PageError in f2fs, but they are on the write path, so they are unrelated and this patch doesn't touch them. Reviewed-by: Chao Yu Acked-by: Jaegeuk Kim Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221129070401.156114-1-ebiggers@kernel.org --- fs/ext4/readpage.c | 8 ++----- fs/f2fs/compress.c | 64 ++++++++++++++++++++++++++---------------------------- fs/f2fs/data.c | 53 +++++++++++++++++++++++++++----------------- fs/verity/verify.c | 12 +++++----- 4 files changed, 72 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3d21eae267fc..e604ea4e102b 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -75,14 +75,10 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; - /* PG_error was set if verity failed. 
*/ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } unlock_page(page); } if (bio->bi_private) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d315c2de136f..2b7a5cc4ed66 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1711,50 +1711,27 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) } } -/* - * Update and unlock the cluster's pagecache pages, and release the reference to - * the decompress_io_ctx that was being held for I/O completion. - */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, - bool in_task) +static void f2fs_verify_cluster(struct work_struct *work) { + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); int i; + /* Verify, update, and unlock the decompressed pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; if (!rpage) continue; - /* PG_error was set if verity failed. */ - if (failed || PageError(rpage)) { - ClearPageUptodate(rpage); - /* will re-read again later */ - ClearPageError(rpage); - } else { + if (fsverity_verify_page(rpage)) SetPageUptodate(rpage); - } + else + ClearPageUptodate(rpage); unlock_page(rpage); } - f2fs_put_dic(dic, in_task); -} - -static void f2fs_verify_cluster(struct work_struct *work) -{ - struct decompress_io_ctx *dic = - container_of(work, struct decompress_io_ctx, verity_work); - int i; - - /* Verify the cluster's decompressed pages with fs-verity. */ - for (i = 0; i < dic->cluster_size; i++) { - struct page *rpage = dic->rpages[i]; - - if (rpage && !fsverity_verify_page(rpage)) - SetPageError(rpage); - } - - __f2fs_decompress_end_io(dic, false, true); + f2fs_put_dic(dic, true); } /* @@ -1764,6 +1741,8 @@ static void f2fs_verify_cluster(struct work_struct *work) void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task) { + int i; + if (!failed && dic->need_verity) { /* * Note that to avoid deadlocks, the verity work can't be done @@ -1773,9 +1752,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, */ INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); - } else { - __f2fs_decompress_end_io(dic, failed, in_task); + return; + } + + /* Update and unlock the cluster's pagecache pages. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + if (failed) + ClearPageUptodate(rpage); + else + SetPageUptodate(rpage); + unlock_page(rpage); } + + /* + * Release the reference to the decompress_io_ctx that was being held + * for I/O completion. + */ + f2fs_put_dic(dic, in_task); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a71e818cd67b..7af75041bd81 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -116,43 +116,56 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + /* + * decompression_attempted keeps track of whether + * f2fs_end_read_compressed_page() has been called on the pages in the + * bio that belong to a compressed cluster yet. + */ + bool decompression_attempted; block_t fs_blkaddr; }; +/* + * Update and unlock a bio's pages, and free the bio. + * + * This marks pages up-to-date only if there was no error in the bio (I/O error, + * decryption error, or verity error), as indicated by bio->bi_status. 
+ * + * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) + * aren't marked up-to-date here, as decompression is done on a per-compression- + * cluster basis rather than a per-bio basis. Instead, we only must do two + * things for each compressed page here: call f2fs_end_read_compressed_page() + * with failed=true if an error occurred before it would have normally gotten + * called (i.e., I/O error or decryption error, but *not* verity error), and + * release the bio's reference to the decompress_io_ctx of the page's cluster. + */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; + struct bio_post_read_ctx *ctx = bio->bi_private; - /* - * Update and unlock the bio's pagecache pages, and put the - * decompression context for any compressed pages. - */ bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) { - if (bio->bi_status) + if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(page, true, 0, in_task); f2fs_put_page_dic(page, in_task); continue; } - /* PG_error was set if verity failed. */ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); + if (ctx) + mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } @@ -185,8 +198,10 @@ static void f2fs_verify_bio(struct work_struct *work) struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && - !fsverity_verify_page(page)) - SetPageError(page); + !fsverity_verify_page(page)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } } else { fsverity_verify_bio(bio); @@ -245,6 +260,8 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, blkaddr++; } + ctx->decompression_attempted = true; + /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully @@ -1062,6 +1079,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; + ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); @@ -1089,7 +1107,6 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, bio_put(bio); return -EFAULT; } - ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); @@ -2141,7 +2158,6 @@ submit_and_realloc: inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); *last_block_in_bio = block_nr; goto out; out: @@ -2289,7 +2305,6 @@ submit_and_realloc: inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); *last_block_in_bio = blkaddr; } @@ -2306,7 +2321,6 @@ out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); - ClearPageError(cc->rpages[i]); unlock_page(cc->rpages[i]); } } @@ -2403,7 +2417,6 @@ read_single_page: #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif - SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); 
unlock_page(page); } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index bde8c9b7d25f..961ba248021f 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -200,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. Pages - * that fail verification are set to the Error state. Verification is skipped - * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * must be pagecache pages that are still locked and not yet uptodate. If a + * page fails verification, then bio->bi_status is set to an error status. * * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. Filesystems that @@ -244,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio) unsigned long level0_ra_pages = min(max_ra_pages, params->level0_blocks - level0_index); - if (!PageError(page) && - !verify_page(inode, vi, req, page, level0_ra_pages)) - SetPageError(page); + if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } fsverity_free_hash_request(params->hash_alg, req); -- cgit From b0284cd29a957e62d60c2886fd663be93c56f9c0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Nov 2022 18:10:34 -0700 Subject: mm: Do not enable PG_arch_2 for all 64-bit architectures Commit 4beba9486abd ("mm: Add PG_arch_2 page flag") introduced a new page flag for all 64-bit architectures. However, even if an architecture is 64-bit, it may still have limited spare bits in the 'flags' member of 'struct page'. This may happen if an architecture enables SPARSEMEM without SPARSEMEM_VMEMMAP as is the case with the newly added loongarch. This architecture port needs 19 more bits for the sparsemem section information and, while it is currently fine with PG_arch_2, adding any more PG_arch_* flags will trigger build-time warnings. Add a new CONFIG_ARCH_USES_PG_ARCH_X option which can be selected by architectures that need more PG_arch_* flags beyond PG_arch_1. Select it on arm64. Signed-off-by: Catalin Marinas [pcc@google.com: fix build with CONFIG_ARM64_MTE disabled] Signed-off-by: Peter Collingbourne Reported-by: kernel test robot Cc: Andrew Morton Cc: Steven Price Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-2-pcc@google.com --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index f2273b164535..882525c8e94c 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -219,7 +219,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); #endif -- cgit From ef6458b1b6ca3fdb991ce4182e981a88d4c58c0f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 3 Nov 2022 18:10:37 -0700 Subject: mm: Add PG_arch_3 page flag As with PG_arch_2, this flag is only allowed on 64-bit architectures due to the shortage of bits available. It will be used by the arm64 MTE code in subsequent patches. 
Signed-off-by: Peter Collingbourne Cc: Will Deacon Cc: Marc Zyngier Cc: Steven Price [catalin.marinas@arm.com: added flag preserving in __split_huge_page_tail()] Signed-off-by: Catalin Marinas Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-5-pcc@google.com --- fs/proc/page.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 882525c8e94c..6249c347809a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -221,6 +221,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); #ifdef CONFIG_ARCH_USES_PG_ARCH_X u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); + u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif return u; -- cgit From 318cdc822c63b6e2befcfdc2088378ae6fa18def Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Jun 2022 19:26:47 +0800 Subject: ext4: check and assert if marking a no_delete evicting inode dirty In ext4_evict_inode(), if we are evicting an inode in the 'no_delete' path, it must not be raced by another mark_inode_dirty(). If that happens, someone else may accidentally dirty it without holding the inode refcount and probably cause use-after-free issues in the writeback procedure. This is hard to discover and debug, so add a WARN_ON_ONCE() to check and detect this issue in advance. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220629112647.4141034-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c5f5dabe0fd..121ce2fccfab 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -335,6 +335,12 @@ stop_handle: ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: + /* + * Check out some where else accidentally dirty the evicting inode, + * which may probably cause inode use-after-free issues later. + */ + WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); + if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ -- cgit From 66267814ba0ee0732c69ca87eb1fd6eb63bf0d5f Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Wed, 17 Aug 2022 10:59:28 +0800 Subject: fs/ext4: replace ternary operator with min()/max() and min_t() Fix the following coccicheck warning: fs/ext4/inline.c:183: WARNING opportunity for min(). fs/ext4/extents.c:2631: WARNING opportunity for max(). fs/ext4/extents.c:2632: WARNING opportunity for min(). fs/ext4/extents.c:5559: WARNING opportunity for max(). fs/ext4/super.c:6908: WARNING opportunity for min(). The min()/max() and min_t() macros are defined in include/linux/minmax.h. They avoid multiple evaluations of the arguments when non-constant and perform strict type-checking.
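To see why the strict type-checking matters, here is a standalone sketch with a simplified stand-in for the kernel's min_t(); the macro is only an approximation of include/linux/minmax.h, for illustration:

    #include <stdio.h>

    /* Simplified stand-in for the kernel's min_t(); illustrative only. */
    #define min_t(type, x, y) ({ type __x = (x); type __y = (y); \
                                 __x < __y ? __x : __y; })

    int main(void)
    {
        int a = -1;
        unsigned int b = 1;

        /* The usual arithmetic conversions turn -1 into UINT_MAX, so an
         * open-coded ternary silently picks 1. */
        printf("ternary: %u\n", a < b ? a : b);

        /* min_t() pins the comparison type explicitly and yields -1; the
         * kernel's plain min()/max() would flag the mixed types at build
         * time instead of converting silently. */
        printf("min_t:   %d\n", min_t(int, a, b));
        return 0;
    }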
Reported-by: kernel test robot Suggested-by: Lukas Czerner Signed-off-by: Jiangshan Yi Reviewed-by: Lukas Czerner Link: https://lore.kernel.org/r/20220817025928.612851-1-13667453960@163.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 8 +++----- fs/ext4/inline.c | 3 +-- fs/ext4/super.c | 3 +-- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6c399a8b22b3..ec008278d970 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2635,9 +2635,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unwritten, ex_ee_len); path[depth].p_ext = ex; - a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; + a = max(ex_ee_block, start); + b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); @@ -5567,8 +5566,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, - len_lblk, SHIFT_RIGHT); + max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4fbe825694b..2b42ececa46d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -180,8 +180,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer, BUG_ON(len > EXT4_I(inode)->i_inline_size); - cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? - len : EXT4_MIN_INLINE_DATA_SIZE; + cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE); raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7cdd2138c897..4749b303f2a9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -7031,8 +7031,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); -- cgit From a4bbf53d88c728da9ff6c316b1e4ded63a8f3940 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Nov 2022 20:51:39 -0800 Subject: fsverity: simplify fsverity_get_digest() Instead of looking up the algorithm by name in hash_algo_name[] to get its hash_algo ID, just store the hash_algo ID in the fsverity_hash_alg struct. Verify at boot time that every fsverity_hash_alg has a valid hash_algo ID with matching digest size. Remove an unnecessary memset() of the whole digest array to 0 before the digest is copied into it. Finally, remove the pr_debug statement. There is already a pr_debug for the fsverity digest when the file is opened. Signed-off-by: Eric Biggers Reviewed-by: Mimi Zohar Link: https://lore.kernel.org/r/20221129045139.69803-1-ebiggers@kernel.org --- fs/verity/fsverity_private.h | 5 +++++ fs/verity/hash_algs.c | 6 ++++++ fs/verity/measure.c | 19 ++----------------- 3 files changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index dbe1ce5b450a..c7fcb855e068 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -32,6 +32,11 @@ struct fsverity_hash_alg { unsigned int digest_size; /* digest size in bytes, e.g. 
32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ mempool_t req_pool; /* mempool with a preallocated hash request */ + /* + * The HASH_ALGO_* constant for this algorithm. This is different from + * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. + */ + enum hash_algo algo_id; }; /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 71d0fccb6d4c..6f8170cf4ae7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { .name = "sha256", .digest_size = SHA256_DIGEST_SIZE, .block_size = SHA256_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA256, }, [FS_VERITY_HASH_ALG_SHA512] = { .name = "sha512", .digest_size = SHA512_DIGEST_SIZE, .block_size = SHA512_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA512, }, }; @@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void) */ BUG_ON(!is_power_of_2(alg->digest_size)); BUG_ON(!is_power_of_2(alg->block_size)); + + /* Verify that there is a valid mapping to HASH_ALGO_*. */ + BUG_ON(alg->algo_id == 0); + BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]); } } diff --git a/fs/verity/measure.c b/fs/verity/measure.c index e99c00350c28..5c79ea1b2468 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); * @alg: (out) pointer to the hash algorithm enumeration * * Return the file hash algorithm and digest of an fsverity protected file. - * Assumption: before calling fsverity_get_digest(), the file must have been - * opened. + * Assumption: before calling this, the file must have been opened. * * Return: 0 on success, -errno on failure */ @@ -76,27 +75,13 @@ int fsverity_get_digest(struct inode *inode, { const struct fsverity_info *vi; const struct fsverity_hash_alg *hash_alg; - int i; vi = fsverity_get_info(inode); if (!vi) return -ENODATA; /* not a verity file */ hash_alg = vi->tree_params.hash_alg; - memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE); - - /* convert the verity hash algorithm name to a hash_algo_name enum */ - i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name); - if (i < 0) - return -EINVAL; - *alg = i; - - if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg])) - return -EINVAL; memcpy(digest, vi->file_digest, hash_alg->digest_size); - - pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg], - hash_digest_size[*alg], digest); - + *alg = hash_alg->algo_id; return 0; } -- cgit From ab1ddef98a715eddb65309ffa83267e4e84a571e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 14 Nov 2022 08:33:09 -0500 Subject: filelock: new helper: vfs_inode_has_locks Ceph has a need to know whether a particular inode has any locks set on it. It's currently tracking that by a num_locks field in its filp->private_data, but that's problematic as it tries to decrement this field when releasing locks and that can race with the file being torn down. Add a new vfs_inode_has_locks helper that just returns whether any locks are currently held on the inode. 
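A sketch of the kind of caller this enables — a hypothetical filesystem release path that no longer needs a private lock counter (the function below is illustrative and not part of the ceph conversion):

    /* Hypothetical kernel-style caller; names are illustrative. */
    static void example_fs_release(struct inode *inode)
    {
        /*
         * No private num_locks counter to maintain: the VFS answers
         * from inode->i_flctx under ctx->flc_lock, so this cannot race
         * with file teardown the way a manually kept counter could.
         */
        if (vfs_inode_has_locks(inode))
            pr_debug("inode %lu still has POSIX/flock locks\n",
                     inode->i_ino);
    }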
Reviewed-by: Xiubo Li Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/locks.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 5876c8ff0edc..9ccf89b6c95d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2672,6 +2672,29 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) } EXPORT_SYMBOL_GPL(vfs_cancel_lock); +/** + * vfs_inode_has_locks - are any file locks held on @inode? + * @inode: inode to check for locks + * + * Return true if there are any FL_POSIX or FL_FLOCK locks currently + * set on @inode. + */ +bool vfs_inode_has_locks(struct inode *inode) +{ + struct file_lock_context *ctx; + bool ret; + + ctx = smp_load_acquire(&inode->i_flctx); + if (!ctx) + return false; + + spin_lock(&ctx->flc_lock); + ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock); + spin_unlock(&ctx->flc_lock); + return ret; +} +EXPORT_SYMBOL_GPL(vfs_inode_has_locks); + #ifdef CONFIG_PROC_FS #include #include -- cgit From 401a8b8fd5acd51582b15238d72a8d0edd580e9f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:02:30 -0500 Subject: filelock: add a new locks_inode_context accessor function There are a number of places in the kernel that are accessing the inode->i_flctx field without smp_load_acquire. This is required to ensure that the caller doesn't see a partially-initialized structure. Add a new accessor function for it to make this clear and convert all of the relevant accesses in locks.c to use it. Also, convert locks_free_lock_context to use the helper as well instead of just doing a "bare" assignment. Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/locks.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 9ccf89b6c95d..8e48b3d5afe3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -175,7 +175,7 @@ locks_get_lock_context(struct inode *inode, int type) struct file_lock_context *ctx; /* paired with cmpxchg() below */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -194,7 +194,7 @@ locks_get_lock_context(struct inode *inode, int type) */ if (cmpxchg(&inode->i_flctx, NULL, ctx)) { kmem_cache_free(flctx_cache, ctx); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); } out: trace_locks_get_lock_context(inode, type, ctx); @@ -247,7 +247,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, void locks_free_lock_context(struct inode *inode) { - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); if (unlikely(ctx)) { locks_check_ctx_lists(inode); @@ -891,7 +891,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) void *owner; void (*func)(void); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->fl_type = F_UNLCK; return; @@ -1483,7 +1483,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl->fl_flags = type; /* typically we will check that ctx is non-NULL before calling */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { WARN_ON_ONCE(1); goto free_lock; @@ -1588,7 +1588,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) struct file_lock_context *ctx; struct file_lock *fl; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = 
locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); fl = list_first_entry_or_null(&ctx->flc_lease, @@ -1634,7 +1634,7 @@ int fcntl_getlease(struct file *filp) int type = F_UNLCK; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); @@ -1823,7 +1823,7 @@ static int generic_delete_lease(struct file *filp, void *owner) struct file_lock_context *ctx; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; @@ -2563,7 +2563,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2636,7 +2636,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); + ctx = locks_inode_context(locks_inode(filp)); if (!ctx) return; @@ -2684,7 +2684,7 @@ bool vfs_inode_has_locks(struct inode *inode) struct file_lock_context *ctx; bool ret; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) return false; @@ -2865,7 +2865,7 @@ void show_fd_locks(struct seq_file *f, struct file_lock_context *ctx; int id = 0; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) return; -- cgit From d4e78663f6bc83db44041f224e58e0940662a912 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 08:44:25 -0500 Subject: ceph: use locks_inode_context helper ceph currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). Reviewed-by: Xiubo Li Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/ceph/locks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 3e2843e86e27..f3b461c708a8 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -364,7 +364,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *fcntl_count = 0; *flock_count = 0; - ctx = inode->i_flctx; + ctx = locks_inode_context(inode); if (ctx) { spin_lock(&ctx->flc_lock); list_for_each_entry(lock, &ctx->flc_posix, fl_list) @@ -418,7 +418,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); int err = 0; int seen_fcntl = 0; int seen_flock = 0; -- cgit From a1fde8ee771f92d2a8bbd79532149ddd34546bc1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 08:49:25 -0500 Subject: cifs: use locks_inode_context helper cifs currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). 
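The memory-ordering contract behind this whole run of conversions, spelled out once: locks_get_lock_context() publishes the context with cmpxchg(), a release operation, so every reader must pair it with an acquire load. A condensed sketch of that pairing; the helper body is assumed to match the accessor introduced earlier, see include/linux/fs.h for the real definition:

    /*
     * Writer (locks_get_lock_context(), abridged): the cmpxchg() release
     * orders all stores that initialized *ctx before the pointer becomes
     * visible to other CPUs:
     *
     *     if (cmpxchg(&inode->i_flctx, NULL, ctx)) { ...lost the race... }
     *
     * Reader: the acquire below pairs with that release, so a non-NULL
     * return is always a fully initialized context.
     */
    static inline struct file_lock_context *
    locks_inode_context(const struct inode *inode)
    {
        return smp_load_acquire(&inode->i_flctx);
    }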
Cc: Steve French Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd9698209930..6c1431979495 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1413,7 +1413,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) struct inode *inode = d_inode(cfile->dentry); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); unsigned int count = 0, i; int rc = 0, xid, type; struct list_head locks_to_send, *el; -- cgit From 87f00aba211ef7308fd6c5d47d646a70bf196662 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 08:50:52 -0500 Subject: ksmbd: use locks_inode_context helper ksmbd currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). Cc: Steve French Acked-by: Namjae Jeon Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/ksmbd/vfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 8de970d6146f..f9e85d6a160e 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -321,7 +321,7 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end, unsigned char type) { struct file_lock *flock; - struct file_lock_context *ctx = file_inode(filp)->i_flctx; + struct file_lock_context *ctx = locks_inode_context(file_inode(filp)); int error = 0; if (!ctx || list_empty_careful(&ctx->flc_posix)) -- cgit From 98b41ffe0afdfeaa1439a5d6bd2db4a94277e31b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:19:43 -0500 Subject: lockd: use locks_inode_context helper lockd currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). Cc: Trond Myklebust Cc: Anna Schumaker Cc: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/lockd/svcsubs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e1c4617de771..720684345817 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -207,7 +207,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct nlm_host *lockhost; if (!flctx || list_empty_careful(&flctx->flc_posix)) @@ -262,7 +262,7 @@ nlm_file_inuse(struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; -- cgit From 17b985def2a859d66d27afee442147468a6a4ea6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:55:36 -0500 Subject: nfs: use locks_inode_context helper nfs currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). 
Cc: Trond Myklebust Cc: Anna Schumaker Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/nfs/delegation.c | 2 +- fs/nfs/nfs4state.c | 2 +- fs/nfs/pagelist.c | 2 +- fs/nfs/write.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index ead8a0e06abf..cf7365581031 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -146,7 +146,7 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state { struct inode *inode = state->inode; struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; int status = 0; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a2d2d5d1b088..dd18344648f3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1501,7 +1501,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct file_lock *fl; struct nfs4_lock_state *lsp; int status = 0; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; if (flctx == NULL) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 317cedfa52bf..16be6dae524f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1055,7 +1055,7 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev))) return 0; - flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx; + flctx = locks_inode_context(d_inode(nfs_req_openctx(req)->dentry)); if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f41d24b54fd1..80c240e50952 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1185,7 +1185,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; - struct file_lock_context *flctx = file_inode(file)->i_flctx; + struct file_lock_context *flctx = locks_inode_context(file_inode(file)); struct nfs_page *req; int do_flush, status; /* @@ -1321,7 +1321,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode, unsigned int pagelen) { int ret; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct file_lock *fl; if (file->f_flags & O_DSYNC) -- cgit From 77c67530e1f95ac25c7075635f32f04367380894 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:36:07 -0500 Subject: nfsd: use locks_inode_context helper nfsd currently doesn't access i_flctx safely everywhere. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). 
Acked-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/nfsd/nfs4state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 836bd825ca4a..da8d0ea66229 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4758,7 +4758,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) static bool nfsd4_deleg_present(const struct inode *inode) { - struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx); + struct file_lock_context *ctx = locks_inode_context(inode); return ctx && !list_empty_careful(&ctx->flc_lease); } @@ -5897,7 +5897,7 @@ nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { nf = stp->st_stid.sc_file; - ctx = nf->fi_inode->i_flctx; + ctx = locks_inode_context(nf->fi_inode); if (!ctx) continue; if (locks_owner_has_blockers(ctx, lo)) @@ -7713,7 +7713,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } inode = locks_inode(nf->nf_file); - flctx = inode->i_flctx; + flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); -- cgit From f2f2494c8aa3cc317572c4674ef256005ebc092b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 18 Nov 2022 15:43:57 -0800 Subject: Add process name and pid to locks warning It's fairly useless to complain about using an obsolete feature without telling the user which process used it. My Fedora desktop randomly drops this message, but I would really need this patch to figure out what triggers it. [ jlayton: print pid as well as process name ] Signed-off-by: Andi Kleen Signed-off-by: Jeff Layton --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 8e48b3d5afe3..8f01bee17715 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2096,7 +2096,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) * throw a warning to let people know that they don't actually work. */ if (cmd & LOCK_MAND) { - pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); + pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid); return 0; } -- cgit From a79168a0c00d710420c1758f6c38df89e12f0763 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 30 Nov 2022 08:41:01 -0800 Subject: fs/remap_range: avoid spurious writeback on zero length request generic_remap_checks() can reduce the effective request length (i.e., after the reflink extend to EOF case is handled) down to zero. If this occurs, __generic_remap_file_range_prep() proceeds through dio serialization, file mapping flush calls, and may invoke file_modified() before returning to the filesystem caller, all of which immediately check for len == 0 and return. While this is mostly harmless, it is spurious and not completely without side effect. A filemap write call can submit I/O (but not wait on it) when the specified end byte precedes the start but happens to land on the same aligned page boundary, which can occur from __generic_remap_file_range_prep() when len is 0. The dedupe path already has a len == 0 check to break out before doing range comparisons. Lift this check a bit earlier in the function to cover the general case of len == 0 and avoid the unnecessary work; a worked example of the page-boundary side effect is sketched below.
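A worked sketch of that page-boundary side effect (plain userspace arithmetic; PAGE_SHIFT and the comparisons only approximate the kernel's range handling):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        /* len == 0 makes the inclusive end byte precede the start... */
        uint64_t pos = 4100, len = 0;
        uint64_t start = pos, end = pos + len - 1;    /* 4099 */

        /* ...yet both bytes sit in page 1, so the write side, which
         * thinks in page indexes, still queues writeback for the page. */
        printf("write side: page %llu..%llu\n",
               (unsigned long long)(start >> PAGE_SHIFT),
               (unsigned long long)(end >> PAGE_SHIFT));

        /* The wait side compares bytes and returns early, so the I/O is
         * submitted but never waited on. */
        if (end < start)
            printf("wait side: end_byte < start_byte, skipped\n");
        return 0;
    }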
While here, account for the case where generic_remap_check_len() may also reduce length to zero. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/remap_range.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/remap_range.c b/fs/remap_range.c index 654912d06862..8741ecd616a7 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -304,7 +304,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret) + if (ret || *len == 0) return ret; /* Wait for the completion of any pending IOs on both files */ @@ -328,9 +328,6 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - if (*len == 0) - return 0; - if (!IS_DAX(inode_in)) ret = vfs_dedupe_file_range_compare(file_in, pos_in, file_out, pos_out, *len, &is_same); @@ -348,7 +345,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, remap_flags); - if (ret) + if (ret || *len == 0) return ret; /* If can't alter the file contents, we're done. */ -- cgit From 032e160305f6872e590c77f11896fb28365c6d6c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: xfs: invalidate block device page cache during unmount Every now and then I see fstests failures on aarch64 (64k pages) that trigger on the following sequence: mkfs.xfs $dev mount $dev $mnt touch $mnt/a umount $mnt xfs_db -c 'path /a' -c 'print' $dev 99% of the time this succeeds, but every now and then xfs_db cannot find /a and fails. This turns out to be a race involving udev/blkid, the page cache for the block device, and the xfs_db process. udev is triggered whenever anyone closes a block device or unmounts it. The default udev rules invoke blkid to read the fs super and create symlinks to the bdev under /dev/disk. For this, it uses buffered reads through the page cache. xfs_db also uses buffered reads to examine metadata. There is no coordination between xfs_db and udev, which means that they can run concurrently. Note there is no coordination between the kernel and blkid either. On a system with 64k pages, the page cache can cache the superblock and the root inode (and hence the root dir) with the same 64k page. If udev spawns blkid after the mkfs and the system is busy enough that it is still running when xfs_db starts up, they'll both read from the same page in the pagecache. The unmount writes updated inode metadata to disk directly. The XFS buffer cache does not use the bdev pagecache, nor does it invalidate the pagecache on umount. If the above scenario occurs, the pagecache no longer reflects what's on disk, xfs_db reads the stale metadata, and fails to find /a. Most of the time this succeeds because closing a bdev invalidates the page cache, but when processes race, everyone loses. Fix the problem by invalidating the bdev pagecache after flushing the bdev, so that xfs_db will see up to date metadata. Signed-off-by: Darrick J. 
Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_buf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index dde346450952..54c774af6e1c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1945,6 +1945,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); + invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); kmem_free(btp); -- cgit From fd5beaff250d7e88912a937fad072d9d24f219da Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: xfs: use memcpy, not strncpy, to format the attr prefix during listxattr When -Wstringop-truncation is enabled, the compiler complains about truncation of the null byte at the end of the xattr name prefix. This is intentional, since we're concatenating the two strings together and do _not_ want a null byte in the middle of the name. We've already ensured that the name buffer is long enough to handle prefix and name, and the prefix_len is supposed to be the length of the prefix string without the null byte, so use memcpy here instead. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index c325a28b89a8..10aa1fd39d2b 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -210,7 +210,7 @@ __xfs_xattr_put_listent( return; } offset = context->buffer + context->count; - strncpy(offset, prefix, prefix_len); + memcpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; -- cgit From e5827a007aa4bb737c63121fd2c77e089b18a372 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: xfs: shut up -Wuninitialized in xfsaild_push -Wuninitialized complains about @target in xfsaild_push being uninitialized in the case where the waitqueue is active but there is no last item in the AIL to wait for. I /think/ it should never be the case that the subsequent xfs_trans_ail_cursor_first returns a log item and hence we'll never end up at XFS_LSN_CMP, but let's make this explicit. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_trans_ail.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index f51df7d94ef7..7d4109af193e 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -422,7 +422,7 @@ xfsaild_push( struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; - xfs_lsn_t target; + xfs_lsn_t target = NULLCOMMITLSN; long tout; int stuck = 0; int flushing = 0; @@ -472,6 +472,8 @@ xfsaild_push( XFS_STATS_INC(mp, xs_push_ail); + ASSERT(target != NULLCOMMITLSN); + lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; -- cgit From 4c6dbfd2756bd83a0085ed804e2bb7be9cc16bc5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:43 -0800 Subject: xfs: attach dquots to inode before reading data/cow fork mappings I've been running near-continuous integration testing of online fsck, and I've noticed that once a day, one of the ARM VMs will fail the test with out of order records in the data fork. xfs/804 races fsstress with online scrub (aka scan but do not change anything), so I think this might be a bug in the core xfs code. 
This also only seems to trigger if one runs the test for more than ~6 minutes via TIME_FACTOR=13 or something. https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/tree/tests/xfs/804?h=djwong-wtf I added a debugging patch to the kernel to check the data fork extents after taking the ILOCK, before dropping ILOCK, and before and after each bmapping operation. So far I've narrowed it down to the delalloc code inserting a record in the wrong place in the iext tree: xfs_bmap_add_extent_hole_delay, near line 2691: case 0: /* * New allocation is not contiguous with another * delayed allocation. * Insert a new entry. */ oldlen = newlen = 0; xfs_iunlock_check_datafork(ip); <-- ok here xfs_iext_insert(ip, icur, new, state); xfs_iunlock_check_datafork(ip); <-- bad here break; } I recorded the state of the data fork mappings and iext cursor state when a corrupt data fork is detected immediately after the xfs_bmap_add_extent_hole_delay call in xfs_bmapi_reserve_delalloc: ino 0x140bb3 func xfs_bmapi_reserve_delalloc line 4164 data fork: ino 0x140bb3 nr 0x0 nr_real 0x0 offset 0xb9 blockcount 0x1f startblock 0x935de2 state 1 ino 0x140bb3 nr 0x1 nr_real 0x1 offset 0xe6 blockcount 0xa startblock 0xffffffffe0007 state 0 ino 0x140bb3 nr 0x2 nr_real 0x1 offset 0xd8 blockcount 0xe startblock 0x935e01 state 0 Here we see that a delalloc extent was inserted into the wrong position in the iext leaf, same as all the other times. The extra trace data I collected are as follows: ino 0x140bb3 fork 0 oldoff 0xe6 oldlen 0x4 oldprealloc 0x6 isize 0xe6000 ino 0x140bb3 oldgotoff 0xea oldgotstart 0xfffffffffffffffe oldgotcount 0x0 oldgotstate 0 ino 0x140bb3 crapgotoff 0x0 crapgotstart 0x0 crapgotcount 0x0 crapgotstate 0 ino 0x140bb3 freshgotoff 0xd8 freshgotstart 0x935e01 freshgotcount 0xe freshgotstate 0 ino 0x140bb3 nowgotoff 0xe6 nowgotstart 0xffffffffe0007 nowgotcount 0xa nowgotstate 0 ino 0x140bb3 oldicurpos 1 oldleafnr 2 oldleaf 0xfffffc00f0609a00 ino 0x140bb3 crapicurpos 2 crapleafnr 2 crapleaf 0xfffffc00f0609a00 ino 0x140bb3 freshicurpos 1 freshleafnr 2 freshleaf 0xfffffc00f0609a00 ino 0x140bb3 newicurpos 1 newleafnr 3 newleaf 0xfffffc00f0609a00 The first line shows that xfs_bmapi_reserve_delalloc was called with whichfork=XFS_DATA_FORK, off=0xe6, len=0x4, prealloc=6. The second line ("oldgot") shows the contents of @got at the beginning of the call, which are the results of the first iext lookup in xfs_buffered_write_iomap_begin. Line 3 ("crapgot") is the result of duplicating the cursor at the start of the body of xfs_bmapi_reserve_delalloc and performing a fresh lookup at @off. Line 4 ("freshgot") is the result of a new xfs_iext_get_extent right before the call to xfs_bmap_add_extent_hole_delay. Totally garbage. Line 5 ("nowgot") is contents of @got after the xfs_bmap_add_extent_hole_delay call. Line 6 is the contents of @icur at the beginning fo the call. Lines 7-9 are the contents of the iext cursors at the point where the block mappings were sampled. I think @oldgot is a HOLESTARTBLOCK extent because the first lookup didn't find anything, so we filled in imap with "fake hole until the end". At the time of the first lookup, I suspect that there's only one 32-block unwritten extent in the mapping (hence oldicurpos==1) but by the time we get to recording crapgot, crapicurpos==2. Dave then added: Ok, that's much simpler to reason about, and implies the smoke is coming from xfs_buffered_write_iomap_begin() or xfs_bmapi_reserve_delalloc(). 
I suspect the former - it does a lot of stuff with the ILOCK_EXCL held..... .... including calling xfs_qm_dqattach_locked(). xfs_buffered_write_iomap_begin ILOCK_EXCL look up icur xfs_qm_dqattach_locked xfs_qm_dqattach_one xfs_qm_dqget_inode dquot cache miss xfs_iunlock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); xfs_ilock(ip, XFS_ILOCK_EXCL); .... xfs_bmapi_reserve_delalloc(icur) Yup, that's what is letting the magic smoke out - xfs_qm_dqattach_locked() can cycle the ILOCK. If that happens, we can pass a stale icur to xfs_bmapi_reserve_delalloc() and it all goes downhill from there. Back to Darrick now: So. Fix this by moving the dqattach_locked call up before we take the ILOCK, like all the other callers in that file. Fixes: a526c85c2236 ("xfs: move xfs_file_iomap_begin_delay around") # goes further back than this Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1005f1e36545..68436370927d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -978,6 +978,10 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); + error = xfs_qm_dqattach(ip); + if (error) + return error; + error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; @@ -1081,10 +1085,6 @@ xfs_buffered_write_iomap_begin( allocfork = XFS_COW_FORK; } - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_unlock; - if (eof && offset + count > XFS_ISIZE(ip)) { /* * Determine the initial size of the preallocation. -- cgit From 1eb52a6a71981b80f9acbd915acd6a05a5037196 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Wed, 30 Nov 2022 09:25:46 -0800 Subject: xfs: wait iclog complete before tearing down AIL Fix uaf in xfs_trans_ail_delete during xlog force shutdown. In commit cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks") changed the order of running callbacks and wait for iclog completion to avoid unmount path untimely destroy AIL. But which seems not enough to ensue this, adding mdelay in `xfs_buf_item_unpin` can prove that. The reproduction is as follows. To ensure destroy AIL safely, we should wait all xlog ioend workers done and sync the AIL. 
================================================================== BUG: KASAN: use-after-free in xfs_trans_ail_delete+0x240/0x2a0 Read of size 8 at addr ffff888023169400 by task kworker/1:1H/43 CPU: 1 PID: 43 Comm: kworker/1:1H Tainted: G W 6.1.0-rc1-00002-gc28266863c4a #137 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: xfs-log/sda xlog_ioend_work Call Trace: dump_stack_lvl+0x4d/0x66 print_report+0x171/0x4a6 kasan_report+0xb3/0x130 xfs_trans_ail_delete+0x240/0x2a0 xfs_buf_item_done+0x7b/0xa0 xfs_buf_ioend+0x1e9/0x11f0 xfs_buf_item_unpin+0x4c8/0x860 xfs_trans_committed_bulk+0x4c2/0x7c0 xlog_cil_committed+0xab6/0xfb0 xlog_cil_process_committed+0x117/0x1e0 xlog_state_shutdown_callbacks+0x208/0x440 xlog_force_shutdown+0x1b3/0x3a0 xlog_ioend_work+0xef/0x1d0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 Allocated by task 9606: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_kmalloc+0x7a/0x90 __kmalloc+0x59/0x140 kmem_alloc+0xb2/0x2f0 xfs_trans_ail_init+0x20/0x320 xfs_log_mount+0x37e/0x690 xfs_mountfs+0xe36/0x1b40 xfs_fs_fill_super+0xc5c/0x1a70 get_tree_bdev+0x3c5/0x6c0 vfs_get_tree+0x85/0x250 path_mount+0xec3/0x1830 do_mount+0xef/0x110 __x64_sys_mount+0x150/0x1f0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 9662: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 __kasan_slab_free+0x105/0x1a0 __kmem_cache_free+0x99/0x2d0 kvfree+0x3a/0x40 xfs_log_unmount+0x60/0xf0 xfs_unmountfs+0xf3/0x1d0 xfs_fs_put_super+0x78/0x300 generic_shutdown_super+0x151/0x400 kill_block_super+0x9a/0xe0 deactivate_locked_super+0x82/0xe0 deactivate_super+0x91/0xb0 cleanup_mnt+0x32a/0x4a0 task_work_run+0x15f/0x240 exit_to_user_mode_prepare+0x188/0x190 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x42/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd The buggy address belongs to the object at ffff888023169400 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 0 bytes inside of 128-byte region [ffff888023169400, ffff888023169480) The buggy address belongs to the physical page: page:ffffea00008c5a00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888023168f80 pfn:0x23168 head:ffffea00008c5a00 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea00006b3988 ffffea0000577a88 ffff88800f842ac0 raw: ffff888023168f80 0000000000150007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888023169300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888023169380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888023169400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888023169480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888023169500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Disabling lock debugging due to kernel taint Fixes: cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks") Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0141d9907d31..fc61cc024023 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -888,6 +888,23 @@ xlog_force_iclog( return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); } +/* + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ +static void +xlog_wait_iclog_completion(struct xlog *log) +{ + int i; + struct xlog_in_core *iclog = log->l_iclog; + + for (i = 0; i < log->l_iclog_bufs; i++) { + down(&iclog->ic_sema); + up(&iclog->ic_sema); + iclog = iclog->ic_next; + } +} + /* * Wait for the iclog and all prior iclogs to be written disk as required by the * log force state machine. Waiting on ic_force_wait ensures iclog completions @@ -1113,6 +1130,14 @@ xfs_log_unmount( { xfs_log_clean(mp); + /* + * If shutdown has come from iclog IO context, the log + * cleaning will have been skipped and so we need to wait + * for the iclog to complete shutdown processing before we + * tear anything down. + */ + xlog_wait_iclog_completion(mp->m_log); + xfs_buftarg_drain(mp->m_ddev_targp); xfs_trans_ail_destroy(mp); @@ -2115,17 +2140,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - /* - * Cycle all the iclogbuf locks to make sure all log IO completion - * is done before we tear down these buffers. - */ - iclog = log->l_iclog; - for (i = 0; i < log->l_iclog_bufs; i++) { - down(&iclog->ic_sema); - up(&iclog->ic_sema); - iclog = iclog->ic_next; - } - /* * Destroy the CIL after waiting for iclog IO completion because an * iclog EIO error will try to shut down the log, which accesses the -- cgit From 575689fc0ffa6c4bb4e72fd18e31a6525a6124e0 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Wed, 30 Nov 2022 09:25:46 -0800 Subject: xfs: fix super block buf log item UAF during force shutdown An xfs log I/O error triggers an xlog shutdown, and the end_io worker calls xlog_state_shutdown_callbacks() to unpin and release the buf log items. The race condition is that some threads doing transaction commits may happen not to be intercepted by xlog_is_shutdown(); their log items are then inserted into the CIL, and when those buf log items are unpinned and released, a UAF occurs. Adding a delay before `xlog_cil_commit` increases the recurrence probability. The following call graph actually encountered this bad situation. fsstress io end worker kworker/0:1H-216 xlog_ioend_work ->xlog_force_shutdown ->xlog_state_shutdown_callbacks ->xlog_cil_process_committed ->xlog_cil_committed ->xfs_trans_committed_bulk ->xfs_trans_apply_sb_deltas ->li_ops->iop_unpin(lip, 1); ->xfs_trans_getsb ->_xfs_trans_bjoin ->xfs_buf_item_init ->if (bip) { return 0;} //relog ->xlog_cil_commit ->xlog_cil_insert_items //insert into CIL ->xfs_buf_ioend_fail(bp); ->xfs_buf_ioend ->xfs_buf_item_done ->xfs_buf_item_relse ->xfs_buf_item_free When the CIL push worker gathers the per-cpu CIL and inserts the super block buf log item into ctx->log_items, the UAF occurs.
================================================================== BUG: KASAN: use-after-free in xlog_cil_push_work+0x1c8f/0x22f0 Write of size 8 at addr ffff88801800f3f0 by task kworker/u4:4/105 CPU: 0 PID: 105 Comm: kworker/u4:4 Tainted: G W 6.1.0-rc1-00001-g274115149b42 #136 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: xfs-cil/sda xlog_cil_push_work Call Trace: dump_stack_lvl+0x4d/0x66 print_report+0x171/0x4a6 kasan_report+0xb3/0x130 xlog_cil_push_work+0x1c8f/0x22f0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 Allocated by task 2145: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_slab_alloc+0x54/0x60 kmem_cache_alloc+0x14a/0x510 xfs_buf_item_init+0x160/0x6d0 _xfs_trans_bjoin+0x7f/0x2e0 xfs_trans_getsb+0xb6/0x3f0 xfs_trans_apply_sb_deltas+0x1f/0x8c0 __xfs_trans_commit+0xa25/0xe10 xfs_symlink+0xe23/0x1660 xfs_vn_symlink+0x157/0x280 vfs_symlink+0x491/0x790 do_symlinkat+0x128/0x220 __x64_sys_symlink+0x7a/0x90 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 216: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 __kasan_slab_free+0x105/0x1a0 kmem_cache_free+0xb6/0x460 xfs_buf_ioend+0x1e9/0x11f0 xfs_buf_item_unpin+0x3d6/0x840 xfs_trans_committed_bulk+0x4c2/0x7c0 xlog_cil_committed+0xab6/0xfb0 xlog_cil_process_committed+0x117/0x1e0 xlog_state_shutdown_callbacks+0x208/0x440 xlog_force_shutdown+0x1b3/0x3a0 xlog_ioend_work+0xef/0x1d0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff88801800f388 which belongs to the cache xfs_buf_item of size 272 The buggy address is located 104 bytes inside of 272-byte region [ffff88801800f388, ffff88801800f498) The buggy address belongs to the physical page: page:ffffea0000600380 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88801800f208 pfn:0x1800e head:ffffea0000600380 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea0000699788 ffff88801319db50 ffff88800fb50640 raw: ffff88801800f208 000000000015000a 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88801800f280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801800f300: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88801800f380: fc fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88801800f400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801800f480: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Disabling lock debugging due to kernel taint Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_buf_item.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 522d450a94b1..df7322ed73fa 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1018,6 +1018,8 @@ xfs_buf_item_relse( trace_xfs_buf_item_relse(bp, _RET_IP_); ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + if (atomic_read(&bip->bli_refcount)) + return; bp->b_log_item = NULL; xfs_buf_rele(bp); xfs_buf_item_free(bip); -- cgit From ef4d3ea40565a781c25847e9cb96c1bd9f462bc6 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 30 Nov 2022 17:55:51 +0000 Subject: afs: Fix server->active leak in afs_put_server The atomic_read was accidentally replaced with atomic_inc_return, which prevents the server from getting cleaned up and causes rmmod to hang with a warning: Can't purge s=00000001 Fixes: 2757a4dc1849 ("afs: Fix access after dec in put functions") Signed-off-by: Marc Dionne Signed-off-by: David Howells Link: https://lore.kernel.org/r/20221130174053.2665818-1-marc.dionne@auristor.com/ Signed-off-by: Linus Torvalds --- fs/afs/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/server.c b/fs/afs/server.c index 4981baf97835..b5237206eac3 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -406,7 +406,7 @@ void afs_put_server(struct afs_net *net, struct afs_server *server, if (!server) return; - a = atomic_inc_return(&server->active); + a = atomic_read(&server->active); zero = __refcount_dec_and_test(&server->ref, &r); trace_afs_server(debug_id, r - 1, a, reason); if (unlikely(zero)) -- cgit From f0a0ccda18d6fd826d7c7e7ad48a6ed61c20f8b4 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 19 Nov 2022 21:05:42 +0900 Subject: nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry() Syzbot reported a null-ptr-deref bug: NILFS (loop0): segctord starting. 
Construction interval = 5 seconds, CP frequency < 30 seconds general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 1 PID: 3603 Comm: segctord Not tainted 6.1.0-rc2-syzkaller-00105-gb229b6ca5abb #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 10/11/2022 RIP: 0010:nilfs_palloc_commit_free_entry+0xe5/0x6b0 fs/nilfs2/alloc.c:608 Code: 00 00 00 00 fc ff df 80 3c 02 00 0f 85 cd 05 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b 73 08 49 8d 7e 10 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 26 05 00 00 49 8b 46 10 be a6 00 00 00 48 c7 c7 RSP: 0018:ffffc90003dff830 EFLAGS: 00010212 RAX: dffffc0000000000 RBX: ffff88802594e218 RCX: 000000000000000d RDX: 0000000000000002 RSI: 0000000000002000 RDI: 0000000000000010 RBP: ffff888071880222 R08: 0000000000000005 R09: 000000000000003f R10: 000000000000000d R11: 0000000000000000 R12: ffff888071880158 R13: ffff88802594e220 R14: 0000000000000000 R15: 0000000000000004 FS: 0000000000000000(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb1c08316a8 CR3: 0000000018560000 CR4: 0000000000350ee0 Call Trace: nilfs_dat_commit_free fs/nilfs2/dat.c:114 [inline] nilfs_dat_commit_end+0x464/0x5f0 fs/nilfs2/dat.c:193 nilfs_dat_commit_update+0x26/0x40 fs/nilfs2/dat.c:236 nilfs_btree_commit_update_v+0x87/0x4a0 fs/nilfs2/btree.c:1940 nilfs_btree_commit_propagate_v fs/nilfs2/btree.c:2016 [inline] nilfs_btree_propagate_v fs/nilfs2/btree.c:2046 [inline] nilfs_btree_propagate+0xa00/0xd60 fs/nilfs2/btree.c:2088 nilfs_bmap_propagate+0x73/0x170 fs/nilfs2/bmap.c:337 nilfs_collect_file_data+0x45/0xd0 fs/nilfs2/segment.c:568 nilfs_segctor_apply_buffers+0x14a/0x470 fs/nilfs2/segment.c:1018 nilfs_segctor_scan_file+0x3f4/0x6f0 fs/nilfs2/segment.c:1067 nilfs_segctor_collect_blocks fs/nilfs2/segment.c:1197 [inline] nilfs_segctor_collect fs/nilfs2/segment.c:1503 [inline] nilfs_segctor_do_construct+0x12fc/0x6af0 fs/nilfs2/segment.c:2045 nilfs_segctor_construct+0x8e3/0xb30 fs/nilfs2/segment.c:2379 nilfs_segctor_thread_construct fs/nilfs2/segment.c:2487 [inline] nilfs_segctor_thread+0x3c3/0xf30 fs/nilfs2/segment.c:2570 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 ... If DAT metadata file is corrupted on disk, there is a case where req->pr_desc_bh is NULL and blocknr is 0 at nilfs_dat_commit_end() during a b-tree operation that cascadingly updates ancestor nodes of the b-tree, because nilfs_dat_commit_alloc() for a lower level block can initialize the blocknr on the same DAT entry between nilfs_dat_prepare_end() and nilfs_dat_commit_end(). If this happens, nilfs_dat_commit_end() calls nilfs_dat_commit_free() without valid buffer heads in req->pr_desc_bh and req->pr_bitmap_bh, and causes the NULL pointer dereference above in nilfs_palloc_commit_free_entry() function, which leads to a crash. Fix this by adding a NULL check on req->pr_desc_bh and req->pr_bitmap_bh before nilfs_palloc_commit_free_entry() in nilfs_dat_commit_free(). This also calls nilfs_error() in that case to notify that there is a fatal flaw in the filesystem metadata and prevent further operations. 
Link: https://lkml.kernel.org/r/00000000000097c20205ebaea3d6@google.com Link: https://lkml.kernel.org/r/20221114040441.1649940-1-zhangpeng362@huawei.com Link: https://lkml.kernel.org/r/20221119120542.17204-1-konishi.ryusuke@gmail.com Signed-off-by: ZhangPeng Signed-off-by: Ryusuke Konishi Reported-by: syzbot+ebe05ee8e98f755f61d0@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/dat.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 3b55e239705f..9930fa901039 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -111,6 +111,13 @@ static void nilfs_dat_commit_free(struct inode *dat, kunmap_atomic(kaddr); nilfs_dat_commit_entry(dat, req); + + if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) { + nilfs_error(dat->i_sb, + "state inconsistency probably due to duplicate use of vblocknr = %llu", + (unsigned long long)req->pr_entry_nr); + return; + } nilfs_palloc_commit_free_entry(dat, req); } -- cgit From d09e8ca6cb93bb4b97517a18fbbf7eccb0e9ff43 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 15 Nov 2022 02:06:01 +0000 Subject: mm: anonymous shared memory naming Since commit 9a10064f5625 ("mm: add a field to store names for private anonymous memory"), a name can be set for private anonymous memory, but not for shared anonymous memory. However, naming shared anonymous memory is just as useful for tracking purposes. Extend the functionality to be able to set names for shared anon. There are two ways to create anonymous shared memory, using memfd or directly via mmap(): 1. fd = memfd_create(...) mem = mmap(..., MAP_SHARED, fd, ...) 2. mem = mmap(..., MAP_SHARED | MAP_ANONYMOUS, -1, ...) In both cases the anonymous shared memory is created the same way by mapping an unlinked file on tmpfs. The memfd way allows giving a name to anonymous shared memory, but is not useful when parts of the shared memory need distinct names. Example use case: The VMM maps VM memory as anonymous shared memory (not private, because the VMM is sandboxed and drivers are running in their own processes). However, the VM reports back to the VMM how parts of the memory are actually used by the guest, how each of the segments should be backed (i.e. 4K pages, 2M pages), and some other information about the segments. The naming allows us to monitor the effective memory footprint for each of these segments from the host without looking inside the guest. Sample output: /* Create shared anonymous segment */ anon_shmem = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); /* Name the segment: "MY-NAME" */ rv = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, anon_shmem, SIZE, "MY-NAME"); cat /proc//maps (and smaps): 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 [anon_shmem:MY-NAME] If the segment is not named, the output is: 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 /dev/zero (deleted) Link: https://lkml.kernel.org/r/20221115020602.804224-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Arnd Bergmann Cc: Bagas Sanjaya Cc: Colin Cross Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: "Kirill A .
Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Peter Xu Cc: Sean Christopherson Cc: Vincent Whitchurch Cc: Vlastimil Babka Cc: xu xin Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a74cdcc9af0..89338950afd3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -274,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { + struct anon_vma_name *anon_name = NULL; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; vm_flags_t flags = vma->vm_flags; @@ -293,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); + if (mm) + anon_name = anon_vma_name(vma); /* * Print the dentry name for named mappings, and a @@ -300,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) */ if (file) { seq_pad(m, ' '); - seq_file_path(m, file, "\n"); + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. + */ + if (anon_name) + seq_printf(m, "[anon_shmem:%s]", anon_name->name); + else + seq_file_path(m, file, "\n"); goto done; } @@ -312,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - struct anon_vma_name *anon_name; - if (!mm) { name = "[vdso]"; goto done; @@ -330,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); seq_printf(m, "[anon:%s]", anon_name->name); -- cgit From dbaf7dc97ab8d526a20d3477419bc14b4890a82c Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 7 Nov 2022 09:56:59 +0800 Subject: hugetlbfs: inode: remove unnecessary (void*) conversions The ei pointer does not need to cast the type. Link: https://lkml.kernel.org/r/20221107015659.3221-1-zeming@nfschina.com Signed-off-by: Li zeming Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3ee84604e36d..790d2727141a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1279,7 +1279,7 @@ static const struct address_space_operations hugetlbfs_aops = { static void init_once(void *foo) { - struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } -- cgit From 6dd8fe86fa84729538d8bed3149faf9c5886bb5b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:52 -0800 Subject: ext4: convert move_extent_per_page() to use folios Patch series "Removing the try_to_release_page() wrapper", v3. This patchset replaces the remaining calls of try_to_release_page() with the folio equivalent: filemap_release_folio(). This allows us to remove the wrapper. This patch (of 4): Convert move_extent_per_page() to use folios. This change removes 5 calls to compound_head() and is in preparation for the removal of the try_to_release_page() wrapper. 
Link: https://lkml.kernel.org/r/20221118073055.55694-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221118073055.55694-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/move_extent.c | 52 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 044e34cd835c..8dbb87edf24c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; + struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; @@ -313,6 +314,13 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ + folio[0] = page_folio(pagep[0]); + folio[1] = page_folio(pagep[1]); + + VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); + VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); + VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -331,10 +339,10 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((page_has_private(pagep[0]) && - !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && - !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; goto drop_data_sem; } @@ -344,19 +352,21 @@ again: block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto unlock_pages; + goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); if (*err) - goto unlock_pages; + goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. 
*/ - if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; - goto unlock_pages; + goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, @@ -369,13 +379,13 @@ data_copy: replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto unlock_pages; + goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!page_has_buffers(pagep[0])) - create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); - bh = page_buffers(pagep[0]); + if (!folio_buffers(folio[0])) + create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); + bh = folio_buffers(folio[0]); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { @@ -385,7 +395,7 @@ data_copy: bh = bh->b_this_page; } if (!*err) - *err = block_commit_write(pagep[0], from, from + replaced_size); + *err = block_commit_write(&folio[0]->page, from, from + replaced_size); if (unlikely(*err < 0)) goto repair_branches; @@ -395,11 +405,11 @@ data_copy: *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); -unlock_pages: - unlock_page(pagep[0]); - put_page(pagep[0]); - unlock_page(pagep[1]); - put_page(pagep[1]); +unlock_folios: + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && @@ -430,7 +440,7 @@ repair_branches: *err = -EIO; } replaced_count = 0; - goto unlock_pages; + goto unlock_folios; } /** -- cgit From 2e41f274f9aa71cdcc69dc1f26a3f9304a651804 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:16 +0900 Subject: libfs: add DEFINE_SIMPLE_ATTRIBUTE_SIGNED for signed value Patch series "fix error when writing negative value to simple attribute files". The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), but some attribute files want to accept a negative value. This patch (of 3): The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value. This adds DEFINE_SIMPLE_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-1-akinobu.mita@gmail.com Link: https://lkml.kernel.org/r/20220919172418.45257-2-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. 
Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- fs/libfs.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 682d56345a1c..aada4e7c8713 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -995,8 +995,8 @@ out: EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ -ssize_t simple_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; @@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - ret = kstrtoull(attr->set_buf, 0, &val); + if (is_signed) + ret = kstrtoll(attr->set_buf, 0, &val); + else + ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); @@ -1027,8 +1030,21 @@ out: mutex_unlock(&attr->mutex); return ret; } + +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(simple_attr_write); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(simple_attr_write_signed); + /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on -- cgit From d472cf797c4e268613dbce5ec9b95d0bcae19ecb Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:18 +0900 Subject: debugfs: fix error when writing negative value to atomic_t debugfs file The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value for a debugfs file created by debugfs_create_atomic_t(). This restores the previous behaviour by introducing DEFINE_DEBUGFS_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-4-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. 
Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- fs/debugfs/file.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index ddb3fc258df9..b54f470e0d03 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, } EXPORT_SYMBOL_GPL(debugfs_attr_read); -ssize_t debugfs_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; @@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; - ret = simple_attr_write(file, buf, len, ppos); + if (is_signed) + ret = simple_attr_write_signed(file, buf, len, ppos); + else + ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(debugfs_attr_write); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); + static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, @@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** -- cgit From ce2fcf1516d674a174d9b34d1e1024d64de9fba3 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Wed, 9 Nov 2022 15:46:27 +0800 Subject: ocfs2: fix memory leak in ocfs2_mount_volume() There is a memory leak reported by kmemleak: unreferenced object 0xffff88810cc65e60 (size 32): comm "mount.ocfs2", pid 23753, jiffies 4302528942 (age 34735.105s) hex dump (first 32 bytes): 10 00 00 00 00 00 00 00 00 01 01 01 01 01 01 01 ................ 01 01 01 01 01 01 01 01 00 00 00 00 00 00 00 00 ................ backtrace: [] __kmalloc+0x4d/0x150 [] ocfs2_compute_replay_slots+0x121/0x330 [ocfs2] [] ocfs2_check_volume+0x485/0x900 [ocfs2] [] ocfs2_mount_volume.isra.0+0x1e9/0x650 [ocfs2] [] ocfs2_fill_super+0xe0b/0x1740 [ocfs2] [] mount_bdev+0x312/0x400 [] legacy_get_tree+0xed/0x1d0 [] vfs_get_tree+0x7d/0x230 [] path_mount+0xd62/0x1760 [] do_mount+0xca/0xe0 [] __x64_sys_mount+0x12c/0x1a0 [] do_syscall_64+0x35/0x80 [] entry_SYSCALL_64_after_hwframe+0x46/0xb0 This call stack is related to two problems. Firstly, the ocfs2 super uses "replay_map" to trace online/offline slots, in order to recover offline slots during recovery and mount. 
But when ocfs2_truncate_log_init() returns an error in ocfs2_mount_volume(), the memory of "replay_map" will not be freed in error handling path. Secondly, the memory of "replay_map" will not be freed if d_make_root() returns an error in ocfs2_fill_super(). But the memory of "replay_map" will be freed normally when completing recovery and mount in ocfs2_complete_mount_recovery(). Fix the first problem by adding error handling path to free "replay_map" when ocfs2_truncate_log_init() fails. And fix the second problem by calling ocfs2_free_replay_slots(osb) in the error handling path "out_dismount". In addition, since ocfs2_free_replay_slots() is static, it is necessary to remove its static attribute and declare it in header file. Link: https://lkml.kernel.org/r/20221109074627.2303950-1-lizetao1@huawei.com Fixes: 9140db04ef18 ("ocfs2: recover orphans in offline slots during recovery and mount") Signed-off-by: Li Zetao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 2 +- fs/ocfs2/journal.h | 1 + fs/ocfs2/super.c | 5 ++++- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 126671e6caed..3fb98b4569a2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -157,7 +157,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, replay_map->rm_state = REPLAY_DONE; } -static void ocfs2_free_replay_slots(struct ocfs2_super *osb) +void ocfs2_free_replay_slots(struct ocfs2_super *osb) { struct ocfs2_replay_map *replay_map = osb->replay_map; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 969d0aa28718..41c382f68529 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -150,6 +150,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +void ocfs2_free_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 42c993e53924..0b0e6a132101 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1159,6 +1159,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) out_dismount: atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); + ocfs2_free_replay_slots(osb); ocfs2_dismount_volume(sb, 1); goto out; @@ -1822,12 +1823,14 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_truncate_log_init(osb); if (status < 0) { mlog_errno(status); - goto out_system_inodes; + goto out_check_volume; } ocfs2_super_unlock(osb, 1); return 0; +out_check_volume: + ocfs2_free_replay_slots(osb); out_system_inodes: if (osb->local_alloc_state == OCFS2_LA_ENABLED) ocfs2_shutdown_local_alloc(osb); -- cgit From 811b99fd237e17230d660c6000021f23ce3f3985 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 11 Nov 2022 02:56:48 -0500 Subject: fat (exportfs): fix some kernel-doc warnings Fix the following W=1 kernel build warning(s): fs/fat/nfs.c:21: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst fs/fat/nfs.c:139: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst Link: https://lkml.kernel.org/r/20221111075648.4005-1-liubo03@inspur.com Signed-off-by: Bo Liu Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/nfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index af191371c352..3626eb585a98 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -17,7 +17,7 @@ struct fat_fid { #define FAT_FID_SIZE_WITHOUT_PARENT 3 #define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) -/** +/* * Look up a directory inode given its starting cluster. */ static struct inode *fat_dget(struct super_block *sb, int i_logstart) @@ -135,7 +135,7 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, return type; } -/** +/* * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. */ -- cgit From 63ffb573df66aea034d07fd00483d0a3cd4fed66 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 22 Nov 2022 03:04:00 +0100 Subject: efi: vars: prohibit reading random seed variables In anticipation of putting random seeds in EFI variables, it's important that the random GUID namespace of variables remains hidden from userspace. We accomplish this by not populating efivarfs with entries from that GUID, as well as denying the creation of new ones in that GUID. Signed-off-by: Jason A. Donenfeld Signed-off-by: Ard Biesheuvel --- fs/efivarfs/inode.c | 4 ++++ fs/efivarfs/super.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 939e5e242b98..617f3ad2485e 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -91,6 +91,10 @@ static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir, err = guid_parse(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); if (err) goto out; + if (guid_equal(&var->var.VendorGuid, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) { + err = -EPERM; + goto out; + } if (efivar_variable_is_removable(var->var.VendorGuid, dentry->d_name.name, namelen)) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6780fc81cc11..07e82e246666 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -116,6 +116,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, int err = -ENOMEM; bool is_removable = false; + if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) + return 0; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return err; -- cgit From eee22187b53611e173161e38f61de1c7ecbeb876 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 17 Aug 2022 21:27:01 +0800 Subject: ext4: add inode table check in __ext4_get_inode_loc to avoid possible infinite loop In do_writepages, if the value returned by ext4_writepages is "-ENOMEM" and "wbc->sync_mode == WB_SYNC_ALL", retry until the condition is not met. In __ext4_get_inode_loc, if the bh returned by sb_getblk is NULL, the function returns -ENOMEM. In __getblk_slow, if the return value of grow_buffers is less than 0, the function returns NULL. 
When the three processes are connected in series like the following stack, an infinite loop may occur: do_writepages <--- keep retrying ext4_writepages mpage_map_and_submit_extent mpage_map_one_extent ext4_map_blocks ext4_ext_map_blocks ext4_ext_handle_unwritten_extents ext4_ext_convert_to_initialized ext4_split_extent ext4_split_extent_at __ext4_ext_dirty __ext4_mark_inode_dirty ext4_reserve_inode_write ext4_get_inode_loc __ext4_get_inode_loc <--- return -ENOMEM sb_getblk __getblk_gfp __getblk_slow <--- return NULL grow_buffers grow_dev_page <--- return -ENXIO ret = (block < end_block) ? 1 : -ENXIO; In this issue, bg_inode_table_hi is overwritten as an incorrect value. As a result, `block < end_block` cannot be met in grow_dev_page. Therefore, __ext4_get_inode_loc always returns '-ENOMEM' and do_writepages keeps retrying. As a result, the writeback process is in the D state due to an infinite loop. Add a check on inode table block in the __ext4_get_inode_loc function by referring to ext4_read_inode_bitmap to avoid this infinite loop. Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20220817132701.3015912-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 121ce2fccfab..6d16241666e6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4479,9 +4479,17 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); - block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + block = ext4_inode_table(sb, gdp); + if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || + (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { + ext4_error(sb, "Invalid inode table block %llu in " + "block_group %u", block, iloc->block_group); + return -EFSCORRUPTED; + } + block += (inode_offset / inodes_per_block); + bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; -- cgit From 71df9683827a17fb6279fcc2e52efdc7062a03b9 Mon Sep 17 00:00:00 2001 From: Jinpeng Cui Date: Wed, 31 Aug 2022 16:08:43 +0000 Subject: ext4: remove redundant variable err Return value directly from ext4_group_extend_no_check() instead of getting value from redundant variable err. 
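The shape of the cleanup, as a generic sketch (do_work() is a hypothetical stand-in, not the real callee):

	/* before: a local that only ferries the return value */
	int err;
	err = do_work();
	return err;

	/* after: return the callee's result directly */
	return do_work();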
Reported-by: Zeal Robot Signed-off-by: Jinpeng Cui Link: https://lore.kernel.org/r/20220831160843.305836-1-cui.jinpeng2@zte.com.cn Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 46b87ffeb304..6e88a85abd7f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1831,7 +1831,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1886,8 +1885,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - err = ext4_group_extend_no_check(sb, o_blocks_count, add); - return err; + return ext4_group_extend_no_check(sb, o_blocks_count, add); } /* ext4_group_extend */ -- cgit From 56d0d0b9289dae041becc7ee6bd966a00dd610e0 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 16 Sep 2022 17:28:16 -0700 Subject: ext4: check the return value of ext4_xattr_inode_dec_ref() Check the return value of ext4_xattr_inode_dec_ref(), which could return error code and need to be warned. Signed-off-by: Li Zhong Link: https://lore.kernel.org/r/20220917002816.3804400-1-floridsleeves@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 36d6ba7190b6..718ef3987f94 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1540,7 +1540,8 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { - ext4_xattr_inode_dec_ref(handle, ea_inode); + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return err; } -- cgit From e3ea75ee651daf5e434afbfdb7dbf75e200ea1f6 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 4 Oct 2022 15:58:03 +0200 Subject: ext4: journal_path mount options should follow links Before the commit 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") ext4 mount option journal_path did follow links in the provided path. Bring this behavior back by allowing to pass pathwalk flags to fs_lookup_param(). Fixes: 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") Signed-off-by: Lukas Czerner Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20221004135803.32283-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 2 +- fs/fs_parser.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4749b303f2a9..0f71542cf453 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2247,7 +2247,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - error = fs_lookup_param(fc, param, 1, &path); + error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); diff --git a/fs/fs_parser.c b/fs/fs_parser.c index ed40ce5742fd..edb3712dcfa5 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -138,15 +138,16 @@ EXPORT_SYMBOL(__fs_parse); * @fc: The filesystem context to log errors through. * @param: The parameter. 
* @want_bdev: T if want a blockdev + * @flags: Pathwalk flags passed to filename_lookup() * @_path: The result of the lookup */ int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, + unsigned int flags, struct path *_path) { struct filename *f; - unsigned int flags = 0; bool put_f; int ret; -- cgit From 5f3e240321dd00b251f91a37c70fc551a140c87b Mon Sep 17 00:00:00 2001 From: changfengnan Date: Sat, 8 Oct 2022 20:05:18 +0800 Subject: ext4: split ext4_journal_start trace for debug We might want to know in detail why the jbd2 thread is doing heavy I/O, so split the ext4_journal_start trace into ext4_journal_start_sb and ext4_journal_start_inode, and show the inode number and handle type when possible. Signed-off-by: changfengnan Link: https://lore.kernel.org/r/20221008120518.74870-1-changfengnan@bytedance.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 14 ++++++++++---- fs/ext4/ext4_jbd2.h | 10 +++++----- fs/ext4/ialloc.c | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 8e1fb18f465e..77f318ec8abb 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -86,15 +86,21 @@ static int ext4_journal_check_start(struct super_block *sb) return 0; } -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, +handle_t *__ext4_journal_start_sb(struct inode *inode, + struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; - - trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, - _RET_IP_); + if (inode) + trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); + else + trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index db2ae4a2b38d..0c77697d5e90 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,9 +261,9 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks, - int revoke_creds); +handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, + unsigned int line, int type, int blocks, + int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -303,7 +303,7 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb) } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ + __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ @@ -323,7 +323,7 @@ static inline handle_t *__ext4_journal_start(struct inode *inode, int blocks, int rsv_blocks, int revoke_creds) { - return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e9bc46684106..d701977678d4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1076,8 +1076,8 @@ repeat_in_this_group: if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { 
BUG_ON(nblocks <= 0); - handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, 0, + handle = __ext4_journal_start_sb(NULL, dir->i_sb, + line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); -- cgit From 9d720a5a658f5135861773f26e927449bef93d61 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 30 Nov 2022 09:25:51 -0800 Subject: xfs: hoist refcount record merge predicates Hoist these multiline conditionals into separate static inline helpers to improve readability and set the stage for corruption fixes that will be introduced in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Xiao Yang --- fs/xfs/libxfs/xfs_refcount.c | 129 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 113 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 3f34bafe18dd..4408893333a6 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -815,11 +815,119 @@ out_error: /* Is this extent valid? */ static inline bool xfs_refc_valid( - struct xfs_refcount_irec *rc) + const struct xfs_refcount_irec *rc) { return rc->rc_startblock != NULLAGBLOCK; } +static inline bool +xfs_refc_want_merge_center( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + bool cleft_is_cright, + enum xfs_refc_adjust_op adjust, + unsigned long long *ulenp) +{ + unsigned long long ulen = left->rc_blockcount; + + /* + * To merge with a center record, both shoulder records must be + * adjacent to the record we want to adjust. This is only true if + * find_left and find_right made all four records valid. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(right) || + !xfs_refc_valid(cleft) || !xfs_refc_valid(cright)) + return false; + + /* There must only be one record for the entire range. */ + if (!cleft_is_cright) + return false; + + /* The shoulder record refcounts must match the new refcount. */ + if (left->rc_refcount != cleft->rc_refcount + adjust) + return false; + if (right->rc_refcount != cleft->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cleft->rc_blockcount + right->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + *ulenp = ulen; + return true; +} + +static inline bool +xfs_refc_want_merge_left( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = left->rc_blockcount; + + /* + * For a left merge, the left shoulder record must be adjacent to the + * start of the range. If this is true, find_left made left and cleft + * contain valid contents. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft)) + return false; + + /* Left shoulder record refcount must match the new refcount. */ + if (left->rc_refcount != cleft->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. 
+ */ + ulen += cleft->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + +static inline bool +xfs_refc_want_merge_right( + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = right->rc_blockcount; + + /* + * For a right merge, the right shoulder record must be adjacent to the + * end of the range. If this is true, find_right made cright and right + * contain valid contents. + */ + if (!xfs_refc_valid(right) || !xfs_refc_valid(cright)) + return false; + + /* Right shoulder record refcount must match the new refcount. */ + if (right->rc_refcount != cright->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cright->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + /* * Try to merge with any extents on the boundaries of the adjustment range. */ @@ -861,23 +969,15 @@ xfs_refcount_merge_extents( (cleft.rc_blockcount == cright.rc_blockcount); /* Try to merge left, cleft, and right. cleft must == cright. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + - right.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && - xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && - left.rc_refcount == cleft.rc_refcount + adjust && - right.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal, + adjust, &ulen)) { *shape_changed = true; return xfs_refcount_merge_center_extents(cur, &left, &cleft, &right, ulen, aglen); } /* Try to merge left and cleft. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && - left.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_left(&left, &cleft, adjust)) { *shape_changed = true; error = xfs_refcount_merge_left_extent(cur, &left, &cleft, agbno, aglen); @@ -893,10 +993,7 @@ xfs_refcount_merge_extents( } /* Try to merge cright and right. */ - ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; - if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && - right.rc_refcount == cright.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_right(&cright, &right, adjust)) { *shape_changed = true; return xfs_refcount_merge_right_extent(cur, &right, &cright, aglen); -- cgit From b25d1984aa884fc91a73a5a407b9ac976d441e9b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 30 Nov 2022 09:25:51 -0800 Subject: xfs: estimate post-merge refcounts correctly Upon enabling fsdax + reflink for XFS, xfs/179 began to report refcount metadata corruptions after being run. Specifically, xfs_repair noticed single-block refcount records that could be combined but had not been. The root cause of this is improper MAXREFCOUNT edge case handling in xfs_refcount_merge_extents. When we're trying to find candidates for a refcount btree record merge, we compute the refcount attribute of the merged record, but we fail to account for the fact that once a record hits rc_refcount == MAXREFCOUNT, it is pinned that way forever. Hence the computed refcount is wrong, and we fail to merge the extents. 
Fix this by adjusting the merge predicates to compute the adjusted refcount correctly. Fixes: 3172725814f9 ("xfs: adjust refcount of an extent of blocks in refcount btree") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Xiao Yang --- fs/xfs/libxfs/xfs_refcount.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 4408893333a6..6f7ed9288fe4 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -820,6 +820,17 @@ xfs_refc_valid( return rc->rc_startblock != NULLAGBLOCK; } +static inline xfs_nlink_t +xfs_refc_merge_refcount( + const struct xfs_refcount_irec *irec, + enum xfs_refc_adjust_op adjust) +{ + /* Once a record hits MAXREFCOUNT, it is pinned there forever */ + if (irec->rc_refcount == MAXREFCOUNT) + return MAXREFCOUNT; + return irec->rc_refcount + adjust; +} + static inline bool xfs_refc_want_merge_center( const struct xfs_refcount_irec *left, @@ -831,6 +842,7 @@ xfs_refc_want_merge_center( unsigned long long *ulenp) { unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; /* * To merge with a center record, both shoulder records must be @@ -846,9 +858,10 @@ xfs_refc_want_merge_center( return false; /* The shoulder record refcounts must match the new refcount. */ - if (left->rc_refcount != cleft->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) return false; - if (right->rc_refcount != cleft->rc_refcount + adjust) + if (right->rc_refcount != new_refcount) return false; /* @@ -871,6 +884,7 @@ xfs_refc_want_merge_left( enum xfs_refc_adjust_op adjust) { unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; /* * For a left merge, the left shoulder record must be adjacent to the @@ -881,7 +895,8 @@ xfs_refc_want_merge_left( return false; /* Left shoulder record refcount must match the new refcount. */ - if (left->rc_refcount != cleft->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) return false; /* @@ -903,6 +918,7 @@ xfs_refc_want_merge_right( enum xfs_refc_adjust_op adjust) { unsigned long long ulen = right->rc_blockcount; + xfs_nlink_t new_refcount; /* * For a right merge, the right shoulder record must be adjacent to the @@ -913,7 +929,8 @@ xfs_refc_want_merge_right( return false; /* Right shoulder record refcount must match the new refcount. */ - if (right->rc_refcount != cright->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cright, adjust); + if (right->rc_refcount != new_refcount) return false; /* -- cgit From 8c25febf23963431686f04874b96321288504127 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Thu, 1 Dec 2022 09:36:16 -0800 Subject: xfs: get rid of assert from xfs_btree_islastblock xfs_btree_check_block contains debugging knobs. With XFS_DEBUG setting up, turn on the debugging knob can trigger the assert of xfs_btree_islastblock, test script as follows: while true do mount $disk $mountpoint fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null echo 1 > /sys/fs/xfs/sda/errortag/btree_chk_sblk sleep 10 umount $mountpoint done Kick off fsstress and only *then* turn on the debugging knob. 
If it happens that the knob gets turned on after the cntbt lookup succeeds but before the call to xfs_btree_islastblock, then we *can* end up in the situation where a previously checked btree block suddenly starts returning EFSCORRUPTED from xfs_btree_check_block. Kaboom. Darrick gave a very detailed explanation as follows: Looking back at commit 27d9ee577dcce, I think the point of all this was to make sure that the cursor has actually performed a lookup, and that the btree block at whatever level we're asking about is ok. If the caller hasn't ever done a lookup, the bc_levels array will be empty, so the cur->bc_levels[level].bp pointer will be NULL. The call to xfs_btree_get_block will crash anyway, so the "ASSERT(block);" part is pointless. If the caller did a lookup but the lookup failed due to block corruption, the corresponding cur->bc_levels[level].bp pointer will also be NULL, and we'll still crash. The "ASSERT(xfs_btree_check_block);" logic is also unnecessary. If the cursor level points to an inode root, the block buffer will be incore, so it had better always be consistent. If the caller ignores a failed lookup after a successful one and calls this function, the cursor state is garbage and the assert wouldn't have tripped anyway. So get rid of the assert. Fixes: 27d9ee577dcc ("xfs: actually check xfs_btree_check_block return in xfs_btree_islastblock") Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index eef27858a013..29c4b4ccb909 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -556,7 +556,6 @@ xfs_btree_islastblock( struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); - ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); -- cgit From ddfdd530e43fcb3f7a0a69966e5f6c33497b4ae3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 1 Dec 2022 09:36:16 -0800 Subject: xfs: invalidate xfs_bufs when allocating cow extents While investigating test failures in xfs/17[1-3] in alwayscow mode, I noticed through code inspection that xfs_bmap_alloc_userdata isn't setting XFS_ALLOC_USERDATA when allocating extents for a file's CoW fork. COW staging extents should be flagged as USERDATA, since user data are persisted to these blocks before being remapped into a file. This mis-classification has a few impacts on the behavior of the system. First, the filestreams allocator is supposed to keep allocating from a chosen AG until it runs out of space in that AG. However, it only does that for USERDATA allocations, which means that COW allocations aren't tied to the filestreams AG. Fortunately, few people use filestreams, so nobody's noticed. A more serious problem is that xfs_alloc_ag_vextent_small looks for a buffer to invalidate *if* the USERDATA flag is set and the AG is so full that the allocation had to come from the AGFL because the cntbt is empty. The consequences of not invalidating the buffer are severe -- if the AIL incorrectly checkpoints a buffer that is now being used to store user data, that action will clobber the user's written data. Fix filestreams and yet another data corruption vector by flagging COW allocations as USERDATA. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 56b9b7db38bb..0d56a8d862e8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata( * the busy list. */ bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { bma->datatype |= XFS_ALLOC_USERDATA; if (bma->offset == 0) bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; -- cgit From e0cefada1383c5ceb5a35f08369d0d40a6629c18 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 1 Dec 2022 20:58:19 +0800 Subject: fscrypt: Add SM4 XTS/CTS symmetric algorithm support Add support for the XTS and CTS mode variants of the SM4 algorithm. The former is used to encrypt file contents, while the latter (SM4-CTS-CBC) is used to encrypt filenames. SM4 is a symmetric algorithm widely used in China, and is even a mandatory algorithm in some special scenarios. We need to provide these users with the ability to encrypt files or disks using SM4-XTS. Signed-off-by: Tianjia Zhang Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221201125819.36932-3-tianjia.zhang@linux.alibaba.com --- fs/crypto/keysetup.c | 15 +++++++++++++++ fs/crypto/policy.c | 5 +++++ 2 files changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 9e44dc078a81..94757ccd3056 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -44,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = { .security_strength = 16, .ivsize = 16, }, + [FSCRYPT_MODE_SM4_XTS] = { + .friendly_name = "SM4-XTS", + .cipher_str = "xts(sm4)", + .keysize = 32, + .security_strength = 16, + .ivsize = 16, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, + }, + [FSCRYPT_MODE_SM4_CTS] = { + .friendly_name = "SM4-CTS-CBC", + .cipher_str = "cts(cbc(sm4))", + .keysize = 16, + .security_strength = 16, + .ivsize = 16, + }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 84fa51604b15..893661b52376 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -90,6 +90,11 @@ static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) return true; + + if (contents_mode == FSCRYPT_MODE_SM4_XTS && + filenames_mode == FSCRYPT_MODE_SM4_CTS) + return true; + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); } -- cgit From e7f703ff2507f4e9f496da96cd4b78fd3026120c Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 2 Dec 2022 09:41:01 +0800 Subject: binfmt: Fix error return code in load_elf_fdpic_binary() Fix it to return a negative error code from create_elf_fdpic_tables() instead of 0. 
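The bug class is easy to demonstrate in isolation. A minimal userspace sketch (hypothetical helper names; the real fix is in load_elf_fdpic_binary(), shown in the diff below):

	#include <stdio.h>

	/* Pretend helper that fails, standing in for create_elf_fdpic_tables(). */
	static int create_tables(void) { return -22; /* like -EINVAL */ }

	static int load_buggy(void)
	{
		int retval = 0;			/* still 0 from earlier successes */

		if (create_tables() < 0)	/* result tested but thrown away */
			goto error;
		return 0;
	error:
		return retval;			/* bug: failure reported as 0 */
	}

	static int load_fixed(void)
	{
		int retval = create_tables();	/* capture the error code */

		if (retval < 0)
			goto error;
		return 0;
	error:
		return retval;			/* now propagates -22 */
	}

	int main(void)
	{
		printf("buggy=%d fixed=%d\n", load_buggy(), load_fixed());
		return 0;
	}

Compiled and run, the buggy variant reports 0 (success) even though the helper failed, while the fixed one propagates the negative error code.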
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Wang Yufen Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/1669945261-30271-1-git-send-email-wangyufen@huawei.com --- fs/binfmt_elf_fdpic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index e90c1192dec6..096e3520a0b1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -434,8 +434,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) current->mm->start_stack = current->mm->start_brk + stack_size; #endif - if (create_elf_fdpic_tables(bprm, current->mm, - &exec_params, &interp_params) < 0) + retval = create_elf_fdpic_tables(bprm, current->mm, &exec_params, + &interp_params); + if (retval < 0) goto error; kdebug("- start_code %lx", current->mm->start_code); -- cgit From d6fdf29f7b99814d3673f2d9f4649262807cb836 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 1 Dec 2022 17:01:03 +0100 Subject: posix_acl: Fix the type of sentinel in get_acl The type should be struct posix_acl * instead of void *. Cc: Christian Brauner Cc: Alexander Viro Signed-off-by: Uros Bizjak Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 989bbf280bfe..e6643db35cce 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -115,7 +115,7 @@ static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, int type) { - void *sentinel; + struct posix_acl *sentinel; struct posix_acl **p; struct posix_acl *acl; -- cgit From c65234b283a65cfbfc94619655e820a5e55199eb Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 17 Oct 2022 09:42:30 +0800 Subject: configfs: fix possible memory leak in configfs_create_dir() kmemleak reported memory leaks in configfs_create_dir(): unreferenced object 0xffff888009f6af00 (size 192): comm "modprobe", pid 3777, jiffies 4295537735 (age 233.784s) backtrace: kmem_cache_alloc (mm/slub.c:3250 mm/slub.c:3256 mm/slub.c:3263 mm/slub.c:3273) new_fragment (./include/linux/slab.h:600 fs/configfs/dir.c:163) configfs_register_subsystem (fs/configfs/dir.c:1857) basic_write (drivers/hwtracing/stm/p_basic.c:14) stm_p_basic do_one_initcall (init/main.c:1296) do_init_module (kernel/module/main.c:2455) ... unreferenced object 0xffff888003ba7180 (size 96): comm "modprobe", pid 3777, jiffies 4295537735 (age 233.784s) backtrace: kmem_cache_alloc (mm/slub.c:3250 mm/slub.c:3256 mm/slub.c:3263 mm/slub.c:3273) configfs_new_dirent (./include/linux/slab.h:723 fs/configfs/dir.c:194) configfs_make_dirent (fs/configfs/dir.c:248) configfs_create_dir (fs/configfs/dir.c:296) configfs_attach_group.isra.28 (fs/configfs/dir.c:816 fs/configfs/dir.c:852) configfs_register_subsystem (fs/configfs/dir.c:1881) basic_write (drivers/hwtracing/stm/p_basic.c:14) stm_p_basic do_one_initcall (init/main.c:1296) do_init_module (kernel/module/main.c:2455) ... This is because the refcount is not correct in configfs_make_dirent(). For normal stage, the refcount is changing as: configfs_register_subsystem() configfs_create_dir() configfs_make_dirent() configfs_new_dirent() # set s_count = 1 dentry->d_fsdata = configfs_get(sd); # s_count = 2 ... configfs_unregister_subsystem() configfs_remove_dir() remove_dir() configfs_remove_dirent() # s_count = 1 dput() ... 
*dentry_unlink_inode()* configfs_d_iput() # s_count = 0, release However, if we failed in configfs_create(): configfs_register_subsystem() configfs_create_dir() configfs_make_dirent() # s_count = 2 ... configfs_create() # fail ->out_remove: configfs_remove_dirent(dentry) configfs_put(sd) # s_count = 1 return PTR_ERR(inode); There is no inode in the error path, so the configfs_d_iput() is lost and makes sd and fragment memory leaked. To fix this, when we failed in configfs_create(), manually call configfs_put(sd) to keep the refcount correct. Fixes: 7063fbf22611 ("[PATCH] configfs: User-driven configuration filesystem") Signed-off-by: Chen Zhongjin Signed-off-by: Christoph Hellwig --- fs/configfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d1f9d2632202..ec6519e1ca3b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -316,6 +316,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry, return 0; out_remove: + configfs_put(dentry->d_fsdata); configfs_remove_dirent(dentry); return PTR_ERR(inode); } @@ -382,6 +383,7 @@ int configfs_create_link(struct configfs_dirent *target, struct dentry *parent, return 0; out_remove: + configfs_put(dentry->d_fsdata); configfs_remove_dirent(dentry); return PTR_ERR(inode); } -- cgit From d9a4af5690e26afa8a2eb83c575d3a9ef52cde1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Nov 2022 17:27:14 +0106 Subject: printk: Convert console_drivers list to hlist Replace the open coded single linked list with a hlist so a conversion to SRCU protected list walks can reuse the existing primitives. Co-developed-by: John Ogness Signed-off-by: John Ogness Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-3-john.ogness@linutronix.de --- fs/proc/consoles.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index dfe6ce3505ce..cf2e0788f9c7 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -74,8 +74,9 @@ static void *c_start(struct seq_file *m, loff_t *pos) static void *c_next(struct seq_file *m, void *v, loff_t *pos) { struct console *con = v; + ++*pos; - return con->next; + return hlist_entry_safe(con->node.next, struct console, node); } static void c_stop(struct seq_file *m, void *v) -- cgit From 8b5dd40088f7a3d85e6072f027ec50f309f64876 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:22 +0106 Subject: proc: consoles: document console_lock usage The console_lock is held throughout the start/show/stop procedure to print out device/driver information about all registered consoles. Since the console_lock is being used for multiple reasons, explicitly document these reasons. This will be useful when the console_lock is split into fine-grained locking. 
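Condensed, the pattern being documented looks like this (trimmed from the code around the hunk above; the lookup body and error handling are elided):

	static void *c_start(struct seq_file *m, loff_t *pos)
	{
		struct console *con;
		loff_t off = 0;

		console_lock();		/* held for the entire walk */
		for_each_console(con)
			if (off++ == *pos)
				break;
		return con;
	}

	static void c_stop(struct seq_file *m, void *v)
	{
		console_unlock();	/* walk finished */
	}

Because seq_file guarantees that ->stop() is always called after a successful ->start(), the lock cannot leak even if ->show() fails partway through the traversal.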
Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-11-john.ogness@linutronix.de --- fs/proc/consoles.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index cf2e0788f9c7..46b305fa04ed 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -63,6 +63,15 @@ static void *c_start(struct seq_file *m, loff_t *pos) struct console *con; loff_t off = 0; + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + * + * Hold the console_lock to guarantee safe traversal of the + * console list. SRCU cannot be used because there is no + * place to store the SRCU cookie. + */ console_lock(); for_each_console(con) if (off++ == *pos) -- cgit From 28de287a95362f52489c5a3f90acfb2b99524dae Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:46 +0106 Subject: proc: consoles: use console_list_lock for list iteration The console_lock is used in part to guarantee safe list iteration. The console_list_lock should be used because list synchronization responsibility will be removed from the console_lock in a later change. Note, the console_lock is still needed to serialize the device() callback with other console operations. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-35-john.ogness@linutronix.de --- fs/proc/consoles.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 46b305fa04ed..e0758fe7936d 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -33,7 +33,16 @@ static int show_console_dev(struct seq_file *m, void *v) if (con->device) { const struct tty_driver *driver; int index; + + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + */ + console_lock(); driver = con->device(con, &index); + console_unlock(); + if (driver) { dev = MKDEV(driver->major, driver->minor_start); dev += index; @@ -64,15 +73,11 @@ static void *c_start(struct seq_file *m, loff_t *pos) loff_t off = 0; /* - * Take console_lock to serialize device() callback with - * other console operations. For example, fg_console is - * modified under console_lock when switching vt. - * - * Hold the console_lock to guarantee safe traversal of the + * Hold the console_list_lock to guarantee safe traversal of the * console list. SRCU cannot be used because there is no * place to store the SRCU cookie. */ - console_lock(); + console_list_lock(); for_each_console(con) if (off++ == *pos) break; @@ -90,7 +95,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { - console_unlock(); + console_list_unlock(); } static const struct seq_operations consoles_op = { -- cgit From d272e01fa0a2f15c5c331a37cd99c6875c7b7186 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 15 Nov 2022 09:35:10 -0600 Subject: ksmbd: replace one-element arrays with flexible-array members One-element arrays are deprecated, and we are replacing them with flexible array members instead. So, replace one-element arrays with flexible-array members in multiple structs in fs/ksmbd/smb_common.h and one in fs/ksmbd/smb2pdu.h. 
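To make the difference concrete, here is a standalone sketch with a hypothetical struct (not one of the ksmbd ones): with a one-element array the trailing placeholder byte is counted by sizeof(), so sizing code needs a "- 1" correction like the one this patch removes from smb2pdu.c, whereas a flexible-array member contributes nothing to sizeof() and also lets FORTIFY_SOURCE bounds-check the trailing data.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct rec_old {		/* deprecated one-element style */
		unsigned int len;
		char name[1];		/* placeholder byte counted by sizeof() */
	};

	struct rec_new {		/* flexible-array style */
		unsigned int len;
		char name[];		/* contributes nothing to sizeof() */
	};

	int main(void)
	{
		const char *s = "hello";
		size_t n = strlen(s) + 1;

		/* old style: subtract the placeholder, as the removed "- 1" did */
		size_t old_sz = sizeof(struct rec_old) - 1 + n;
		/* new style: no correction term needed */
		size_t new_sz = sizeof(struct rec_new) + n;

		struct rec_new *r = malloc(new_sz);
		if (!r)
			return 1;
		r->len = (unsigned int)strlen(s);
		memcpy(r->name, s, n);
		printf("sizeof old=%zu new=%zu; alloc old=%zu new=%zu; name=%s\n",
		       sizeof(struct rec_old), sizeof(struct rec_new),
		       old_sz, new_sz, r->name);
		free(r);
		return 0;
	}
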
Important to mention is that doing a build before/after this patch results in no binary output differences. This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/242 Link: https://github.com/KSPP/linux/issues/79 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/Y3OxronfaPYv9qGP@work --- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smb2pdu.h | 2 +- fs/ksmbd/smb_common.h | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d0..31f00cec5255 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3438,7 +3438,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + struct_sz = readdir_info_level_struct_sz(info_level) + conv_len; next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); d_info->last_entry_off_align = next_entry_offset - struct_sz; @@ -3690,7 +3690,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return -EOPNOTSUPP; conv_len = (d_info->name_len + 1) * 2; - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + next_entry_offset = ALIGN(struct_sz + conv_len, KSMBD_DIR_INFO_ALIGNMENT); if (next_entry_offset > d_info->out_buf_len) { diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 092fdd3f8750..aa5dbe54f5a1 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -443,7 +443,7 @@ struct smb2_posix_info { /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ u8 SidBuffer[32]; __le32 name_len; - u8 name[1]; + u8 name[]; /* * var sized owner SID * var sized group SID diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 318c16fa81da..e663ab9ea759 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -277,14 +277,14 @@ struct file_directory_info { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0x101 FF resp data */ struct file_names_info { __le32 NextEntryOffset; __u32 FileIndex; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0xc FF resp data */ struct file_full_directory_info { @@ -299,7 +299,7 @@ struct file_full_directory_info { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; - char FileName[1]; + char FileName[]; } __packed; /* level 0x102 FF resp */ struct file_both_directory_info { @@ -317,7 +317,7 @@ struct file_both_directory_info { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __packed; /* level 0x104 FFrsp data */ struct file_id_both_directory_info { @@ -337,7 +337,7 @@ struct file_id_both_directory_info { __u8 ShortName[24]; __le16 Reserved2; __le64 UniqueId; - char FileName[1]; + char FileName[]; } __packed; struct file_id_full_dir_info { @@ -354,7 +354,7 @@ struct file_id_full_dir_info { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __packed; /* level 0x105 FF rsp data */ struct smb_version_values { -- cgit From 
6a46bf558803dd2b959ca7435a5c143efe837217 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 2 Nov 2022 10:51:23 +0800 Subject: binfmt_misc: fix shift-out-of-bounds in check_special_flags UBSAN reported a shift-out-of-bounds warning: left shift of 1 by 31 places cannot be represented in type 'int' Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x8d/0xcf lib/dump_stack.c:106 ubsan_epilogue+0xa/0x44 lib/ubsan.c:151 __ubsan_handle_shift_out_of_bounds+0x1e7/0x208 lib/ubsan.c:322 check_special_flags fs/binfmt_misc.c:241 [inline] create_entry fs/binfmt_misc.c:456 [inline] bm_register_write+0x9d3/0xa20 fs/binfmt_misc.c:654 vfs_write+0x11e/0x580 fs/read_write.c:582 ksys_write+0xcf/0x120 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x34/0x80 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x4194e1 Since the type of Node's flags is unsigned long, we should define these macros with same type too. Signed-off-by: Liu Shixin Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221102025123.1117184-1-liushixin2@huawei.com --- fs/binfmt_misc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1eae7ea823a..bb202ad369d5 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -44,10 +44,10 @@ static LIST_HEAD(entries); static int enabled = 1; enum {Enabled, Magic}; -#define MISC_FMT_PRESERVE_ARGV0 (1 << 31) -#define MISC_FMT_OPEN_BINARY (1 << 30) -#define MISC_FMT_CREDENTIALS (1 << 29) -#define MISC_FMT_OPEN_FILE (1 << 28) +#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) +#define MISC_FMT_OPEN_BINARY (1UL << 30) +#define MISC_FMT_CREDENTIALS (1UL << 29) +#define MISC_FMT_OPEN_FILE (1UL << 28) typedef struct { struct list_head list; -- cgit From e1fce564900f8734edf15b87f028c57e14f6e28d Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 2 Dec 2022 16:22:54 +0800 Subject: pstore/ram: Fix error return code in ramoops_probe() In the if (dev_of_node(dev) && !pdata) path, the "err" may be assigned a value of 0, so the error return code -EINVAL may be incorrectly set to 0. To fix set valid return code before calling to goto. Fixes: 35da60941e44 ("pstore/ram: add Device Tree bindings") Signed-off-by: Wang Yufen Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/1669969374-46582-1-git-send-email-wangyufen@huawei.com --- fs/pstore/ram.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 5a39fa8522b0..9a5052431fd3 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -745,6 +745,7 @@ static int ramoops_probe(struct platform_device *pdev) /* Make sure we didn't get bogus platform data pointer. */ if (!pdata) { pr_err("NULL platform data\n"); + err = -EINVAL; goto fail_out; } @@ -752,6 +753,7 @@ static int ramoops_probe(struct platform_device *pdev) !pdata->ftrace_size && !pdata->pmsg_size)) { pr_err("The memory size and the record/console size must be " "non-zero\n"); + err = -EINVAL; goto fail_out; } -- cgit From 1f5619ed881081be300db61da552ffae7163bb72 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 2 Dec 2022 13:30:51 -0800 Subject: xfs: Remove duplicated include in xfs_iomap.c ./fs/xfs/xfs_iomap.c: xfs_error.h is included more than once. ./fs/xfs/xfs_iomap.c: xfs_errortag.h is included more than once. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3337 Reported-by: Abaci Robot Signed-off-by: Yang Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_iomap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 68436370927d..43f447199c08 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,8 +27,6 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" -#include "xfs_error.h" -#include "xfs_errortag.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) -- cgit From 7868f93006ad27c00ecddcc2904118aa705459ca Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 3 Dec 2022 01:07:24 +0000 Subject: udf: Increase UDF_MAX_READ_VERSION to 0x0260 Some discs containing the UDF file system are unable to be mounted, failing with the following message: UDF-fs: error (device sr0): udf_fill_super: minUDFReadRev=260 (max is 250) The UDF 2.60 specification [0] states in the section Basic Restrictions & Requirements (page 10): The Minimum UDF Read Revision value shall be at most #0250 for all media with a UDF 2.60 file system. This indicates that a UDF 2.50 implementation can read all UDF 2.60 media. Media that do not have a Metadata Partition may use a value lower than #250. The conclusion is that the discs failing to mount were burned with a faulty software, which didn't follow the specification. This can be worked around by increasing UDF_MAX_READ_VERSION to 0x260, to match the Minimum Read Revision. No other changes are required, as reading UDF 2.60 is backward compatible with UDF 2.50. [0] http://www.osta.org/specs/pdf/udf260.pdf Signed-off-by: Bartosz Taudul Signed-off-by: Jan Kara --- fs/udf/udf_sb.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4fa620543d30..291b56dd011e 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -6,7 +6,11 @@ #include #include -#define UDF_MAX_READ_VERSION 0x0250 +/* + * Even UDF 2.6 media should have version <= 0x250 but apparently there are + * some broken filesystems with version set to 0x260. Accommodate those. + */ +#define UDF_MAX_READ_VERSION 0x0260 #define UDF_MAX_WRITE_VERSION 0x0201 #define UDF_FLAG_USE_EXTENDED_FE 0 -- cgit From 83ae4133ac9410ac6a57136e464d498dc66200cf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:09 -0400 Subject: btrfs: add a cached_state to try_lock_extent With nowait becoming more pervasive throughout our codebase go ahead and add a cached_state to try_lock_extent(). This allows us to be faster about clearing the locked area if we have contention, and then gives us the same optimization for unlock if we are able to lock the range. 
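The resulting calling pattern, sketched from the fs/btrfs/file.c hunk below (simplified; page cleanup on the failure path is elided):

        struct extent_state *cached_state = NULL;

        /* On contention, try_lock_extent() now caches the contended state,
         * so its internal clear_extent_bit() cleanup skips a tree search. */
        if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
                             &cached_state))
                return -EAGAIN;

        /* ... operate on the locked range ... */

        /* On success, cached_state points at the locked state, so the
         * unlock also avoids searching the tree. */
        unlock_extent(&inode->io_tree, start_pos, last_pos, &cached_state);
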
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 7 ++++--- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/extent_io.c | 3 ++- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 3 ++- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/relocation.c | 2 +- 7 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 83cb0378096f..1b0a45b51f4c 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1615,17 +1615,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - NULL, NULL, GFP_NOFS); + cached, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, NULL); + EXTENT_LOCKED, cached); return 0; } return 1; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index a855f40dd61d..786be8f38f0b 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -106,7 +106,8 @@ void extent_io_tree_release(struct extent_io_tree *tree); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dcf22e051ff..c7e94a0e60d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4959,7 +4959,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { - if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + NULL)) return -EAGAIN; } else { ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d01631d47806..98107466572b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1302,7 +1302,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) { + if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state)) { for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0e516aefbf51..2ba2d8b9cefc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7255,7 +7255,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend)) + if (!try_lock_extent(io_tree, lockstart, lockend, + cached_state)) return -EAGAIN; } else { lock_extent(io_tree, lockstart, lockend, cached_state); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e54f8280031f..b648c9d4ea0f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1073,7 +1073,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode 
*inode, u64 start, u64 end) { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end)) + if (!try_lock_extent(&inode->io_tree, start, end, NULL)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 666a37a0ee89..e81a21082e58 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1120,7 +1120,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); + key.offset, end, NULL); if (!ret) continue; -- cgit From 632ddfa2131f0fea1831bc1f4b28c68faa779156 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:10 -0400 Subject: btrfs: use cached_state for btrfs_check_nocow_lock Now that try_lock_extent() takes a cached_state, plumb the cached_state through btrfs_try_lock_ordered_range() and then use a cached_state in btrfs_check_nocow_lock everywhere to avoid extra tree searches on the extent_io_tree. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 9 ++++++--- fs/btrfs/ordered-data.c | 7 ++++--- fs/btrfs/ordered-data.h | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 98107466572b..493cae66e5e6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1373,6 +1373,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; u64 lockstart, lockend; u64 num_bytes; int ret; @@ -1389,12 +1390,14 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, num_bytes = lockend - lockstart + 1; if (nowait) { - if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) { + if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, + &cached_state)) { btrfs_drew_write_unlock(&root->snapshot_lock); return -EAGAIN; } } else { - btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, + &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL, nowait, false); @@ -1403,7 +1406,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, NULL); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b648c9d4ea0f..de2b716d3e7b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1069,11 +1069,12 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, * Return true if btrfs_lock_ordered_range does not return any extents, * otherwise false. 
*/ -bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end, NULL)) + if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); @@ -1081,7 +1082,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) return true; btrfs_put_ordered_extent(ordered); - unlock_extent(&inode->io_tree, start, end, NULL); + unlock_extent(&inode->io_tree, start, end, cached_state); return false; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f59f2dbdb25e..89f82b78f590 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -206,7 +206,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end); +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state); int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, u64 post); int __init ordered_data_init(void); -- cgit From 9c5c9604631ae5fcdc5124c79d01d75f80b5ffd4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:11 -0400 Subject: btrfs: use a cached_state everywhere in relocation All of the relocation code avoids using the cached state, despite everywhere using the normal lock_extent() // do something unlock_extent() pattern. Fix this by plumbing a cached state throughout all of these functions in order to allow for less tree searches. 
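One detail worth noting in the hunks below: the cached state is declared per locked range rather than per function, because the unlock consumes it. A minimal sketch of that shape (loop bodies elided):

        for (nr = 0; nr < cluster->nr; nr++) {
                struct extent_state *cached_state = NULL; /* one per range */

                /* lock_extent() records the extent state covering this range */
                lock_extent(&inode->io_tree, start, end, &cached_state);
                /* ... prealloc / mark delalloc / drop extent maps ... */
                unlock_extent(&inode->io_tree, start, end, &cached_state);
                /* the unlock drops the cached reference, so the next
                 * iteration starts from a fresh NULL pointer */
        }
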
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e81a21082e58..9dcf9b39c798 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1113,6 +1113,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, inode = find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { + struct extent_state *cached_state = NULL; + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, @@ -1120,14 +1122,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, NULL); + key.offset, end, + &cached_state); if (!ret) continue; btrfs_drop_extent_map_range(BTRFS_I(inode), key.offset, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, NULL); + key.offset, end, &cached_state); } } @@ -1516,6 +1519,8 @@ static int invalidate_extent_cache(struct btrfs_root *root, objectid = min_key->objectid; while (1) { + struct extent_state *cached_state = NULL; + cond_resched(); iput(inode); @@ -1566,9 +1571,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); } return 0; } @@ -2863,19 +2868,21 @@ static noinline_for_stack int prealloc_file_extent_cluster( btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { + struct extent_state *cached_state = NULL; + start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) end = cluster->boundary[nr + 1] - 1 - offset; else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end, NULL); + lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } @@ -2891,6 +2898,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod u64 start, u64 end, u64 block_start) { struct extent_map *em; + struct extent_state *cached_state = NULL; int ret = 0; em = alloc_extent_map(); @@ -2903,9 +2911,9 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->block_start = block_start; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); free_extent_map(em); return ret; @@ -2983,6 +2991,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, */ cur = max(page_start, cluster->boundary[*cluster_nr] - offset); while (cur <= 
page_end) { + struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; @@ -2998,13 +3007,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, - clamped_end, 0, NULL); + clamped_end, 0, &cached_state); if (ret) { - clear_extent_bits(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY); + clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -3031,7 +3042,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, boundary_start, boundary_end, EXTENT_BOUNDARY); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; -- cgit From 123a7f008c9e2b25b451c116620f1f6c77bd6b2b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:12 -0400 Subject: btrfs: cache the failed state when locking extents Currently if we fail to lock a range we'll return the start of the range that we failed to lock. We'll then search down to this range and wait on any extent states in this range. However we can avoid this search altogether if we simply cache the extent_state that had the contention. We can pass this into wait_extent_bit() and start from that extent_state without doing the search. In the most optimistic case we can avoid all searches, more likely we'll avoid the initial search and have to perform the search after we wait on the failed state, or worst case we must search both times which is what currently happens. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 52 +++++++++++++++++++++++++++++++++++------------ fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/extent_io.c | 3 ++- 3 files changed, 43 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 1b0a45b51f4c..a630c771d25c 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -714,7 +714,8 @@ static void wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; @@ -722,6 +723,16 @@ void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) spin_lock(&tree->lock); again: + /* + * Maintain cached_state, as we may not remove it from the tree if there + * are more bits than the bits we're waiting on set on this state. 
+ */ + if (cached_state && *cached_state) { + state = *cached_state; + if (extent_state_in_tree(state) && + state->start <= start && start < state->end) + goto process_node; + } while (1) { /* * This search will find all the extents that end after our @@ -752,6 +763,12 @@ process_node: } } out: + /* This state is no longer useful, clear it and free it up. */ + if (cached_state && *cached_state) { + state = *cached_state; + *cached_state = NULL; + free_extent_state(state); + } spin_unlock(&tree->lock); } @@ -939,13 +956,17 @@ out: * sleeping, so the gfp mask is used to indicate what is allowed. * * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. + * part of the range already has the desired bits set. The extent_state of the + * existing range is returned in failed_state in this case, and the start of the + * existing range is returned in failed_start. failed_state is used as an + * optimization for wait_extent_bit, failed_start must be used as the source of + * truth as failed_state may have changed since we returned. * * [start, end] is inclusive This takes the tree lock. */ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u64 *failed_start, + struct extent_state **failed_state, struct extent_state **cached_state, struct extent_changeset *changeset, gfp_t mask) { @@ -964,7 +985,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (exclusive_bits) ASSERT(failed_start); else - ASSERT(failed_start == NULL); + ASSERT(failed_start == NULL && failed_state == NULL); again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -1012,6 +1033,7 @@ hit_next: if (state->start == start && state->end <= end) { if (state->state & exclusive_bits) { *failed_start = state->start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1047,6 +1069,7 @@ hit_next: if (state->start < start) { if (state->state & exclusive_bits) { *failed_start = start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1125,6 +1148,7 @@ hit_next: if (state->start <= end && state->end > end) { if (state->state & exclusive_bits) { *failed_start = start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1162,8 +1186,8 @@ out: int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, gfp_t mask) { - return __set_extent_bit(tree, start, end, bits, NULL, cached_state, - NULL, mask); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, + cached_state, NULL, mask); } /* @@ -1598,8 +1622,8 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset, - GFP_NOFS); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, + changeset, GFP_NOFS); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1622,7 +1646,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - cached, NULL, GFP_NOFS); + NULL, cached, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1639,20 +1663,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, int lock_extent(struct 
extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { + struct extent_state *failed_state = NULL; int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - cached_state, NULL, GFP_NOFS); + &failed_state, cached_state, NULL, GFP_NOFS); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, EXTENT_LOCKED, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, + &failed_state); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, - &failed_start, cached_state, NULL, - GFP_NOFS); + &failed_start, &failed_state, + cached_state, NULL, GFP_NOFS); } return err; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 786be8f38f0b..c71aa29f719d 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -235,6 +235,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c7e94a0e60d5..836851233618 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5001,7 +5001,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (ret || wait != WAIT_COMPLETE) return ret; - wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, + EXTENT_LOCKED, NULL); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) ret = -EIO; return ret; -- cgit From e5e886bad9e9e87b767ade3884faec1cfdec9b43 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:13 -0400 Subject: btrfs: add cached_state to read_extent_buffer_subpage We don't use a cached state here at all, which generally makes sense as async reads are going to unlock at endio time. However for blocking reads we will call wait_extent_bit() for our range. Since the lock_extent() stuff will return the cached_state for the start of the range this is a helpful optimization to have for this case, we'll have the exact state we want to wait on. Add a cached state here and simply throw it away if we're a non-blocking read, otherwise we'll get a small improvement by eliminating some tree searches. 
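Sketched from the hunk below: the blocking path threads a single cached state from the lock through the wait, while the non-blocking path simply frees it, since endio does the unlocking (simplified):

        struct extent_state *cached_state = NULL;

        ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
                          &cached_state);
        /* ... submit the read bio ... */
        if (ret || wait != WAIT_COMPLETE) {
                /* async read: nothing to wait on here */
                free_extent_state(cached_state);
                return ret;
        }
        /* blocking read: wait on exactly the state we locked */
        wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
                        EXTENT_LOCKED, &cached_state);
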
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 836851233618..7891d375eb43 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4949,6 +4949,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, }; @@ -4960,10 +4961,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (wait == WAIT_NONE) { if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - NULL)) + &cached_state)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); if (ret < 0) return ret; } @@ -4973,7 +4975,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); return ret; } @@ -4998,11 +5001,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, atomic_dec(&eb->io_pages); } submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) + if (ret || wait != WAIT_COMPLETE) { + free_extent_state(cached_state); return ret; + } wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, - EXTENT_LOCKED, NULL); + EXTENT_LOCKED, &cached_state); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) ret = -EIO; return ret; -- cgit From c1b078545e64da7a2b67b74b9d12813b5dd2a529 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 7 Oct 2022 18:33:35 +0200 Subject: btrfs: simplify cleanup after error in btrfs_create_tree Since leaf is already NULL, and no other branch will go to fail_unlock, the fail_unlock label is useless and can be removed Signed-off-by: Peng Hao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d99bf7c64611..3460eaa9e4aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1197,7 +1197,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; - goto fail_unlock; + goto fail; } root->node = leaf; @@ -1232,9 +1232,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; -fail_unlock: - if (leaf) - btrfs_tree_unlock(leaf); fail: btrfs_put_root(root); -- cgit From d60d956eb41f0945c2438af11ed412109c558f4f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:25 -0400 Subject: btrfs: remove unused set/clear_pending_info helpers The last users of these helpers were removed in 5297199a8bca ("btrfs: remove inode number cache feature") so delete these helpers. The point was for mount options that were applicable after transaction commit so they could not be applied immediately. We don't have such options anymore and if we do the patch can be reverted. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9e6d48ff4597..3be1db2c558f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1661,30 +1661,6 @@ do { \ #define btrfs_clear_pending(info, opt) \ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -/* - * Helpers for setting pending mount option changes. - * - * Expects corresponding macros - * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name - */ -#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), SET_##opt); \ - btrfs_clear_pending((info), CLEAR_##opt); \ - } \ -} while(0) - -#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), CLEAR_##opt); \ - btrfs_clear_pending((info), SET_##opt); \ - } \ -} while(0) - /* * Inode flags */ -- cgit From ea206640a600c808d30d8d62c44b6982c12a143b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:26 -0400 Subject: btrfs: remove unused BTRFS_TOTAL_BYTES_PINNED_BATCH This hasn't been used since 138a12d86574 ("btrfs: rip out btrfs_space_info::total_bytes_pinned") so it is safe to remove. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3be1db2c558f..8bd49328efab 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -92,14 +92,6 @@ struct reloc_control; #define BTRFS_DIRTY_METADATA_THRESH SZ_32M -/* - * Use large batch size to reduce overhead of metadata updates. On the reader - * side, we only read it when we are close to ENOSPC and the read overhead is - * mostly related to the number of CPUs, so it is OK to use arbitrary large - * value here. - */ -#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M - #define BTRFS_MAX_EXTENT_SIZE SZ_128M /* -- cgit From 4ce76e8e783693c72956e817f3a6dbbaa055411a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:27 -0400 Subject: btrfs: remove unused BTRFS_IOPRIO_READA The last user of this definition was removed in patch f26c92386028 ("btrfs: remove reada infrastructure") so we can remove this definition. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8bd49328efab..7760cc07288e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -87,9 +87,6 @@ struct reloc_control; #define BTRFS_EMPTY_DIR_SIZE 0 -/* ioprio of readahead is set to idle */ -#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) - #define BTRFS_DIRTY_METADATA_THRESH SZ_32M #define BTRFS_MAX_EXTENT_SIZE SZ_128M -- cgit From 4300c58f809079951c87d84e5f11a2d265e3c9e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:28 -0400 Subject: btrfs: move btrfs on-disk definitions out of ctree.h The bulk of our on-disk definitions exist in btrfs_tree.h, which user space can use. 
Keep things consistent and move the rest of the on disk definitions out of ctree.h into btrfs_tree.h. Note I did have to update all u8's to __u8, but otherwise this is a strict copy and paste. Most of the definitions are mainly for internal use and are not guaranteed stable public API and may change as we need. Compilation failures by user applications can happen. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments, style fixups ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 215 +------------------------------------------------------ 1 file changed, 1 insertion(+), 214 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7760cc07288e..7a80f9380146 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -55,8 +55,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ - /* * Maximum number of mirrors that can be available for all profiles counting * the target device of dev-replace as one. During an active device replace @@ -68,8 +66,6 @@ struct reloc_control; */ #define BTRFS_MAX_MIRRORS (4 + 1) -#define BTRFS_MAX_LEVEL 8 - #define BTRFS_OLDEST_GENERATION 0ULL /* @@ -138,81 +134,9 @@ enum { BTRFS_FS_STATE_COUNT }; -#define BTRFS_BACKREF_REV_MAX 256 -#define BTRFS_BACKREF_REV_SHIFT 56 -#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ - BTRFS_BACKREF_REV_SHIFT) - -#define BTRFS_OLD_BACKREF_REV 0 -#define BTRFS_MIXED_BACKREF_REV 1 - -/* - * every tree block (leaf or node) starts with this header. - */ -struct btrfs_header { - /* these first four must match the super block */ - u8 csum[BTRFS_CSUM_SIZE]; - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - __le64 bytenr; /* which block this node is supposed to live in */ - __le64 flags; - - /* allowed to be different from the super from here on down */ - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - __le64 generation; - __le64 owner; - __le32 nritems; - u8 level; -} __attribute__ ((__packed__)); - -/* - * this is a very generous portion of the super block, giving us - * room to translate 14 chunks with 3 stripes each. - */ -#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 - -/* - * just in case we somehow lose the roots and are not able to mount, - * we store an array of the roots from previous transactions - * in the super. - */ -#define BTRFS_NUM_BACKUP_ROOTS 4 -struct btrfs_root_backup { - __le64 tree_root; - __le64 tree_root_gen; - - __le64 chunk_root; - __le64 chunk_root_gen; - - __le64 extent_root; - __le64 extent_root_gen; - - __le64 fs_root; - __le64 fs_root_gen; - - __le64 dev_root; - __le64 dev_root_gen; - - __le64 csum_root; - __le64 csum_root_gen; - - __le64 total_bytes; - __le64 bytes_used; - __le64 num_devices; - /* future */ - __le64 unused_64[4]; - - u8 tree_root_level; - u8 chunk_root_level; - u8 extent_root_level; - u8 fs_root_level; - u8 dev_root_level; - u8 csum_root_level; - /* future and to align */ - u8 unused_8[10]; -} __attribute__ ((__packed__)); - #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* * The reserved space at the beginning of each device. 
@@ -221,69 +145,6 @@ struct btrfs_root_backup { */ #define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) -/* - * the super block basically lists the main trees of the FS - * it currently lacks any block count etc etc - */ -struct btrfs_super_block { - /* the first 4 fields must match struct btrfs_header */ - u8 csum[BTRFS_CSUM_SIZE]; - /* FS specific UUID, visible to user */ - u8 fsid[BTRFS_FSID_SIZE]; - __le64 bytenr; /* this block number */ - __le64 flags; - - /* allowed to be different from the btrfs_header from here own down */ - __le64 magic; - __le64 generation; - __le64 root; - __le64 chunk_root; - __le64 log_root; - - /* - * This member has never been utilized since the very beginning, thus - * it's always 0 regardless of kernel version. We always use - * generation + 1 to read log tree root. So here we mark it deprecated. - */ - __le64 __unused_log_root_transid; - __le64 total_bytes; - __le64 bytes_used; - __le64 root_dir_objectid; - __le64 num_devices; - __le32 sectorsize; - __le32 nodesize; - __le32 __unused_leafsize; - __le32 stripesize; - __le32 sys_chunk_array_size; - __le64 chunk_root_generation; - __le64 compat_flags; - __le64 compat_ro_flags; - __le64 incompat_flags; - __le16 csum_type; - u8 root_level; - u8 chunk_root_level; - u8 log_root_level; - struct btrfs_dev_item dev_item; - - char label[BTRFS_LABEL_SIZE]; - - __le64 cache_generation; - __le64 uuid_tree_generation; - - /* the UUID written into btree blocks */ - u8 metadata_uuid[BTRFS_FSID_SIZE]; - - /* future expansion */ - u8 reserved8[8]; - __le64 reserved[27]; - u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; - struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; - - /* Padded to 4096 bytes */ - u8 padding[565]; -} __attribute__ ((__packed__)); -static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); - /* * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount @@ -341,43 +202,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL -/* - * A leaf is full of items. offset and size tell us where to find - * the item in the leaf (relative to the start of the data area) - */ -struct btrfs_item { - struct btrfs_disk_key key; - __le32 offset; - __le32 size; -} __attribute__ ((__packed__)); - -/* - * leaves have an item area and a data area: - * [item0, item1....itemN] [free space] [dataN...data1, data0] - * - * The data is separate from the items to get the keys closer together - * during searches. 
- */ -struct btrfs_leaf { - struct btrfs_header header; - struct btrfs_item items[]; -} __attribute__ ((__packed__)); - -/* - * all non-leaf blocks are nodes, they hold only keys and pointers to - * other blocks - */ -struct btrfs_key_ptr { - struct btrfs_disk_key key; - __le64 blockptr; - __le64 generation; -} __attribute__ ((__packed__)); - -struct btrfs_node { - struct btrfs_header header; - struct btrfs_key_ptr ptrs[]; -} __attribute__ ((__packed__)); - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, @@ -1650,43 +1474,6 @@ do { \ #define btrfs_clear_pending(info, opt) \ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -/* - * Inode flags - */ -#define BTRFS_INODE_NODATASUM (1U << 0) -#define BTRFS_INODE_NODATACOW (1U << 1) -#define BTRFS_INODE_READONLY (1U << 2) -#define BTRFS_INODE_NOCOMPRESS (1U << 3) -#define BTRFS_INODE_PREALLOC (1U << 4) -#define BTRFS_INODE_SYNC (1U << 5) -#define BTRFS_INODE_IMMUTABLE (1U << 6) -#define BTRFS_INODE_APPEND (1U << 7) -#define BTRFS_INODE_NODUMP (1U << 8) -#define BTRFS_INODE_NOATIME (1U << 9) -#define BTRFS_INODE_DIRSYNC (1U << 10) -#define BTRFS_INODE_COMPRESS (1U << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) - -#define BTRFS_INODE_FLAG_MASK \ - (BTRFS_INODE_NODATASUM | \ - BTRFS_INODE_NODATACOW | \ - BTRFS_INODE_READONLY | \ - BTRFS_INODE_NOCOMPRESS | \ - BTRFS_INODE_PREALLOC | \ - BTRFS_INODE_SYNC | \ - BTRFS_INODE_IMMUTABLE | \ - BTRFS_INODE_APPEND | \ - BTRFS_INODE_NODUMP | \ - BTRFS_INODE_NOATIME | \ - BTRFS_INODE_DIRSYNC | \ - BTRFS_INODE_COMPRESS | \ - BTRFS_INODE_ROOT_ITEM_INIT) - -#define BTRFS_INODE_RO_VERITY (1U << 0) - -#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) - struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; -- cgit From 51129b33d3911c7a36e643d47cf7c00fba3089fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:29 -0400 Subject: btrfs: move btrfs_get_block_group helper out of disk-io.h This inline helper calls btrfs_fs_compat_ro(), which is defined in another header. To avoid weird header dependency problems move this helper into disk-io.c with the rest of the global root helpers. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +++++++ fs/btrfs/disk-io.h | 8 +------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3460eaa9e4aa..5705335c80c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1167,6 +1167,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; + return btrfs_extent_root(fs_info, 0); +} + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9fa923e005a3..da040ec327ae 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -75,6 +75,7 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); @@ -106,13 +107,6 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) return NULL; } -static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, -- cgit From ad4b63caf56d0dc08e03966172d685ff9ebad996 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:30 -0400 Subject: btrfs: move maximum limits to btrfs_tree.h We have maximum link and name length limits, move these to btrfs_tree.h as they're on disk limitations. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7a80f9380146..6a37df0fa0eb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -68,19 +68,6 @@ struct reloc_control; #define BTRFS_OLDEST_GENERATION 0ULL -/* - * we can actually store much bigger names, but lets not confuse the rest - * of linux - */ -#define BTRFS_NAME_LEN 255 - -/* - * Theoretical limit is larger, but we keep this down to a sane - * value. That should limit greatly the possibility of collisions on - * inode ref items. - */ -#define BTRFS_LINK_MAX 65535U - #define BTRFS_EMPTY_DIR_SIZE 0 #define BTRFS_DIRTY_METADATA_THRESH SZ_32M -- cgit From ed4c491a3db2e85b3eb04ac61f0723fc2ec5f50a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:31 -0400 Subject: btrfs: move BTRFS_MAX_MIRRORS into scrub.c This is only used locally in scrub.c, move it out of ctree.h into scrub.c. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 11 ----------- fs/btrfs/scrub.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6a37df0fa0eb..ea83c178a60c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -55,17 +55,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -/* - * Maximum number of mirrors that can be available for all profiles counting - * the target device of dev-replace as one. During an active device replace - * procedure, the target device of the copy operation is a mirror for the - * filesystem data as well that can be used to read data in order to repair - * read errors on other disks. - * - * Current value is derived from RAID1C4 with 4 copies. - */ -#define BTRFS_MAX_MIRRORS (4 + 1) - #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 196c4c6ed1ed..cdda4b2f20f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -56,6 +56,17 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1C4 with 4 copies. + */ +#define BTRFS_MAX_MIRRORS (4 + 1) + struct scrub_recover { refcount_t refs; struct btrfs_io_context *bioc; -- cgit From 390d89ccf672ec79b84037d4af834b2275ff7cb9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:32 -0400 Subject: btrfs: move discard stat defs to free-space-cache.h These definitions are used for discard statistics, move them out of ctree.h and put them in free-space-cache.h. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 --------- fs/btrfs/free-space-cache.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea83c178a60c..03d665b21244 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -63,15 +63,6 @@ struct reloc_control; #define BTRFS_MAX_EXTENT_SIZE SZ_128M -/* - * Deltas are an effective way to populate global statistics. Give macro names - * to make it clear what we're doing. An example is discard_extents in - * btrfs_free_space_ctl. - */ -#define BTRFS_STAT_NR_ENTRIES 2 -#define BTRFS_STAT_CURR 0 -#define BTRFS_STAT_PREV 1 - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 6d419ba53e95..eaf30f6444dd 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -43,6 +43,15 @@ static inline bool btrfs_free_space_trimming_bitmap( return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); } +/* + * Deltas are an effective way to populate global statistics. Give macro names + * to make it clear what we're doing. An example is discard_extents in + * btrfs_free_space_ctl. 
+ */ +#define BTRFS_STAT_NR_ENTRIES 2 +#define BTRFS_STAT_CURR 0 +#define BTRFS_STAT_PREV 1 + struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; -- cgit From 06d61cb101f346338b50be5cd4f053a169b59d5e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:34 -0400 Subject: btrfs: move btrfs_should_fragment_free_space into block-group.c This function uses functions that are not defined in block-group.h, move it into block-group.c in order to keep the header clean. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 12 ++++++++++++ fs/btrfs/block-group.h | 11 +---------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index deebc8ddbd93..3f8b1cbbbc43 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -18,6 +18,18 @@ #include "raid56.h" #include "zoned.h" +#ifdef CONFIG_BTRFS_DEBUG +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + /* * Return target flags in extended format or 0 if restripe for this chunk_type * is not in progress diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8fb14b99a1d1..39e79bed10ae 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -251,16 +251,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -static inline int btrfs_should_fragment_free_space( - struct btrfs_block_group *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - - return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && - block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(fs_info, FRAGMENT_DATA) && - block_group->flags & BTRFS_BLOCK_GROUP_DATA); -} +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( -- cgit From f1e5c6185ca166cde0c7c2eeeab5d233ef315140 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:35 -0400 Subject: btrfs: move flush related definitions to space-info.h This code is used in space-info.c, move the definitions to space-info.h. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 59 ------------------------------------------------ fs/btrfs/delayed-inode.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/space-info.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 63 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03d665b21244..39974f4b441f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2639,65 +2639,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -/* - * Different levels for to flush space when doing space reservations. - * - * The higher the level, the more methods we try to reclaim space. 
- */ -enum btrfs_reserve_flush_enum { - /* If we are in the transaction, we can't flush anything.*/ - BTRFS_RESERVE_NO_FLUSH, - - /* - * Flush space by: - * - Running delayed inode items - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_LIMIT, - - /* - * Flush space by: - * - Running delayed inode items - * - Running delayed refs - * - Running delalloc and waiting for ordered extents - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_EVICT, - - /* - * Flush space by above mentioned methods and by: - * - Running delayed iputs - * - Committing transaction - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_DATA, - BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, - BTRFS_RESERVE_FLUSH_ALL, - - /* - * Pretty much the same as FLUSH_ALL, but can also steal space from - * global rsv. - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_ALL_STEAL, -}; - -enum btrfs_flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELAYED_REFS_NR = 3, - FLUSH_DELAYED_REFS = 4, - FLUSH_DELALLOC = 5, - FLUSH_DELALLOC_WAIT = 6, - FLUSH_DELALLOC_FULL = 7, - ALLOC_CHUNK = 8, - ALLOC_CHUNK_FORCE = 9, - RUN_DELAYED_IPUTS = 10, - COMMIT_TRANS = 11, -}; - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index cac5169eaf8d..a411f04a7b97 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -14,6 +14,7 @@ #include "qgroup.h" #include "locking.h" #include "inode-item.h" +#include "space-info.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 0eeb5ea87894..366f3a788c6a 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "transaction.h" #include "print-tree.h" +#include "space-info.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const char *name, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 055a631276ce..07f62e3ba6a5 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -10,6 +10,7 @@ #include "ctree.h" #include "xattr.h" #include "compression.h" +#include "space-info.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9dcf9b39c798..216a4485d914 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -27,6 +27,7 @@ #include "subpage.h" #include "zoned.h" #include "inode-item.h" +#include "space-info.h" /* * Relocation overview diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index ce66023a9eb8..7e17bb803436 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -5,6 +5,65 @@ #include "volumes.h" +/* + * Different levels for to flush space when doing space reservations. + * + * The higher the level, the more methods we try to reclaim space. 
+ */ +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + + /* + * Flush space by: + * - Running delayed inode items + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_LIMIT, + + /* + * Flush space by: + * - Running delayed inode items + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_EVICT, + + /* + * Flush space by above mentioned methods and by: + * - Running delayed iputs + * - Committing transaction + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, + BTRFS_RESERVE_FLUSH_ALL, + + /* + * Pretty much the same as FLUSH_ALL, but can also steal space from + * global rsv. + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_ALL_STEAL, +}; + +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + FLUSH_DELALLOC_FULL = 7, + ALLOC_CHUNK = 8, + ALLOC_CHUNK_FORCE = 9, + RUN_DELAYED_IPUTS = 10, + COMMIT_TRANS = 11, +}; + struct btrfs_space_info { spinlock_t lock; -- cgit From f60acad355cf14ccccf420e6ea0ddd6de87cb210 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:36 -0400 Subject: btrfs: move btrfs_print_data_csum_error into inode.c This isn't used outside of inode.c, there's no reason to define it in btrfs_inode.h. Drop the inline and add __cold as it's for errors that are not in any hot path. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 26 -------------------------- fs/btrfs/inode.c | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 54c2ccb36b61..530a0ebfab3f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -410,30 +410,4 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes - -static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, - u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) -{ - struct btrfs_root *root = inode->root; - const u32 csum_size = root->fs_info->csum_size; - - /* Output minus objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) - btrfs_warn_rl(root->fs_info, -"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); - else - btrfs_warn_rl(root->fs_info, -"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); -} - #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2ba2d8b9cefc..8cae730bd5ba 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,6 +125,32 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); +static void 
__cold btrfs_print_data_csum_error(struct btrfs_inode *inode, + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) +{ + struct btrfs_root *root = inode->root; + const u32 csum_size = root->fs_info->csum_size; + + /* Output without objectid, which is more meaningful */ + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + btrfs_warn_rl(root->fs_info, +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } else { + btrfs_warn_rl(root->fs_info, +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } +} + /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * -- cgit From 956504a331a613814279b04f9b0d663c7c2bb9bc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:37 -0400 Subject: btrfs: move trans_handle_cachep out of ctree.h This is local to the transaction code, remove it from ctree.h and inode.c, create new helpers in the transaction to handle the init work and move the cachep locally to transaction.c. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/inode.c | 8 -------- fs/btrfs/super.c | 9 ++++++++- fs/btrfs/transaction.c | 17 +++++++++++++++++ fs/btrfs/transaction.h | 3 +++ 5 files changed, 28 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 39974f4b441f..fec3d618bcef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,7 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cae730bd5ba..54f50784aefa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,7 +107,6 @@ static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; struct kmem_cache *btrfs_free_space_bitmap_cachep; @@ -8926,7 +8925,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); @@ -8941,12 +8939,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_trans_handle_cachep) - goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5942b9384088..8fe2fdb167a7 100644 --- 
a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2739,10 +2739,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_compress; - err = extent_state_init_cachep(); + err = btrfs_transaction_init(); if (err) goto free_cachep; + err = extent_state_init_cachep(); + if (err) + goto free_transaction; + err = extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2811,6 +2815,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_transaction: + btrfs_transaction_exit(); free_cachep: btrfs_destroy_cachep(); free_compress: @@ -2822,6 +2828,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_transaction_exit(); btrfs_destroy_cachep(); btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1f1da6820fb..ae7d4aca771d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -24,6 +24,8 @@ #include "space-info.h" #include "zoned.h" +static struct kmem_cache *btrfs_trans_handle_cachep; + #define BTRFS_ROOT_TRANS_TAG 0 /* @@ -2600,3 +2602,18 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "unknown pending changes left 0x%lx, ignoring", prev); } + +int __init btrfs_transaction_init(void) +{ + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", + sizeof(struct btrfs_trans_handle), 0, + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_transaction_exit(void) +{ + kmem_cache_destroy(btrfs_trans_handle_cachep); +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 970ff316069d..1332f193b800 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -236,4 +236,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +int __init btrfs_transaction_init(void); +void __cold btrfs_transaction_exit(void); + #endif -- cgit From 226463d7b100d30def24f2be492fef0081003a30 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:38 -0400 Subject: btrfs: move btrfs_path_cachep out of ctree.h This is local to the ctree code, remove it from ctree.h and inode.c, create new init/exit functions for the cachep, and move it locally to ctree.c. 
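For reference, every cache localized by this part of the series follows the same init/exit shape; a minimal sketch of the pattern, where "foo" and struct btrfs_foo are placeholders rather than real kernel symbols:

static struct kmem_cache *foo_cachep;

int __init btrfs_foo_init(void)
{
	foo_cachep = kmem_cache_create("btrfs_foo", sizeof(struct btrfs_foo),
				       0, SLAB_MEM_SPREAD, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_foo_exit(void)
{
	/* kmem_cache_destroy() accepts NULL, so this is safe on any path */
	kmem_cache_destroy(foo_cachep);
}

init_btrfs_fs() then calls btrfs_foo_init() and gains a matching unwind label, as the super.c hunks below show.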
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 17 +++++++++++++++++ fs/btrfs/ctree.h | 3 ++- fs/btrfs/inode.c | 8 -------- fs/btrfs/super.c | 9 ++++++++- 4 files changed, 27 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dcb510f38dda..4f2d367e1207 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,8 @@ #include "tree-mod-log.h" #include "tree-checker.h" +static struct kmem_cache *btrfs_path_cachep; + static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4933,3 +4935,18 @@ int btrfs_previous_extent_item(struct btrfs_root *root, } return 1; } + +int __init btrfs_ctree_init(void) +{ + btrfs_path_cachep = kmem_cache_create("btrfs_path", + sizeof(struct btrfs_path), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_ctree_exit(void) +{ + kmem_cache_destroy(btrfs_path_cachep); +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fec3d618bcef..282571b54b6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,7 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; @@ -2662,6 +2661,8 @@ void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); /* ctree.c */ +int __init btrfs_ctree_init(void); +void __cold btrfs_ctree_exit(void); int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54f50784aefa..39c474e8a312 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,7 +107,6 @@ static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; struct kmem_cache *btrfs_free_space_bitmap_cachep; @@ -8925,7 +8924,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } @@ -8939,12 +8937,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_path_cachep) - goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, SLAB_MEM_SPREAD, NULL); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8fe2fdb167a7..9f8b77553de0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2743,10 +2743,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_cachep; - err = extent_state_init_cachep(); + err = btrfs_ctree_init(); if (err) goto free_transaction; + err = extent_state_init_cachep(); + if (err) + goto free_ctree; + err = 
extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2815,6 +2819,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_ctree: + btrfs_ctree_exit(); free_transaction: btrfs_transaction_exit(); free_cachep: @@ -2828,6 +2834,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_ctree_exit(); btrfs_transaction_exit(); btrfs_destroy_cachep(); btrfs_delayed_ref_exit(); -- cgit From eda517fd0ceee0329bb956f701c0e124fc1fe269 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:39 -0400 Subject: btrfs: move free space cachep's out of ctree.h This is local to the free-space-cache.c code, remove it from ctree.h and inode.c, create new init/exit functions for the cachep, and move it locally to free-space-cache.c. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- fs/btrfs/free-space-cache.c | 28 ++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/inode.c | 16 ---------------- fs/btrfs/super.c | 9 ++++++++- 5 files changed, 38 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 282571b54b6d..ca5c67b2c025 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,8 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_free_space_cachep; -extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4023651dd68..bd68fafcccc7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,6 +29,9 @@ #define MAX_CACHE_BYTES_PER_GIG SZ_64K #define FORCE_EXTENT_THRESHOLD SZ_1M +static struct kmem_cache *btrfs_free_space_cachep; +static struct kmem_cache *btrfs_free_space_bitmap_cachep; + struct btrfs_trim_range { u64 start; u64 bytes; @@ -4132,6 +4135,31 @@ out: return ret; } +int __init btrfs_free_space_init(void) +{ + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", + sizeof(struct btrfs_free_space), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + return -ENOMEM; + + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_bitmap_cachep) { + kmem_cache_destroy(btrfs_free_space_cachep); + return -ENOMEM; + } + + return 0; +} + +void __cold btrfs_free_space_exit(void) +{ + kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); +} + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index eaf30f6444dd..cab954a9d97b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -88,6 +88,8 @@ struct btrfs_io_ctl { int bitmaps; }; +int __init btrfs_free_space_init(void); +void __cold btrfs_free_space_exit(void); struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path); int create_free_space_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 39c474e8a312..50584b93a66f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,8 +107,6 @@ static const struct 
address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_free_space_cachep; -struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); @@ -8924,8 +8922,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_free_space_cachep); - kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) @@ -8937,18 +8933,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_cachep) - goto fail; - - btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", - PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_bitmap_cachep) - goto fail; - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_dio_private, bio), BIOSET_NEED_BVECS)) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9f8b77553de0..0a93fbd29494 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2747,10 +2747,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_transaction; - err = extent_state_init_cachep(); + err = btrfs_free_space_init(); if (err) goto free_ctree; + err = extent_state_init_cachep(); + if (err) + goto free_free_space; + err = extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2819,6 +2823,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_free_space: + btrfs_free_space_exit(); free_ctree: btrfs_ctree_exit(); free_transaction: @@ -2834,6 +2840,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_free_space_exit(); btrfs_ctree_exit(); btrfs_transaction_exit(); btrfs_destroy_cachep(); -- cgit From 890d2b1aa38b9101503194855aeee897ba6cdbc2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:40 -0400 Subject: btrfs: move btrfs_next_old_item into ctree.c This uses btrfs_header_nritems, which I will be moving out of ctree.h. In order to avoid needing to include the relevant header in ctree.h, simply move this helper function into ctree.c. 
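For context, a condensed sketch of the typical iteration shape of callers of this helper (a hypothetical caller, error handling trimmed):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	while (1) {
		/* process the item at path->nodes[0], path->slots[0] */

		/* advances the slot, hopping to the next leaf when needed */
		ret = btrfs_next_old_item(root, path, time_seq);
		if (ret)
			break;	/* > 0: no more items, < 0: error */
	}

That leaf-boundary check is the btrfs_header_nritems() call that forces the move out of ctree.h.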
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ rename parameters ] Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 ++++++++ fs/btrfs/ctree.h | 9 +-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4f2d367e1207..5f461e05cc9c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4852,6 +4852,14 @@ done: return ret; } +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) +{ + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + return btrfs_next_old_leaf(root, path, time_seq); + return 0; +} + /* * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps * searching until it gets past min_objectid or finds an item of 'type' diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ca5c67b2c025..c8ced88f2143 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2821,14 +2821,7 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, (path)->slots[0]++ \ ) -static inline int btrfs_next_old_item(struct btrfs_root *root, - struct btrfs_path *p, u64 time_seq) -{ - ++p->slots[0]; - if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_old_leaf(root, p, time_seq); - return 0; -} +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); /* * Search the tree again to find a leaf with greater keys. -- cgit From 7a66eda351bacf2f88801204cfa5319dda3ec75f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:41 -0400 Subject: btrfs: move the btrfs_verity_descriptor_item defs up in ctree.h These are wrapped in CONFIG_FS_VERITY, but we can have the definitions without verity enabled. Move these definitions up with the other accessor helpers. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c8ced88f2143..c0c0dc8fa99f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2523,6 +2523,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + /* helper function to cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(BTRFS_LEAF_DATA_OFFSET + \ @@ -3720,15 +3730,6 @@ extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); -BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, - encryption, 8); -BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, - size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, - struct btrfs_verity_descriptor_item, encryption, 8); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, - struct btrfs_verity_descriptor_item, size, 64); - #else static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) -- cgit From 765c3fe99bcda005d66c159f9a46fc2e0c77c8ce Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Sep 2022 09:35:01 -0400 Subject: btrfs: introduce BTRFS_RESERVE_FLUSH_EMERGENCY Inside of FB, as well as some user reports, we've had a consistent problem of occasional ENOSPC transaction aborts. Inside FB we were seeing ~100-200 ENOSPC aborts per day in the fleet, which is a really low occurrence rate given the size of our fleet, but it's not nothing. There are two causes of this particular problem. First is delayed allocation. The reservation system for delalloc assumes that contiguous dirty ranges will result in 1 file extent item. However if there is memory pressure that results in fragmented writeout, or there is fragmentation in the block groups, this won't necessarily be true. Consider the case where we do a single 256MiB write to a file and then close it. We will have 1 reservation for the inode update, the reservations for the checksum updates, and 1 reservation for the file extent item. At some point later we decide to write this entire range out, but we're so fragmented that we break this into 100 different file extents. Since we've already closed the file and are no longer writing to it there's nothing to trigger a refill of the delalloc block rsv to satisfy the 99 new file extent reservations we need. At this point we exhaust our delalloc reservation, and we begin to steal from the global reserve. If you have enough of these cases going in parallel you can easily exhaust the global reserve, get an ENOSPC at btrfs_alloc_tree_block() time, and then abort the transaction. The other case is the delayed refs reserve. The delayed refs reserve updates its size based on outstanding delayed refs and dirty block groups. However we only refill this block reserve when returning excess reservations and when we call btrfs_start_transaction(root, X). We will reserve 2*X credits at transaction start time, and fill in X into the delayed refs reserve to make sure it stays topped off. Generally this works well, but clearly has downsides. If we do a particularly delayed ref heavy operation we may never catch up in our reservations. Additionally running delayed refs generates more delayed refs, and at that point we may be committing the transaction and have no way to trigger a refill of our delayed refs rsv. Then a similar thing occurs with the delalloc reserve. Generally speaking we well over-reserve in all of our block rsvs. If we reserve 1 credit we're usually reserving around 264k of space, but we'll often not use any of that reservation, or use a few blocks of that reservation. 
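To put rough numbers on the fragmentation scenario above, here is a minimal userspace sketch of the accounting, using the 1-reservation/100-extents example and the ~264k per-credit figure from this description (all values illustrative):

#include <stdio.h>

int main(void)
{
	const unsigned long bytes_per_credit = 264 * 1024; /* ~264k per credit */
	const int reserved_items = 1;   /* delalloc reserved 1 file extent item */
	const int written_items = 100;  /* fragmented writeout created 100 */
	const int unreserved = written_items - reserved_items;

	printf("file extent items with no reservation: %d\n", unreserved);
	printf("metadata pulled from other reserves: ~%lu KiB\n",
	       unreserved * bytes_per_credit / 1024);
	return 0;
}

With many such files in flight the shortfall lands on the global reserve, which is the abort scenario described here.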
We can be reasonably sure that as long as you were able to reserve space up front for your operation you'll be able to find space on disk for that reservation. So introduce a new flushing state, BTRFS_RESERVE_FLUSH_EMERGENCY. This gets used in the case that we've exhausted our reserve and the global reserve. It simply forces a reservation if we have enough actual space on disk to make the reservation, which is almost always the case. This keeps us from hitting ENOSPC aborts in these odd occurrences where we've not kept up with the delayed work. Fixing this in a complete way is going to be relatively complicated and time consuming. This patch is what I discussed with Filipe earlier this year, and what I put into our kernels inside FB. With this patch we're down to 1-2 ENOSPC aborts per week, which is a significant reduction. This is a decent stop gap until we can work out a more wholistic solution to these two corner cases. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 12 ++++++++++++ fs/btrfs/space-info.c | 29 +++++++++++++++++++++++++++-- fs/btrfs/space-info.h | 18 ++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ec96285357e0..89e3e7d1bff6 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -552,5 +552,17 @@ try_reserve: if (!ret) return global_rsv; } + + /* + * All hope is lost, but of course our reservations are overly + * pessimistic, so instead of possibly having an ENOSPC abort here, try + * one last time to force a reservation if there's enough actual space + * on disk to make the reservation. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + BTRFS_RESERVE_FLUSH_EMERGENCY); + if (!ret) + return block_rsv; + return ERR_PTR(ret); } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f171bf875633..af2e133aaa5c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1583,6 +1583,16 @@ static inline bool can_steal(enum btrfs_reserve_flush_enum flush) flush == BTRFS_RESERVE_FLUSH_EVICT); } +/* + * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to + * fail as quickly as possible. + */ +static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) +{ + return (flush != BTRFS_RESERVE_NO_FLUSH && + flush != BTRFS_RESERVE_FLUSH_EMERGENCY); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1644,6 +1654,21 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ret = 0; } + /* + * Things are dire, we need to make a reservation so we don't abort. We + * will let this reservation go through as long as we have actual space + * left to allocate for the block. + */ + if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { + used = btrfs_space_info_used(space_info, false); + if (used + orig_bytes <= + writable_total_bytes(fs_info, space_info)) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; + } + } + /* * If we couldn't make a reservation then setup our reservation ticket * and kick the async worker if it's not already running. @@ -1651,7 +1676,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * If we are a priority flusher then we just need to add our ticket to * the list and we will do our own flushing further down. 
*/ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + if (ret && can_ticket(flush)) { ticket.bytes = orig_bytes; ticket.error = 0; space_info->reclaim_size += ticket.bytes; @@ -1701,7 +1726,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + if (!ret || !can_ticket(flush)) return ret; return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 7e17bb803436..f28bd2c051d4 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -48,6 +48,24 @@ enum btrfs_reserve_flush_enum { * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_ALL_STEAL, + + /* + * This is for btrfs_use_block_rsv only. We have exhausted our block + * rsv and our global block rsv. This can happen for things like + * delalloc where we are overwriting a lot of extents with a single + * extent and didn't reserve enough space. Alternatively it can happen + * with delalloc where we reserve 1 extents worth for a large extent but + * fragmentation leads to multiple extents being created. This will + * give us the reservation in the case of + * + * if (num_bytes < (space_info->total_bytes - + * btrfs_space_info_used(space_info, false)) + * + * Which ignores bytes_may_use. This is potentially dangerous, but our + * reservation system is generally pessimistic so is able to absorb this + * style of mistake. + */ + BTRFS_RESERVE_FLUSH_EMERGENCY, }; enum btrfs_flush_state { -- cgit From ff2b64a22a2efcc087520e94ad06b005268a5f9d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:08 +0800 Subject: btrfs: raid56: cleanup for function __free_raid_bio() The cleanup involves two things: - Remove the "__" prefix There is no naming conflict. - Remove the forward declaration There is no special function call involved.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 61 +++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 82c8e991300e..371b2a182544 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -69,7 +69,6 @@ static void rmw_work(struct work_struct *work); static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); -static void __free_raid_bio(struct btrfs_raid_bio *rbio); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); @@ -77,6 +76,28 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); static void scrub_parity_work(struct work_struct *work); +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + if (!refcount_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + + btrfs_put_bioc(rbio->bioc); + kfree(rbio); +} + static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { INIT_WORK(&rbio->work, work_func); @@ -336,7 +357,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) spin_unlock(&h->lock); if (freeit) - __free_raid_bio(rbio); + free_raid_bio(rbio); } /* @@ -684,7 +705,7 @@ out: if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) - __free_raid_bio(freeit); + free_raid_bio(freeit); return ret; } @@ -769,28 +790,6 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void __free_raid_bio(struct btrfs_raid_bio *rbio) -{ - int i; - - if (!refcount_dec_and_test(&rbio->refs)) - return; - - WARN_ON(!list_empty(&rbio->stripe_cache)); - WARN_ON(!list_empty(&rbio->hash_list)); - WARN_ON(!bio_list_empty(&rbio->bio_list)); - - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) { - __free_page(rbio->stripe_pages[i]); - rbio->stripe_pages[i] = NULL; - } - } - - btrfs_put_bioc(rbio->bioc); - kfree(rbio); -} - static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { struct bio *next; @@ -830,7 +829,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) */ unlock_stripe(rbio); extra = bio_list_get(&rbio->bio_list); - __free_raid_bio(rbio); + free_raid_bio(rbio); rbio_endio_bio_list(cur, err); if (extra) @@ -1731,7 +1730,7 @@ static void run_plug(struct btrfs_plug_cb *plug) if (last) { if (rbio_can_merge(last, cur)) { merge_rbio(last, cur); - __free_raid_bio(cur); + free_raid_bio(cur); continue; } @@ -1822,7 +1821,7 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) { - __free_raid_bio(rbio); + free_raid_bio(rbio); goto fail; } return; @@ -1839,7 +1838,7 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) } else { ret = __raid56_parity_write(rbio); if (ret) { - __free_raid_bio(rbio); + free_raid_bio(rbio); goto fail; } } @@ -2214,7 +2213,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has 
logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - __free_raid_bio(rbio); + free_raid_bio(rbio); bio->bi_status = BLK_STS_IOERR; goto out_end_bio; } @@ -2747,7 +2746,7 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) btrfs_warn_rl(fs_info, "can not determine the failed stripe number for full stripe %llu", bioc->raid_map[0]); - __free_raid_bio(rbio); + free_raid_bio(rbio); return NULL; } -- cgit From 797d74b749850a5b81c47560caec26e00e7e3768 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:09 +0800 Subject: btrfs: raid56: allocate memory separately for rbio pointers Currently inside alloc_rbio(), we allocate one larger chunk of memory to hold the following members: - struct btrfs_raid_bio itself - stripe_pages array - bio_sectors array - stripe_sectors array - finish_pointers array Then we update the rbio pointers to point into the extra space after the rbio structure itself. This required a complex CONSUME_ALLOC() macro to carve up that allocation. This is too hacky, and it makes later pointer expansion harder. This patch will change it to use a regular kcalloc() for each pointer inside btrfs_raid_bio, making later expansion much easier. And introduce a helper free_raid_bio_pointers() to free up all the pointer members in btrfs_raid_bio, which will be used in both free_raid_bio() and the error path of alloc_rbio(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 371b2a182544..4ec211a58f15 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -76,6 +76,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); static void scrub_parity_work(struct work_struct *work); +static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) +{ + kfree(rbio->stripe_pages); + kfree(rbio->bio_sectors); + kfree(rbio->stripe_sectors); + kfree(rbio->finish_pointers); +} + static void free_raid_bio(struct btrfs_raid_bio *rbio) { int i; @@ -95,6 +103,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) } btrfs_put_bioc(rbio->bioc); + free_raid_bio_pointers(rbio); kfree(rbio); } @@ -918,7 +927,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - void *p; /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); @@ -928,14 +936,23 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); - rbio = kzalloc(sizeof(*rbio) + - sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_sectors) * num_sectors + - sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes, - GFP_NOFS); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); + rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), + GFP_NOFS); + rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + + if (!rbio->stripe_pages || !rbio->bio_sectors ||
!rbio->stripe_sectors || + !rbio->finish_pointers) { + free_raid_bio_pointers(rbio); + kfree(rbio); + return ERR_PTR(-ENOMEM); + } bio_list_init(&rbio->bio_list); INIT_LIST_HEAD(&rbio->plug_list); @@ -955,21 +972,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); - /* - * The stripe_pages, bio_sectors, etc arrays point to the extra memory - * we allocated past the end of the rbio. - */ - p = rbio + 1; -#define CONSUME_ALLOC(ptr, count) do { \ - ptr = p; \ - p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ - } while (0) - CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_sectors, num_sectors); - CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); - CONSUME_ALLOC(rbio->finish_pointers, real_stripes); -#undef CONSUME_ALLOC - ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); -- cgit From 88074c8b1376ac315ef4a294db82d861df074ef2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:10 +0800 Subject: btrfs: raid56: make it more explicit that cache rbio should have all its data sectors uptodate For Btrfs RAID56, we have a caching system for btrfs raid bios (rbio). We call cache_rbio_pages() to mark a qualified rbio ready for caching. This happens at: - finish_rmw() At this point we have already read all necessary sectors, which, along with the rbio sectors, cover all data stripes. - __raid_recover_end_io() At this point we have rebuilt the rbio, thus all data sectors involved (either from the stripe or bio list) are now uptodate. Thus by the time cache_rbio_pages() is called, all data sectors should be uptodate. This patch will make it explicit that all data sectors are uptodate at cache_rbio_pages() time, mostly to prepare for the upcoming verification at RMW time. This patch will add: - Extra ASSERT()s in cache_rbio_pages() This makes sure all data sectors that are not covered by a bio are already uptodate. - Extra ASSERT()s in steal_rbio() Since only a cached rbio can be stolen, every data sector should already be uptodate in the source rbio. - Update __raid_recover_end_io() to update recovered sector->uptodate Previously __raid_recover_end_io() would only mark failed sectors uptodate if it was doing an RMW. But this can trigger the new ASSERT()s, as in the recovery case a recovered failed sector would not be marked uptodate and would trigger an ASSERT() in a later cache_rbio_pages() call. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 70 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4ec211a58f15..c009c0a2081e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -176,8 +176,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) + if (!rbio->bio_sectors[i].page) { + /* + * Even if the sector is not covered by bio, if it is + * a data sector it should still be uptodate as it is + * read from disk.
+ */ + if (i < rbio->nr_data * rbio->stripe_nsectors) + ASSERT(rbio->stripe_sectors[i].uptodate); continue; + } ASSERT(rbio->stripe_sectors[i].page); memcpy_page(rbio->stripe_sectors[i].page, @@ -264,6 +272,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, dest->stripe_sectors[i].uptodate = true; } +static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) +{ + const int sector_nr = (page_nr << PAGE_SHIFT) >> + rbio->bioc->fs_info->sectorsize_bits; + + /* + * We have ensured PAGE_SIZE is aligned with sectorsize, thus + * we won't have a page which is half data half parity. + * + * Thus if the first sector of the page belongs to data stripes, then + * the full page belongs to data stripes. + */ + return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -274,16 +297,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; - struct page *s; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { - s = src->stripe_pages[i]; - if (!s || !full_page_sectors_uptodate(src, i)) + struct page *p = src->stripe_pages[i]; + + /* + * We don't need to steal P/Q pages as they will always be + * regenerated for RMW or full write anyway. + */ + if (!is_data_stripe_page(src, i)) continue; + /* + * If @src already has RBIO_CACHE_READY_BIT, it should have + * all data stripe pages present and uptodate. + */ + ASSERT(p); + ASSERT(full_page_sectors_uptodate(src, i)); steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); @@ -2003,22 +2036,21 @@ pstripe: /* xor in the rest */ run_xor(pointers, rbio->nr_data - 1, sectorsize); } - /* if we're doing this rebuild as part of an rmw, go through - * and set all of our private rbio pages in the - * failed stripes as uptodate. This way finish_rmw will - * know they can be trusted. If this was a read reconstruction, - * other endio functions will fiddle the uptodate bits + + /* + * No matter if this is a RMW or recovery, we should have all + * failed sectors repaired, thus they are now uptodate. + * Especially if we determine to cache the rbio, we need to + * have at least all data sectors uptodate. */ - if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_nsectors; i++) { - if (faila != -1) { - sector = rbio_stripe_sector(rbio, faila, i); - sector->uptodate = 1; - } - if (failb != -1) { - sector = rbio_stripe_sector(rbio, failb, i); - sector->uptodate = 1; - } + for (i = 0; i < rbio->stripe_nsectors; i++) { + if (faila != -1) { + sector = rbio_stripe_sector(rbio, faila, i); + sector->uptodate = 1; + } + if (failb != -1) { + sector = rbio_stripe_sector(rbio, failb, i); + sector->uptodate = 1; } } for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) -- cgit From d47704bd1c78c85831561bcf701b90dd66f811b2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:54 +0100 Subject: btrfs: get the next extent map during fiemap/lseek more efficiently At find_delalloc_subrange(), when we need to get the next extent map, we do a full search on the extent map tree (a red black tree). This is fine but it's a lot more efficient to simply use rb_next(), which typically requires iterating over less nodes of the tree and never needs to compare the ranges of nodes with the one we are looking for. 
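Concretely, with the kernel rbtree API the successor is reached by a short pointer walk instead of a new top-down search with key comparisons at every level (a generic sketch, not the helper added below):

	struct rb_node *node = rb_next(&em->rb_node);
	struct extent_map *next_em = NULL;

	/* in-order successor; no range comparisons involved */
	if (node)
		next_em = rb_entry(node, struct extent_map, rb_node);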
So add a public helper to extent_map.{h,c} to get the extent map that immediately follows another extent map, using rb_next(), and use that helper at find_delalloc_subrange(). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/extent_map.h | 2 ++ fs/btrfs/file.c | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 59 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6092a4eedc92..715979807ae1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -523,7 +523,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, setup_extent_mapping(tree, new, modified); } -static struct extent_map *next_extent_map(struct extent_map *em) +static struct extent_map *next_extent_map(const struct extent_map *em) { struct rb_node *next; @@ -533,6 +533,35 @@ static struct extent_map *next_extent_map(struct extent_map *em) return container_of(next, struct extent_map, rb_node); } +/* + * Get the extent map that immediately follows another one. + * + * @tree: The extent map tree that the extent map belong to. + * Holding read or write access on the tree's lock is required. + * @em: An extent map from the given tree. The caller must ensure that + * between getting @em and between calling this function, the + * extent map @em is not removed from the tree - for example, by + * holding the tree's lock for the duration of those 2 operations. + * + * Returns the extent map that immediately follows @em, or NULL if @em is the + * last extent map in the tree. + */ +struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, + const struct extent_map *em) +{ + struct extent_map *next; + + /* The lock must be acquired either in read mode or write mode. */ + lockdep_assert_held(&tree->lock); + ASSERT(extent_map_in_tree(em)); + + next = next_extent_map(em); + if (next) + refcount_inc(&next->refs); + + return next; +} + static struct extent_map *prev_extent_map(struct extent_map *em) { struct rb_node *prev; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ad311864272a..68d3f2c9ea1d 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -87,6 +87,8 @@ static inline u64 extent_map_block_end(struct extent_map *em) void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, + const struct extent_map *em); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 493cae66e5e6..6685cb0e928c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3569,40 +3569,50 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); - read_unlock(&em_tree->lock); + if (!em) { + read_unlock(&em_tree->lock); + return (delalloc_len > 0); + } /* extent_map_end() returns a non-inclusive end offset. */ - em_end = em ? extent_map_end(em) : 0; + em_end = extent_map_end(em); /* * If we have a hole/prealloc extent map, check the next one if this one * ends before our range's end. 
*/ - if (em && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { + if ((em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { struct extent_map *next_em; - read_lock(&em_tree->lock); - next_em = lookup_extent_mapping(em_tree, em_end, len - em_end); - read_unlock(&em_tree->lock); - + next_em = btrfs_next_extent_map(em_tree, em); free_extent_map(em); - em_end = next_em ? extent_map_end(next_em) : 0; + + /* + * There's no next extent map or the next one starts beyond our + * range, return the range found in the io tree (if any). + */ + if (!next_em || next_em->start > end) { + read_unlock(&em_tree->lock); + free_extent_map(next_em); + return (delalloc_len > 0); + } + + em_end = extent_map_end(next_em); em = next_em; } - if (em && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - free_extent_map(em); - em = NULL; - } + read_unlock(&em_tree->lock); /* - * No extent map or one for a hole or prealloc extent. Use the delalloc - * range we found in the io tree if we have one. + * We have a hole or prealloc extent that ends at or beyond our range's + * end, return the range found in the io tree (if any). */ - if (!em) + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + free_extent_map(em); return (delalloc_len > 0); + } /* * We don't have any range as EXTENT_DELALLOC in the io tree, so the -- cgit From 013f9c70d293d7a47aaaeaee248ca60f745fa450 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:55 +0100 Subject: btrfs: skip unnecessary extent map searches during fiemap and lseek If we have no outstanding extents it means we don't have any extent maps corresponding to delalloc that is flushing, as when an ordered extent is created we increment the number of outstanding extents to 1 and when we remove the ordered extent we decrement them by 1. So skip extent map tree searches if the number of outstanding ordered extents is 0, saving time as the tree is not empty if we have previously made some reads or flushed delalloc, as in those cases it can have a very large number of extent maps for files with many extents. This helps save time when processing a file range corresponding to a hole or prealloc (unwritten) extent. The next patch in the series has a performance test in its changelog and its subject is: "btrfs: skip unnecessary delalloc search during fiemap and lseek" Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6685cb0e928c..1cf97280760e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3553,6 +3553,18 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (delalloc_len > 0) *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + spin_lock(&inode->lock); + if (inode->outstanding_extents == 0) { + /* + * No outstanding extents means we don't have any delalloc that + * is flushing, so return the unflushed range found in the io + * tree (if any). + */ + spin_unlock(&inode->lock); + return (delalloc_len > 0); + } + spin_unlock(&inode->lock); + /* * Now also check if there's any extent map in the range that does not * map to a hole or prealloc extent. 
We do this because: -- cgit From a2853ffc2eb9bfb6e6d48486df7f3969a58ae3b4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:56 +0100 Subject: btrfs: skip unnecessary delalloc search during fiemap and lseek During fiemap and lseek (hole and data seeking), there's no point in iterating the inode's io tree to count delalloc bits if the inode's delalloc bytes counter has a value of zero, as that counter is updated whenever we set a range for delalloc or clear a range from delalloc. So skip the counting and io tree iteration if the inode's delalloc bytes counter has a value of zero. This helps save time when processing a file range corresponding to a hole or prealloc (unwritten) extent. This patch is part of a series comprised of the following patches: btrfs: get the next extent map during fiemap/lseek more efficiently btrfs: skip unnecessary extent map searches during fiemap and lseek btrfs: skip unnecessary delalloc search during fiemap and lseek The following test was performed on a release kernel (Debian's default kernel config) before and after applying those 3 patches. # Wrapper to call fiemap in extent count only mode. # (struct fiemap::fm_extent_count set to 0) $ cat fiemap.c #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <fcntl.h> #include <unistd.h> #include <sys/ioctl.h> #include <linux/fs.h> #include <linux/fiemap.h> int main(int argc, char **argv) { struct fiemap fiemap = { 0 }; int fd; if (argc != 2) { printf("usage: %s <file>\n", argv[0]); return 1; } fd = open(argv[1], O_RDONLY); if (fd < 0) { fprintf(stderr, "error opening file: %s\n", strerror(errno)); return 1; } /* fiemap.fm_extent_count set to 0, to count extents only. */ fiemap.fm_length = FIEMAP_MAX_OFFSET; if (ioctl(fd, FS_IOC_FIEMAP, &fiemap) < 0) { fprintf(stderr, "fiemap error: %s\n", strerror(errno)); return 1; } close(fd); printf("fm_mapped_extents = %d\n", fiemap.fm_mapped_extents); return 0; } $ gcc -o fiemap fiemap.c And the wrapper shell script that creates a file with many holes and runs fiemap against it: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT FILE_SIZE=$((1 * 1024 * 1024 * 1024)) echo -n > $MNT/foobar for ((off = 0; off < $FILE_SIZE; off += 8192)); do xfs_io -c "pwrite -S 0xab $off 4K" $MNT/foobar > /dev/null done # flush all delalloc sync start=$(date +%s%N) ./fiemap $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds" umount $MNT Result before applying patchset: fm_mapped_extents = 131072 fiemap took 63 milliseconds Result after applying patchset: fm_mapped_extents = 131072 fiemap took 39 milliseconds (-38.1%) Running the same test for a 512M file instead of a 1G file gave the following results. Result before applying patchset: fm_mapped_extents = 65536 fiemap took 29 milliseconds Result after applying patchset: fm_mapped_extents = 65536 fiemap took 20 milliseconds (-31.0%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1cf97280760e..19b41b5fe6c0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3537,15 +3537,27 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end struct extent_map *em; u64 em_end; u64 delalloc_len; + unsigned int outstanding_extents; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet.
*/ - *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); + spin_lock(&inode->lock); + outstanding_extents = inode->outstanding_extents; + + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1); + } else { + spin_unlock(&inode->lock); + delalloc_len = 0; + } + /* * If delalloc was found then *delalloc_start_ret has a sector size * aligned value (rounded down). @@ -3553,17 +3565,12 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (delalloc_len > 0) *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; - spin_lock(&inode->lock); - if (inode->outstanding_extents == 0) { - /* - * No outstanding extents means we don't have any delalloc that - * is flushing, so return the unflushed range found in the io - * tree (if any). - */ - spin_unlock(&inode->lock); + /* + * No outstanding extents means we don't have any delalloc that is + * flushing, so return the unflushed range found in the io tree (if any). + */ + if (outstanding_extents == 0) return (delalloc_len > 0); - } - spin_unlock(&inode->lock); /* * Now also check if there's any extent map in the range that does not -- cgit From b98c6cd59e90fe659cf966859bc4e1c03aea347b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:57 +0100 Subject: btrfs: drop pointless memset when cloning extent buffer At btrfs_clone_extent_buffer(), before allocating the pages array for the new extent buffer we are calling memset() to zero out the pages array of the extent buffer. This is pointless however, because the extent buffer already has every element in its pages array pointing to NULL, as it was allocated with kmem_cache_zalloc(). The memset() was introduced with commit dd137dd1f2d719 ("btrfs: factor out allocating an array of pages"), but even before that commit we already depended on the pages array being initialized to NULL for the error paths that need to call btrfs_release_extent_buffer(). So remove the memset(), it's useless and slightly increases the object text size. Before this change: $ size fs/btrfs/extent_io.o text data bss dec hex filename 70580 5469 40 76089 12939 fs/btrfs/extent_io.o After this change: $ size fs/btrfs/extent_io.o text data bss dec hex filename 70564 5469 40 76073 12929 fs/btrfs/extent_io.o Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7891d375eb43..56a9f591f479 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4299,7 +4299,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - memset(new->pages, 0, sizeof(*new->pages) * num_pages); ret = btrfs_alloc_page_array(num_pages, new->pages); if (ret) { btrfs_release_extent_buffer(new); -- cgit From 206c1d32f381ee91ba849a7dcb28728e8c3721b6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:58 +0100 Subject: btrfs: drop redundant bflags initialization when allocating extent buffer When allocating an extent buffer, at __alloc_extent_buffer(), there's no point in explicitly assigning zero to the bflags field of the new extent buffer because we allocated it with kmem_cache_zalloc(). 
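In other words (one-line sketch): eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS) hands back memory that is already all zeroes, bflags included.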
So just remove the redundant initialization; it saves one mov instruction in the generated assembly code for x86_64 ("movq $0x0,0x10(%rax)"). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 56a9f591f479..3192f094fe6c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4266,7 +4266,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->start = start; eb->len = len; eb->fs_info = fs_info; - eb->bflags = 0; init_rwsem(&eb->lock); btrfs_leak_debug_add_eb(eb); -- cgit From c902421927ff602954d2ecc7bd6e71b255d29387 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:59 +0100 Subject: btrfs: remove checks for a root with id 0 during backref walking When doing backref walking to determine if an extent is shared, we are testing whether the root_objectid of the given share_check struct is 0, but that is an impossible case, since btrfs_is_data_extent_shared() always initializes the root_objectid field with the id of the given root, and no root can have an objectid of 0. So remove those checks. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 18374a6d05bd..abac2e942e8b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -719,8 +719,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; } - if (sc && sc->root_objectid && - ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root_objectid) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; @@ -1348,8 +1347,7 @@ again: * and would retain their original ref->count < 0. */ if (roots && ref->count && ref->root_id && ref->parent == 0) { - if (sc && sc->root_objectid && - ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root_objectid) { ret = BACKREF_FOUND_SHARED; goto out; } -- cgit From a0a5472ad802d99d3fb4b361cc3fb5ea24914ee0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:00 +0100 Subject: btrfs: remove checks for a 0 inode number during backref walking When doing backref walking to determine if an extent is shared, we are testing if the inode number, stored in the 'inum' field of struct share_check, is 0. However that can never be the case, since all instances of the structure are created at btrfs_is_data_extent_shared(), which always initializes it with the inode number from a fs tree (and the number for any inode from any tree can never be 0). So remove the checks.
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index abac2e942e8b..9c7b3e1e4762 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1051,7 +1051,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum && + if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; @@ -1152,7 +1152,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum && + if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; -- cgit From ceb707da9ad92ad3a5251dc13844034ded06cb3d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:01 +0100 Subject: btrfs: directly pass the inode to btrfs_is_data_extent_shared() Currently we pass a root and an inode number as arguments for btrfs_is_data_extent_shared() and the inode number is always from an inode that belongs to that root (it wouldn't make sense otherwise). In every context that we call btrfs_is_data_extent_shared() (fiemap only), we have an inode available, so directly pass the inode to the function instead of a root and inode number. This reduces the number of parameters and it makes the function's signature conform to most other functions we have. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 8 ++++---- fs/btrfs/backref.h | 2 +- fs/btrfs/extent_io.c | 16 +++++++--------- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9c7b3e1e4762..efdeb69e8ed6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1658,8 +1658,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, /* * Check if a data extent is shared or not. * - * @root: The root the inode belongs to. - * @inum: Number of the inode whose extent we are checking. + * @inode: The inode whose extent we are checking. * @bytenr: Logical bytenr of the extent we are checking. * @extent_gen: Generation of the extent (file extent item) or 0 if it is * not known. @@ -1678,11 +1677,12 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. 
*/ -int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, struct btrfs_backref_shared_cache *cache) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct ulist_iterator uiter; @@ -1691,7 +1691,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, int ret = 0; struct share_check shared = { .root_objectid = root->root_key.objectid, - .inum = inum, + .inum = btrfs_ino(inode), .share_count = 0, .have_delayed_delete_refs = false, }; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 8e69584d538d..c846fa2bdf93 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -77,7 +77,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, struct btrfs_backref_shared_cache *cache); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3192f094fe6c..83be7e85ba91 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3712,7 +3712,6 @@ static int fiemap_process_hole(struct btrfs_inode *inode, u64 start, u64 end) { const u64 i_size = i_size_read(&inode->vfs_inode); - const u64 ino = btrfs_ino(inode); u64 cur_offset = start; u64 last_delalloc_end = 0; u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; @@ -3752,8 +3751,8 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode->root, - ino, disk_bytenr, + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, extent_gen, roots, tmp_ulist, backref_cache); @@ -3802,8 +3801,8 @@ static int fiemap_process_hole(struct btrfs_inode *inode, } if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode->root, - ino, disk_bytenr, + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, extent_gen, roots, tmp_ulist, backref_cache); @@ -3904,7 +3903,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct btrfs_backref_shared_cache *backref_cache; struct ulist *roots; @@ -3925,8 +3923,8 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - lockstart = round_down(start, root->fs_info->sectorsize); - lockend = round_up(start + len, root->fs_info->sectorsize); + lockstart = round_down(start, inode->root->fs_info->sectorsize); + lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); @@ -4034,7 +4032,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } else { /* We have a regular extent. 
*/ if (fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(root, ino, + ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen, roots, -- cgit From 61dbb952f0a5f587c983d88853212e969d2d4ede Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:02 +0100 Subject: btrfs: turn the backref sharedness check cache into a context object Right now we are using a struct btrfs_backref_shared_cache to pass state across multiple btrfs_is_data_extent_shared() calls. The structure's name closely follows its current purpose, which is to cache previous checks for the sharedness of metadata extents. However we will start using the structure for more things other than caching sharedness checks, so rename it to struct btrfs_backref_share_check_ctx. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 32 ++++++++++++++++---------------- fs/btrfs/backref.h | 8 ++++---- fs/btrfs/extent_io.c | 24 ++++++++++++------------ 3 files changed, 32 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index efdeb69e8ed6..693c99ad4afb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1542,13 +1542,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * fs_info->commit_root_sem semaphore, so no need to worry about the root's last * snapshot field changing while updating or checking the cache. */ -static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache, +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { struct btrfs_backref_shared_cache_entry *entry; - if (!cache->use_cache) + if (!ctx->use_path_cache) return false; if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) @@ -1562,7 +1562,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache */ ASSERT(level >= 0); - entry = &cache->entries[level]; + entry = &ctx->path_cache_entries[level]; /* Unused cache entry or being used for some other extent buffer. */ if (entry->bytenr != bytenr) @@ -1595,8 +1595,8 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache */ if (*is_shared) { for (int i = 0; i < level; i++) { - cache->entries[i].is_shared = true; - cache->entries[i].gen = entry->gen; + ctx->path_cache_entries[i].is_shared = true; + ctx->path_cache_entries[i].gen = entry->gen; } } @@ -1608,14 +1608,14 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache * fs_info->commit_root_sem semaphore, so no need to worry about the root's last * snapshot field changing while updating or checking the cache. 
*/ -static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, +static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { struct btrfs_backref_shared_cache_entry *entry; u64 gen; - if (!cache->use_cache) + if (!ctx->use_path_cache) return; if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) @@ -1634,7 +1634,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, else gen = btrfs_root_last_snapshot(&root->root_item); - entry = &cache->entries[level]; + entry = &ctx->path_cache_entries[level]; entry->bytenr = bytenr; entry->is_shared = is_shared; entry->gen = gen; @@ -1648,7 +1648,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, */ if (is_shared) { for (int i = 0; i < level; i++) { - entry = &cache->entries[i]; + entry = &ctx->path_cache_entries[i]; entry->is_shared = is_shared; entry->gen = gen; } @@ -1664,7 +1664,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, * not known. * @roots: List of roots this extent is shared among. * @tmp: Temporary list used for iteration. - * @cache: A backref lookup result cache. + * @ctx: A backref sharedness check context. * * btrfs_is_data_extent_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the @@ -1680,7 +1680,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, - struct btrfs_backref_shared_cache *cache) + struct btrfs_backref_share_check_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1715,7 +1715,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - cache->use_cache = true; + ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; @@ -1726,7 +1726,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* this is the only condition under which we return 1 */ ret = 1; if (level >= 0) - store_backref_shared_cache(cache, root, bytenr, + store_backref_shared_cache(ctx, root, bytenr, level, true); break; } @@ -1761,17 +1761,17 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, * (which implies multiple paths). */ if (level == -1 && tmp->nnodes > 1) - cache->use_cache = false; + ctx->use_path_cache = false; if (level >= 0) - store_backref_shared_cache(cache, root, bytenr, + store_backref_shared_cache(ctx, root, bytenr, level, false); node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; level++; - cached = lookup_backref_shared_cache(cache, root, bytenr, level, + cached = lookup_backref_shared_cache(ctx, root, bytenr, level, &is_shared); if (cached) { ret = (is_shared ? 1 : 0); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index c846fa2bdf93..e3a2b45a76e3 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -23,13 +23,13 @@ struct btrfs_backref_shared_cache_entry { bool is_shared; }; -struct btrfs_backref_shared_cache { +struct btrfs_backref_share_check_ctx { /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
*/ - struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL]; - bool use_cache; + struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL]; + bool use_path_cache; }; typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, @@ -80,7 +80,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, - struct btrfs_backref_shared_cache *cache); + struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 83be7e85ba91..365ad00a5942 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3705,7 +3705,7 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path static int fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, - struct btrfs_backref_shared_cache *backref_cache, + struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, struct ulist *roots, struct ulist *tmp_ulist, @@ -3755,7 +3755,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, disk_bytenr, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3805,7 +3805,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, disk_bytenr, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3904,7 +3904,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct extent_state *cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; - struct btrfs_backref_shared_cache *backref_cache; + struct btrfs_backref_share_check_ctx *backref_ctx; struct ulist *roots; struct ulist *tmp_ulist; u64 last_extent_end; @@ -3914,11 +3914,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; - backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL); + backref_ctx = kzalloc(sizeof(*backref_ctx), GFP_KERNEL); path = btrfs_alloc_path(); roots = ulist_alloc(GFP_KERNEL); tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!backref_cache || !path || !roots || !tmp_ulist) { + if (!backref_ctx || !path || !roots || !tmp_ulist) { ret = -ENOMEM; goto out; } @@ -3978,7 +3978,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 range_end = min(key.offset, lockend) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, 0, 0, 0, + backref_ctx, 0, 0, 0, roots, tmp_ulist, prev_extent_end, range_end); if (ret < 0) { @@ -4019,14 +4019,14 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, + backref_ctx, disk_bytenr, extent_offset, extent_gen, roots, tmp_ulist, key.offset, extent_end - 1); } else if (disk_bytenr == 0) { /* We have an explicit hole. 
*/ ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, 0, 0, 0, + backref_ctx, 0, 0, 0, roots, tmp_ulist, key.offset, extent_end - 1); } else { @@ -4037,7 +4037,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) goto out_unlock; else if (ret > 0) @@ -4086,7 +4086,7 @@ check_eof_delalloc: path = NULL; if (!stopped && prev_extent_end < lockend) { - ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache, + ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, roots, tmp_ulist, prev_extent_end, lockend - 1); if (ret < 0) @@ -4119,7 +4119,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); out: - kfree(backref_cache); + kfree(backref_ctx); btrfs_free_path(path); ulist_free(roots); ulist_free(tmp_ulist); -- cgit From 84a7949d409753c90dc3477b8cfc18e983b09078 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:03 +0100 Subject: btrfs: move ulists to data extent sharedness check context When calling btrfs_is_data_extent_shared() we pass two ulists that were allocated by the caller. This is because the single caller, fiemap, calls btrfs_is_data_extent_shared() multiple times and the ulists can be reused, instead of allocating new ones before each call and freeing them after each call. Now that we have a context structure/object that we pass to btrfs_is_data_extent_shared(), we can move those ulists to it, and hide their allocation and the context's allocation in a helper function, as well as the freeing of the ulists and the context object. This allows to reduce the number of parameters passed to btrfs_is_data_extent_shared(), the need to pass the ulists from extent_fiemap() to fiemap_process_hole() and having the caller deal with allocating and releasing the ulists. Also rename one of the ulists from 'tmp' / 'tmp_ulist' to 'refs', since that's a much better name as it reflects what the list is used for (and matching the argument name for find_parent_nodes()). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 43 ++++++++++++++++++++++++++++++++----------- fs/btrfs/backref.h | 7 ++++++- fs/btrfs/extent_io.c | 34 ++++++++++------------------------ 3 files changed, 48 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 693c99ad4afb..4caff3052da7 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1655,6 +1655,30 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx } } +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) +{ + struct btrfs_backref_share_check_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ulist_init(&ctx->refs); + ulist_init(&ctx->roots); + + return ctx; +} + +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx) +{ + if (!ctx) + return; + + ulist_release(&ctx->refs); + ulist_release(&ctx->roots); + kfree(ctx); +} + /* * Check if a data extent is shared or not. * @@ -1662,8 +1686,6 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * @bytenr: Logical bytenr of the extent we are checking. * @extent_gen: Generation of the extent (file extent item) or 0 if it is * not known. - * @roots: List of roots this extent is shared among. - * @tmp: Temporary list used for iteration. * @ctx: A backref sharedness check context. 
* * btrfs_is_data_extent_shared uses the backref walking code but will short @@ -1679,7 +1701,6 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx */ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, - struct ulist *roots, struct ulist *tmp, struct btrfs_backref_share_check_ctx *ctx) { struct btrfs_root *root = inode->root; @@ -1697,8 +1718,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, }; int level; - ulist_init(roots); - ulist_init(tmp); + ulist_init(&ctx->roots); + ulist_init(&ctx->refs); trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { @@ -1720,8 +1741,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool is_shared; bool cached; - ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, &shared, false); + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, + &ctx->roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; @@ -1760,13 +1781,13 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, * deal with), we can not use it if we have multiple leaves * (which implies multiple paths). */ - if (level == -1 && tmp->nnodes > 1) + if (level == -1 && ctx->refs.nnodes > 1) ctx->use_path_cache = false; if (level >= 0) store_backref_shared_cache(ctx, root, bytenr, level, false); - node = ulist_next(tmp, &uiter); + node = ulist_next(&ctx->refs, &uiter); if (!node) break; bytenr = node->val; @@ -1789,8 +1810,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, up_read(&fs_info->commit_root_sem); } out: - ulist_release(roots); - ulist_release(tmp); + ulist_release(&ctx->roots); + ulist_release(&ctx->refs); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e3a2b45a76e3..8da0ba6b94a4 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -24,6 +24,9 @@ struct btrfs_backref_shared_cache_entry { }; struct btrfs_backref_share_check_ctx { + /* Ulists used during backref walking. */ + struct ulist refs; + struct ulist roots; /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
@@ -35,6 +38,9 @@ struct btrfs_backref_share_check_ctx { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); + int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags); @@ -79,7 +85,6 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 *found_off); int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, - struct ulist *roots, struct ulist *tmp, struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 365ad00a5942..e25e54d2216f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3708,7 +3708,6 @@ static int fiemap_process_hole(struct btrfs_inode *inode, struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, - struct ulist *roots, struct ulist *tmp_ulist, u64 start, u64 end) { const u64 i_size = i_size_read(&inode->vfs_inode); @@ -3752,10 +3751,9 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, roots, - tmp_ulist, - backref_ctx); + disk_bytenr, + extent_gen, + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3803,8 +3801,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (!checked_extent_shared && fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, disk_bytenr, - extent_gen, roots, - tmp_ulist, + extent_gen, backref_ctx); if (ret < 0) return ret; @@ -3905,8 +3902,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; - struct ulist *roots; - struct ulist *tmp_ulist; u64 last_extent_end; u64 prev_extent_end; u64 lockstart; @@ -3914,11 +3909,9 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; - backref_ctx = kzalloc(sizeof(*backref_ctx), GFP_KERNEL); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); path = btrfs_alloc_path(); - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!backref_ctx || !path || !roots || !tmp_ulist) { + if (!backref_ctx || !path) { ret = -ENOMEM; goto out; } @@ -3979,7 +3972,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, - roots, tmp_ulist, prev_extent_end, range_end); if (ret < 0) { goto out_unlock; @@ -4021,13 +4013,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, disk_bytenr, extent_offset, - extent_gen, roots, tmp_ulist, - key.offset, extent_end - 1); + extent_gen, key.offset, + extent_end - 1); } else if (disk_bytenr == 0) { /* We have an explicit hole. */ ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, - roots, tmp_ulist, key.offset, extent_end - 1); } else { /* We have a regular extent. 
 */
@@ -4035,8 +4026,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 				ret = btrfs_is_data_extent_shared(inode,
 								  disk_bytenr,
 								  extent_gen,
-								  roots,
-								  tmp_ulist,
 								  backref_ctx);
 				if (ret < 0)
 					goto out_unlock;
 				else if (ret > 0)
@@ -4087,8 +4076,7 @@ check_eof_delalloc:
 
 	if (!stopped && prev_extent_end < lockend) {
 		ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx,
-					  0, 0, 0, roots, tmp_ulist,
-					  prev_extent_end, lockend - 1);
+					  0, 0, 0, prev_extent_end, lockend - 1);
 		if (ret < 0)
 			goto out_unlock;
 		prev_extent_end = lockend;
@@ -4119,10 +4107,8 @@ check_eof_delalloc:
 out_unlock:
 	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
 out:
-	kfree(backref_ctx);
+	btrfs_free_backref_share_ctx(backref_ctx);
 	btrfs_free_path(path);
-	ulist_free(roots);
-	ulist_free(tmp_ulist);
 	return ret;
 }
-- 
cgit

From b629685803bc0cefbbd29240ea28d57d8a17bcbc Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:04 +0100
Subject: btrfs: remove roots ulist when checking data extent sharedness

Currently btrfs_is_data_extent_shared() is passing a ulist for the roots
argument of find_parent_nodes(), however it does not use that ulist for
anything and for this context that list always ends up with at most one
element.

Since find_parent_nodes() is able to deal with a NULL ulist for its roots
argument, make btrfs_is_data_extent_shared() pass it NULL and avoid the
burden of allocating memory for the unused roots ulist, initializing it,
releasing it and allocating one struct ulist_node for it during the call
to find_parent_nodes().

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c | 6 +-----
 fs/btrfs/backref.h | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4caff3052da7..4d8573fa75b4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1664,7 +1664,6 @@ struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void)
 		return NULL;
 
 	ulist_init(&ctx->refs);
-	ulist_init(&ctx->roots);
 
 	return ctx;
 }
@@ -1675,7 +1674,6 @@ void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx)
 		return;
 
 	ulist_release(&ctx->refs);
-	ulist_release(&ctx->roots);
 	kfree(ctx);
 }
 
@@ -1718,7 +1716,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 	};
 	int level;
 
-	ulist_init(&ctx->roots);
 	ulist_init(&ctx->refs);
 
 	trans = btrfs_join_transaction_nostart(root);
@@ -1742,7 +1739,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 		bool cached;
 
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs,
-					&ctx->roots, NULL, &shared, false);
+					NULL, NULL, &shared, false);
 		if (ret == BACKREF_FOUND_SHARED) {
 			/* this is the only condition under which we return 1 */
 			ret = 1;
@@ -1810,7 +1807,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 		up_read(&fs_info->commit_root_sem);
 	}
 out:
-	ulist_release(&ctx->roots);
 	ulist_release(&ctx->refs);
 	return ret;
 }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 8da0ba6b94a4..5f468f0defda 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -26,7 +26,6 @@ struct btrfs_backref_shared_cache_entry {
 struct btrfs_backref_share_check_ctx {
 	/* Ulists used during backref walking. */
 	struct ulist refs;
-	struct ulist roots;
 	/*
 	 * A path from a root to a leaf that has a file extent item pointing to
 	 * a given data extent should never exceed the maximum b+tree height.
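As an illustration of the pattern the commit above relies on (an optional
output list that the callee simply skips when it is NULL), the following is
a minimal userspace sketch; the names are hypothetical and this is not
btrfs's ulist API:

    #include <stddef.h>

    /* Hypothetical fixed-capacity id list, standing in for struct ulist. */
    struct id_list {
        unsigned long long ids[8];
        int count;
    };

    /*
     * Walks a set of references; fills 'roots' only when the caller passed
     * a list. Callers that only care about the return value pass NULL, so
     * no list memory has to be allocated, initialized or released at all.
     */
    static int collect_root_ids(const unsigned long long *refs, int nr_refs,
                                struct id_list *roots)
    {
        for (int i = 0; i < nr_refs; i++) {
            if (roots && roots->count < 8)
                roots->ids[roots->count++] = refs[i];
        }
        return (nr_refs > 1) ? 1 : 0; /* e.g. 1 means "shared" */
    }

    int main(void)
    {
        const unsigned long long refs[] = { 5, 257 };

        /* Equivalent of passing NULL for the roots ulist: result only. */
        return collect_root_ids(refs, 2, NULL) == 1 ? 0 : 1;
    }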
-- cgit From 56f5c19920d09bbf91efcf80e6ba301923400f4c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:05 +0100 Subject: btrfs: remove useless logic when finding parent nodes At find_parent_nodes(), at its last step, when iterating over all direct references, we are checking if we have a share context and if we have a reference with a different root from the one in the share context. However that logic is pointless because of two reasons: 1) After the previous patch in the series (subject "btrfs: remove roots ulist when checking data extent sharedness"), the roots argument is always NULL when using a share check context (struct share_check), so this code is never triggered; 2) Even before that previous patch, we could not hit this code because if we had a reference with a root different from the one in our share context, then we would have exited earlier when doing either of the following: - Adding a second direct ref to the direct refs red black tree resulted in extent_is_shared() returning true when called from add_direct_ref() -> add_prelim_ref(), after processing delayed references or while processing references in the extent tree; - When adding a second reference to the indirect refs red black tree (same as above, extent_is_shared() returns true); - If we only have one indirect reference and no direct references, then when resolving it at resolve_indirect_refs() we immediately return that the target extent is shared, therefore never reaching that loop that iterates over all direct references at find_parent_nodes(); - If we have 1 indirect reference and 1 direct reference, then we also exit early because extent_is_shared() ends up returning true when called through add_prelim_ref() (by add_direct_ref() or add_indirect_ref()) or add_delayed_refs(). Same applies as when having a combination of direct, indirect and indirect with missing key references. This logic had been obsoleted since commit 3ec4d3238ab165 ("btrfs: allow backref search checks for shared extents"), which introduced the early exits in case an extent is shared. So just remove that logic, and assert at find_parent_nodes() that when we have a share context we don't have a roots ulist and that we haven't found the extent to be directly shared after processing delayed references and all references from the extent tree. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 4d8573fa75b4..22ab13821ada 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1219,6 +1219,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, .indirect_missing_keys = PREFTREE_INIT }; + /* Roots ulist is not needed when using a sharedness check context. */ + if (sc) + ASSERT(roots == NULL); + key.objectid = bytenr; key.offset = (u64)-1; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) @@ -1310,6 +1314,20 @@ again: } } + /* + * If we have a share context and we reached here, it means the extent + * is not directly shared (no multiple reference items for it), + * otherwise we would have exited earlier with a return value of + * BACKREF_FOUND_SHARED after processing delayed references or while + * processing inline or keyed references from the extent tree. 
+	 * The extent may however be indirectly shared through shared subtrees
+	 * as a result from creating snapshots, so we determine below what is
+	 * its parent node, in case we are dealing with a metadata extent, or
+	 * what's the leaf (or leaves), from a fs tree, that has a file extent
+	 * item pointing to it in case we are dealing with a data extent.
+	 */
+	ASSERT(extent_is_shared(sc) == 0);
+
 	btrfs_release_path(path);
 
 	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
@@ -1347,11 +1365,6 @@ again:
 		 * and would retain their original ref->count < 0.
 		 */
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
-			if (sc && ref->root_id != sc->root_objectid) {
-				ret = BACKREF_FOUND_SHARED;
-				goto out;
-			}
-
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
-- 
cgit

From 73e339e6ab74cc3edcd1f1ed3d9b822baf8534e1 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:06 +0100
Subject: btrfs: cache sharedness of the last few data extents during fiemap

During fiemap we process all the file extent items of an inode, by their
file offset order (left to right b+tree order), and then check if the data
extent they point at is shared or not. Until now we didn't cache those
results; we only did it for b+tree nodes/leaves, since for each unique
b+tree path we have access to hundreds of file extent items.

However, it is also common to repeat checking the sharedness of a
particular data extent in a very short time window, and the cases that
lead to that are the following:

1) COW writes.

   If we have a file extent item like this:

   [ bytenr X, offset = 0, num_bytes = 512K ]
   file offset 0                             512K

   Then a 4K write into file offset 64K happens, and we end up with the
   following file extent item layout:

   [ bytenr X, offset = 0, num_bytes = 64K ]
   file offset 0                            64K

   [ bytenr Y, offset = 0, num_bytes = 4K ]
   file offset 64K                          68K

   [ bytenr X, offset = 68K, num_bytes = 444K ]
   file offset 68K                             512K

   So during fiemap we will check for the sharedness of the data extent
   with bytenr X twice. Typically for COW writes and for at least
   moderately updated files, we end up with many file extent items that
   point to different sections of the same data extent.

2) Writing into a NOCOW file after a snapshot is taken.

   This happens if the target extent was created in a generation older
   than the generation where the last snapshot for the root (the tree the
   inode belongs to) was made.

   This leads to a scenario like the previous one.

3) Writing into sections of a preallocated extent.

   For example if a file has the following layout:

   [ bytenr X, offset = 0, num_bytes = 1M, type = prealloc ]
   0                                                        1M

   After doing a 4K write into file offset 0 and another 4K write into
   offset 512K, we get the following layout:

   [ bytenr X, offset = 0, num_bytes = 4K, type = regular ]
   0                                                       4K

   [ bytenr X, offset = 4K, num_bytes = 508K, type = prealloc ]
   4K                                                          512K

   [ bytenr X, offset = 512K, num_bytes = 4K, type = regular ]
   512K                                                       516K

   [ bytenr X, offset = 516K, num_bytes = 508K, type = prealloc ]
   516K                                                          1M

   So we end up with 4 consecutive file extent items pointing to the data
   extent at bytenr X.

4) Hole punching in the middle of an extent.
   For example if a file has the following file extent item:

   [ bytenr X, offset = 0, num_bytes = 8M ]
   0                                        8M

   And then a hole is punched for the file range [4M, 6M[, our file
   extent item is split into two:

   [ bytenr X, offset = 0, num_bytes = 4M ]
   0                                        4M

   [ 2M hole, implicit or explicit depending on NO_HOLES feature ]
   4M                                                             6M

   [ bytenr X, offset = 6M, num_bytes = 2M ]
   6M                                        8M

   Again, we end up with two file extent items pointing to the same data
   extent.

5) When reflinking (clone and deduplication) within the same file.

   This is probably the least common case of all.

In cases 1, 2, 3 and 4, when we have multiple file extent items that point
to the same data extent, their distance is usually short, typically
separated by a few slots in a b+tree leaf (or across sibling leaves). For
case 5, the distance can vary a lot, but it's also the least common case.

This change caches the result of the sharedness checks for data extents,
but only for the last 8 extents that we notice our inode refers to with
multiple file extent items. Whenever we want to check if a data extent is
shared, we look up the cache, which consists of doing a linear scan of an
8 element array, and if we find the data extent there, we return the
result and don't check the extent tree and delayed refs. The array/cache
is small so that doing the search has no noticeable negative impact on
performance in case we don't have file extent items within a distance of
8 slots that point to the same data extent.

Slots in the cache/array are overwritten in a simple round robin fashion,
as that approach fits very well. Using this simple approach with only the
last 8 data extents seen is effective as usually, when multiple file
extent items point to the same data extent, their distance is within 8
slots. It also uses very little memory, and the time to cache a result or
look up the cache is negligible.

The following test was run on a non-debug kernel (Debian's default kernel
config) to measure the impact in the case of COW writes (first example
given above), where we run fiemap after overwriting 33% of the blocks of
a file:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/sdi
  MNT=/mnt/sdi

  umount $DEV &> /dev/null
  mkfs.btrfs -f $DEV
  mount $DEV $MNT

  FILE_SIZE=$((1 * 1024 * 1024 * 1024))

  # Create the file full of 1M extents.
  xfs_io -f -s -c "pwrite -b 1M -S 0xab 0 $FILE_SIZE" $MNT/foobar

  block_count=$((FILE_SIZE / 4096))
  # Overwrite about 33% of the file blocks.
  overwrite_count=$((block_count / 3))

  echo -e "\nOverwriting $overwrite_count 4K blocks (out of $block_count)..."
  RANDOM=123
  for ((i = 1; i <= $overwrite_count; i++)); do
      off=$(((RANDOM % block_count) * 4096))
      xfs_io -c "pwrite -S 0xcd $off 4K" $MNT/foobar > /dev/null
      echo -ne "\r$i blocks overwritten..."
  done
  echo -e "\n"

  # Unmount and mount to clear all cached metadata.
  umount $MNT
  mount $DEV $MNT

  start=$(date +%s%N)
  filefrag $MNT/foobar
  end=$(date +%s%N)
  dur=$(( (end - start) / 1000000 ))
  echo "fiemap took $dur milliseconds"

  umount $MNT

Result before applying this patch:

  fiemap took 128 milliseconds

Result after applying this patch:

  fiemap took 92 milliseconds (-28.1%)

The test is somewhat limited in the sense the gains may be higher in
practice, because in the test the filesystem is small, so we have small
fs and extent trees, plus there's no concurrent access to the trees as
well, therefore no lock contention there.
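As an illustration, the following is a minimal, self-contained userspace
sketch of this kind of fixed-size, round-robin lookup cache. The names are
hypothetical (the kernel's actual version, prev_extents_cache in the
backref share check context, appears in the diff below) and, like the
kernel's, it assumes bytenr 0 never identifies a real data extent, so
zero-initialized slots cannot produce false hits:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define PREV_EXTENTS_SIZE 8

    struct sharedness_cache {
        struct {
            uint64_t bytenr;   /* 0 means "slot unused" */
            bool is_shared;
        } slots[PREV_EXTENTS_SIZE];
        int next_slot;         /* round-robin write position */
    };

    /* Linear scan; with only 8 slots this is far cheaper than a tree walk. */
    static bool cache_lookup(const struct sharedness_cache *c, uint64_t bytenr,
                             bool *is_shared)
    {
        for (int i = 0; i < PREV_EXTENTS_SIZE; i++) {
            if (c->slots[i].bytenr == bytenr) {
                *is_shared = c->slots[i].is_shared;
                return true;
            }
        }
        return false;
    }

    /* Overwrite the oldest entry in simple round-robin fashion. */
    static void cache_store(struct sharedness_cache *c, uint64_t bytenr,
                            bool is_shared)
    {
        c->slots[c->next_slot].bytenr = bytenr;
        c->slots[c->next_slot].is_shared = is_shared;
        c->next_slot = (c->next_slot + 1) % PREV_EXTENTS_SIZE;
    }

    int main(void)
    {
        struct sharedness_cache c;
        bool shared;

        memset(&c, 0, sizeof(c));
        cache_store(&c, 4096, true);
        return (cache_lookup(&c, 4096, &shared) && shared) ? 0 : 1;
    }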
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/backref.h | 27 +++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22ab13821ada..64bea9b30f6b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -137,7 +137,25 @@ struct preftrees { struct share_check { u64 root_objectid; u64 inum; + u64 data_bytenr; + /* + * Counts number of inodes that refer to an extent (different inodes in + * the same root or different roots) that we could find. The sharedness + * check typically stops once this counter gets greater than 1, so it + * may not reflect the total number of inodes. + */ int share_count; + /* + * The number of times we found our inode refers to the data extent we + * are determining the sharedness. In other words, how many file extent + * items we could find for our inode that point to our target data + * extent. The value we get here after finishing the extent sharedness + * check may be smaller than reality, but if it ends up being greater + * than 1, then we know for sure the inode has multiple file extent + * items that point to our inode, and we can safely assume it's useful + * to cache the sharedness check result. + */ + int self_ref_count; bool have_delayed_delete_refs; }; @@ -207,7 +225,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount) + int newcount, struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; @@ -216,6 +234,11 @@ static void update_share_count(struct share_check *sc, int oldcount, sc->share_count--; else if (oldcount < 1 && newcount > 0) sc->share_count++; + + if (newref->root_id == sc->root_objectid && + newref->wanted_disk_byte == sc->data_bytenr && + newref->key_for_search.objectid == sc->inum) + sc->self_ref_count += newref->count; } /* @@ -266,14 +289,14 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, * BTRFS_[ADD|DROP]_DELAYED_REF actions. */ update_share_count(sc, ref->count, - ref->count + newref->count); + ref->count + newref->count, newref); ref->count += newref->count; free_pref(newref); return; } } - update_share_count(sc, 0, newref->count); + update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); @@ -1724,11 +1747,18 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, struct share_check shared = { .root_objectid = root->root_key.objectid, .inum = btrfs_ino(inode), + .data_bytenr = bytenr, .share_count = 0, + .self_ref_count = 0, .have_delayed_delete_refs = false, }; int level; + for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { + if (ctx->prev_extents_cache[i].bytenr == bytenr) + return ctx->prev_extents_cache[i].is_shared; + } + ulist_init(&ctx->refs); trans = btrfs_join_transaction_nostart(root); @@ -1813,6 +1843,20 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, cond_resched(); } + /* + * Cache the sharedness result for the data extent if we know our inode + * has more than 1 file extent item that refers to the data extent. 
+	 */
+	if (ret >= 0 && shared.self_ref_count > 1) {
+		int slot = ctx->prev_extents_cache_slot;
+
+		ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr;
+		ctx->prev_extents_cache[slot].is_shared = (ret == 1);
+
+		slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE;
+		ctx->prev_extents_cache_slot = slot;
+	}
+
 	if (trans) {
 		btrfs_put_tree_mod_seq(fs_info, &elem);
 		btrfs_end_transaction(trans);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 5f468f0defda..fda78db50be6 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,6 +23,8 @@ struct btrfs_backref_shared_cache_entry {
 	bool is_shared;
 };
 
+#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8
+
 struct btrfs_backref_share_check_ctx {
 	/* Ulists used during backref walking. */
 	struct ulist refs;
@@ -32,6 +34,31 @@ struct btrfs_backref_share_check_ctx {
 	 */
 	struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL];
 	bool use_path_cache;
+	/*
+	 * Cache the sharedness result for the last few extents we have found,
+	 * but only for extents for which we have multiple file extent items
+	 * that point to them.
+	 * It's very common to have several file extent items that point to the
+	 * same extent (bytenr) but with different offsets and lengths. This
+	 * typically happens for COW writes, partial writes into prealloc
+	 * extents, NOCOW writes after snapshoting a root, hole punching or
+	 * reflinking within the same file (less common perhaps).
+	 * So keep a small cache with the lookup results for the extent pointed
+	 * by the last few file extent items. This cache is checked, with a
+	 * linear scan, whenever btrfs_is_data_extent_shared() is called, so
+	 * it must be small so that it does not negatively affect performance in
+	 * case we don't have multiple file extent items that point to the same
+	 * data extent.
+	 */
+	struct {
+		u64 bytenr;
+		bool is_shared;
+	} prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE];
+	/*
+	 * The slot in the prev_extents_cache array that will be used for
+	 * storing the sharedness result of a new data extent.
+	 */
+	int prev_extents_cache_slot;
 };
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
-- 
cgit

From 583f4ac56254c844cbbc9f23744e1f2efbf42fc7 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:07 +0100
Subject: btrfs: move up backref sharedness cache store and lookup functions

Move the static functions that lookup and store the sharedness check of an
extent buffer to a location above find_parent_nodes(), because in the next
patch the lookup function will be used by find_parent_nodes(). The store
function is also moved just because it's the counterpart to the lookup
function and it's best to have their definitions close together.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c | 236 ++++++++++++++++++++++++++---------------------------
 1 file changed, 118 insertions(+), 118 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 64bea9b30f6b..1b1575b3a7b0 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1198,6 +1198,124 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
 	return ret;
 }
 
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */ +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + + if (!ctx->use_path_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &ctx->path_cache_entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. + */ + if (!entry->is_shared && + entry->gen != btrfs_root_last_snapshot(&root->root_item)) + return false; + + /* + * If we cached a true result and the last generation used for dropping + * a root changed, we can not trust the result, because the dropped root + * could be a snapshot sharing this extent buffer. + */ + if (entry->is_shared && + entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + return false; + + *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + ctx->path_cache_entries[i].is_shared = true; + ctx->path_cache_entries[i].gen = entry->gen; + } + } + + return true; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + + if (!ctx->use_path_cache) + return; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + if (is_shared) + gen = btrfs_get_last_root_drop_gen(root->fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + + entry = &ctx->path_cache_entries[level]; + entry->bytenr = bytenr; + entry->is_shared = is_shared; + entry->gen = gen; + + /* + * If we found an extent buffer is shared, set the cache result for all + * extent buffers below it to true. As nodes in the path are COWed, + * their sharedness is moved to their children, and if a leaf is COWed, + * then the sharedness of a data extent becomes direct, the refcount of + * data extent is increased in the extent item at the extent tree. 
+ */ + if (is_shared) { + for (int i = 0; i < level; i++) { + entry = &ctx->path_cache_entries[i]; + entry->is_shared = is_shared; + entry->gen = gen; + } + } +} + /* * this adds all existing backrefs (inline backrefs, backrefs and delayed * refs) for the given bytenr to the refs list, merges duplicates and resolves @@ -1573,124 +1691,6 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } -/* - * The caller has joined a transaction or is holding a read lock on the - * fs_info->commit_root_sem semaphore, so no need to worry about the root's last - * snapshot field changing while updating or checking the cache. - */ -static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, - struct btrfs_root *root, - u64 bytenr, int level, bool *is_shared) -{ - struct btrfs_backref_shared_cache_entry *entry; - - if (!ctx->use_path_cache) - return false; - - if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) - return false; - - /* - * Level -1 is used for the data extent, which is not reliable to cache - * because its reference count can increase or decrease without us - * realizing. We cache results only for extent buffers that lead from - * the root node down to the leaf with the file extent item. - */ - ASSERT(level >= 0); - - entry = &ctx->path_cache_entries[level]; - - /* Unused cache entry or being used for some other extent buffer. */ - if (entry->bytenr != bytenr) - return false; - - /* - * We cached a false result, but the last snapshot generation of the - * root changed, so we now have a snapshot. Don't trust the result. - */ - if (!entry->is_shared && - entry->gen != btrfs_root_last_snapshot(&root->root_item)) - return false; - - /* - * If we cached a true result and the last generation used for dropping - * a root changed, we can not trust the result, because the dropped root - * could be a snapshot sharing this extent buffer. - */ - if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) - return false; - - *is_shared = entry->is_shared; - /* - * If the node at this level is shared, than all nodes below are also - * shared. Currently some of the nodes below may be marked as not shared - * because we have just switched from one leaf to another, and switched - * also other nodes above the leaf and below the current level, so mark - * them as shared. - */ - if (*is_shared) { - for (int i = 0; i < level; i++) { - ctx->path_cache_entries[i].is_shared = true; - ctx->path_cache_entries[i].gen = entry->gen; - } - } - - return true; -} - -/* - * The caller has joined a transaction or is holding a read lock on the - * fs_info->commit_root_sem semaphore, so no need to worry about the root's last - * snapshot field changing while updating or checking the cache. - */ -static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, - struct btrfs_root *root, - u64 bytenr, int level, bool is_shared) -{ - struct btrfs_backref_shared_cache_entry *entry; - u64 gen; - - if (!ctx->use_path_cache) - return; - - if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) - return; - - /* - * Level -1 is used for the data extent, which is not reliable to cache - * because its reference count can increase or decrease without us - * realizing. We cache results only for extent buffers that lead from - * the root node down to the leaf with the file extent item. 
- */ - ASSERT(level >= 0); - - if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); - else - gen = btrfs_root_last_snapshot(&root->root_item); - - entry = &ctx->path_cache_entries[level]; - entry->bytenr = bytenr; - entry->is_shared = is_shared; - entry->gen = gen; - - /* - * If we found an extent buffer is shared, set the cache result for all - * extent buffers below it to true. As nodes in the path are COWed, - * their sharedness is moved to their children, and if a leaf is COWed, - * then the sharedness of a data extent becomes direct, the refcount of - * data extent is increased in the extent item at the extent tree. - */ - if (is_shared) { - for (int i = 0; i < level; i++) { - entry = &ctx->path_cache_entries[i]; - entry->is_shared = is_shared; - entry->gen = gen; - } - } -} - struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) { struct btrfs_backref_share_check_ctx *ctx; -- cgit From 877c14767f106a5c8af271ebf1294c514eb76d0c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:08 +0100 Subject: btrfs: avoid duplicated resolution of indirect backrefs during fiemap During fiemap, when determining if a data extent is shared or not, if we don't find the extent is directly shared, then we need to determine if it's shared through subtrees. For that we need to resolve the indirect reference we found in order to figure out the path in the inode's fs tree, which is a path starting at the fs tree's root node and going down to the leaf that contains the file extent item that points to the data extent. We then proceed to determine if any extent buffer in that path is shared with other trees or not. Currently whenever we find the data extent that a file extent item points to is not directly shared, we always resolve the path in the fs tree, and then check if any extent buffer in the path is shared. This is a lot of work and when we have file extent items that belong to the same leaf, we have the same path, so we only need to calculate it once. This change does that, it keeps track of the current and previous leaf, and when we find that a data extent is not directly shared, we try to compute the fs tree path only once and then use it for every other file extent item in the same leaf, using the existing cached path result for the leaf as long as the cache results are valid. This saves us from doing expensive b+tree searches in the fs tree of our target inode, as well as other minor work. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test-with-snapshots.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV # Use compression to quickly create files with a lot of extents # (each with a size of 128K). mount -o compress=lzo $DEV $MNT # 40G gives 327680 extents, each with a size of 128K. xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar # Add some more files to increase the size of the fs and extent # trees (in the real world there's a lot of files and extents # from other files). xfs_io -f -c "pwrite -S 0xcd -b 1M 0 20G" $MNT/file1 xfs_io -f -c "pwrite -S 0xef -b 1M 0 20G" $MNT/file2 xfs_io -f -c "pwrite -S 0x73 -b 1M 0 20G" $MNT/file3 # Create a snapshot so all the extents become indirectly shared # through subtrees, with a generation less than or equals to the # generation used to create the snapshot. 
btrfs subvolume snapshot -r $MNT $MNT/snap1 umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" echo start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Result before applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 1204 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 729 milliseconds (metadata cached) Result after applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 732 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 421 milliseconds (metadata cached) That's a -46.1% total reduction for the metadata not cached case, and a -42.2% reduction for the cached metadata case. The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 64 ++++++++++++++++++++++++++++++++++++++++++++-------- fs/btrfs/backref.h | 13 +++++++++++ fs/btrfs/extent_io.c | 2 ++ 3 files changed, 69 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 1b1575b3a7b0..94a3c6deafbb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,8 +16,9 @@ #include "misc.h" #include "tree-mod-log.h" -/* Just an arbitrary number so we can be sure this happened */ -#define BACKREF_FOUND_SHARED 6 +/* Just arbitrary numbers so we can be sure one of these happened. */ +#define BACKREF_FOUND_SHARED 6 +#define BACKREF_FOUND_NOT_SHARED 7 struct extent_inode_elem { u64 inum; @@ -135,7 +136,8 @@ struct preftrees { * - decremented when a ref->count transitions to <1 */ struct share_check { - u64 root_objectid; + struct btrfs_backref_share_check_ctx *ctx; + struct btrfs_root *root; u64 inum; u64 data_bytenr; /* @@ -235,7 +237,7 @@ static void update_share_count(struct share_check *sc, int oldcount, else if (oldcount < 1 && newcount > 0) sc->share_count++; - if (newref->root_id == sc->root_objectid && + if (newref->root_id == sc->root->root_key.objectid && newref->wanted_disk_byte == sc->data_bytenr && newref->key_for_search.objectid == sc->inum) sc->self_ref_count += newref->count; @@ -742,7 +744,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; } - if (sc && ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root->root_key.objectid) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; @@ -1469,6 +1471,44 @@ again: */ ASSERT(extent_is_shared(sc) == 0); + /* + * If we are here for a data extent and we have a share_check structure + * it means the data extent is not directly shared (does not have + * multiple reference items), so we have to check if a path in the fs + * tree (going from the root node down to the leaf that has the file + * extent item pointing to the data extent) is shared, that is, if any + * of the extent buffers in the path is referenced by other trees. 
+ */ + if (sc && bytenr == sc->data_bytenr) { + /* + * If we are only determining if a data extent is shared or not + * and the corresponding file extent item is located in the same + * leaf as the previous file extent item, we can skip resolving + * indirect references for a data extent, since the fs tree path + * is the same (same leaf, so same path). We skip as long as the + * cached result for the leaf is valid and only if there's only + * one file extent item pointing to the data extent, because in + * the case of multiple file extent items, they may be located + * in different leaves and therefore we have multiple paths. + */ + if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr && + sc->self_ref_count == 1) { + bool cached; + bool is_shared; + + cached = lookup_backref_shared_cache(sc->ctx, sc->root, + sc->ctx->curr_leaf_bytenr, + 0, &is_shared); + if (cached) { + if (is_shared) + ret = BACKREF_FOUND_SHARED; + else + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + } + } + btrfs_release_path(path); ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); @@ -1745,7 +1785,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { - .root_objectid = root->root_key.objectid, + .ctx = ctx, + .root = root, .inum = btrfs_ino(inode), .data_bytenr = bytenr, .share_count = 0, @@ -1783,12 +1824,13 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, NULL, NULL, &shared, false); - if (ret == BACKREF_FOUND_SHARED) { - /* this is the only condition under which we return 1 */ - ret = 1; + if (ret == BACKREF_FOUND_SHARED || + ret == BACKREF_FOUND_NOT_SHARED) { + /* If shared must return 1, otherwise return 0. */ + ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0; if (level >= 0) store_backref_shared_cache(ctx, root, bytenr, - level, true); + level, ret == 1); break; } if (ret < 0 && ret != -ENOENT) @@ -1865,6 +1907,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, } out: ulist_release(&ctx->refs); + ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr; + return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index fda78db50be6..6dac462430b0 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -28,6 +28,19 @@ struct btrfs_backref_shared_cache_entry { struct btrfs_backref_share_check_ctx { /* Ulists used during backref walking. */ struct ulist refs; + /* + * The current leaf the caller of btrfs_is_data_extent_shared() is at. + * Typically the caller (at the moment only fiemap) tries to determine + * the sharedness of data extents point by file extent items from entire + * leaves. + */ + u64 curr_leaf_bytenr; + /* + * The previous leaf the caller was at in the previous call to + * btrfs_is_data_extent_shared(). This may be the same as the current + * leaf. On the first call it must be 0. + */ + u64 prev_leaf_bytenr; /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e25e54d2216f..4e4f28387ace 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3966,6 +3966,8 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, if (extent_end <= lockstart) goto next_item; + backref_ctx->curr_leaf_bytenr = leaf->start; + /* We have an implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { const u64 range_end = min(key.offset, lockend) - 1; -- cgit From 6976201f188f0a899fe5f0dfde32b9855d1d22cc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:09 +0100 Subject: btrfs: avoid unnecessary resolution of indirect backrefs during fiemap During fiemap, when determining if a data extent is shared or not, if we don't find the extent is directly shared, then we need to determine if it's shared through subtrees. For that we need to resolve the indirect reference we found in order to figure out the path in the inode's fs tree, which is a path starting at the fs tree's root node and going down to the leaf that contains the file extent item that points to the data extent. We then proceed to determine if any extent buffer in that path is shared with other trees or not. However, when the generation of the data extent is more recent than the last generation used to snapshot the root, we don't need to determine the path, since the data extent can not be shared through snapshots. For this case we currently still determine the leaf of that path (at find_parent_nodes()), but then stop determining the other nodes in the path (at btrfs_is_data_extent_shared()) as it's pointless. So do the check of the data extent's generation earlier, at find_parent_nodes(), before trying to resolve the indirect reference to determine the leaf in the path. This saves us from doing one expensive b+tree search in the fs tree of our target inode, as well as other minor work. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test-fiemap.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV # Use compression to quickly create files with a lot of extents # (each with a size of 128K). mount -o compress=lzo $DEV $MNT # 40G gives 327680 extents, each with a size of 128K. xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar # Add some more files to increase the size of the fs and extent # trees (in the real world there's a lot of files and extents # from other files). xfs_io -f -c "pwrite -S 0xcd -b 1M 0 20G" $MNT/file1 xfs_io -f -c "pwrite -S 0xef -b 1M 0 20G" $MNT/file2 xfs_io -f -c "pwrite -S 0x73 -b 1M 0 20G" $MNT/file3 umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" echo start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Before applying this patch: (...)
/mnt/sdi/foobar: 327680 extents found fiemap took 689 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 393 milliseconds (metadata cached) That's a -46.4% total reduction for the metadata not cached case, and a -47.0% reduction for the cached metadata case. The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 94a3c6deafbb..232f49415ab9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -140,6 +140,7 @@ struct share_check { struct btrfs_root *root; u64 inum; u64 data_bytenr; + u64 data_extent_gen; /* * Counts number of inodes that refer to an extent (different inodes in * the same root or different roots) that we could find. The sharedness @@ -1480,6 +1481,21 @@ again: * of the extent buffers in the path is referenced by other trees. */ if (sc && bytenr == sc->data_bytenr) { + /* + * If our data extent is from a generation more recent than the + * last generation used to snapshot the root, then we know that + * it can not be shared through subtrees, so we can skip + * resolving indirect references, there's no point in + * determining the extent buffers for the path from the fs tree + * root node down to the leaf that has the file extent item that + * points to the data extent. + */ + if (sc->data_extent_gen > + btrfs_root_last_snapshot(&sc->root->root_item)) { + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + /* * If we are only determining if a data extent is shared or not * and the corresponding file extent item is located in the same @@ -1789,6 +1805,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .root = root, .inum = btrfs_ino(inode), .data_bytenr = bytenr, + .data_extent_gen = extent_gen, .share_count = 0, .self_ref_count = 0, .have_delayed_delete_refs = false, @@ -1836,17 +1853,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, if (ret < 0 && ret != -ENOENT) break; ret = 0; - /* - * If our data extent is not shared through reflinks and it was - * created in a generation after the last one used to create a - * snapshot of the inode's root, then it can not be shared - * indirectly through subtrees, as that can only happen with - * snapshots. In this case bail out, no need to check for the - * sharedness of extent buffers. - */ - if (level == -1 && - extent_gen > btrfs_root_last_snapshot(&root->root_item)) - break; /* * If our data extent was not directly shared (without multiple -- cgit From cc4804bfd6392bcb626a687689b36fd9cda87c11 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 13 Oct 2022 15:52:09 -0700 Subject: btrfs: skip reclaim if block_group is empty As we delete extents from a block group, at some deletion we cross below the reclaim threshold. It is possible we are still in the middle of deleting more extents and might soon hit 0. If the block group is empty by the time the reclaim worker runs, we will still relocate it. This works just fine, as relocating an empty block group ultimately results in properly deleting it. However, we have more direct ways of removing empty block groups in the cleaner thread. 
Those are either async discard or the unused_bgs list. In fact, when we decide whether to relocate a block group during extent deletion, we do check for emptiness and prefer the discard/unused_bgs mechanisms when possible. Not using relocation for this case reduces some modest overhead from empty bg relocation: - extra transactions - extra metadata use/churn for creating relocation metadata - trying to read the extent tree to look for extents (and in this case finding none) Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3f8b1cbbbc43..684401aa014a 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1606,6 +1606,24 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. + * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. + */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } spin_unlock(&bg->lock); /* Get out fast, in case we're unmounting the filesystem */ -- cgit From 81531225e5bd50ce628816cc1445c8b52aa99db2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 13 Oct 2022 15:52:10 -0700 Subject: btrfs: re-check reclaim condition in reclaim worker I have observed the following case play out and lead to unnecessary relocations: 1. write a file across multiple block groups 2. delete the file 3. several block groups fall below the reclaim threshold 4. reclaim the first, moving extents into the others 5. reclaim the others which are now actually very full, leading to poor reclaim behavior with lots of writing, allocating new block groups, etc. I believe the risk of missing some reasonable reclaims is worth it when traded off against the savings of avoiding overfull reclaims. Going forward, it could be interesting to make the check more advanced (zoned aware, fragmentation aware, etc...) so that it can be a really strong signal both at extent delete and reclaim time. 
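The decision logic boils down to simple threshold math. A minimal standalone C sketch of it follows (the function name and the 1GiB/75% values are illustrative assumptions, not the kernel's; the real check is should_reclaim_block_group() in the diff below):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Model of the reclaim condition: reclaim only when a deletion
   * crossed the threshold from above. */
  static bool should_reclaim(uint64_t length, uint64_t used,
                             uint64_t bytes_freed, unsigned int thresh_pct)
  {
          uint64_t thresh = length * thresh_pct / 100;
          uint64_t old_used = used + bytes_freed;

          if (thresh_pct == 0)
                  return false;
          /* Was already below the threshold: likely a new block group. */
          if (old_used < thresh)
                  return false;
          /* Still at or above the threshold: too full to bother. */
          if (used >= thresh)
                  return false;
          return true;
  }

  int main(void)
  {
          uint64_t len = 1ULL << 30;      /* assume a 1GiB block group */

          /* A deletion crosses 75% from above: queue for reclaim. */
          printf("%d\n", should_reclaim(len, len / 2, len / 2, 75));
          /* Worker re-check with a fake giant bytes_freed: a group that
           * was refilled above the threshold is skipped. */
          printf("%d\n", should_reclaim(len, len * 9 / 10, len, 75));
          return 0;
  }

This prints 1 then 0: the same block group that qualified when its extents were deleted no longer qualifies once it is full again, which is exactly the re-check the worker now performs.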
Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 65 +++++++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 684401aa014a..b1f4acdeafb9 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1539,6 +1539,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) return true; } +static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = div_factor_fine(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. + */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1623,6 +1647,22 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) spin_unlock(&bg->lock); up_write(&space_info->groups_sem); goto next; + + } + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; } spin_unlock(&bg->lock); @@ -3241,31 +3281,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } -static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, - u64 bytes_freed) -{ - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); - const u64 new_val = bg->used; - const u64 old_val = new_val + bytes_freed; - u64 thresh; - - if (reclaim_thresh == 0) - return false; - - thresh = div_factor_fine(bg->length, reclaim_thresh); - - /* - * If we were below the threshold before don't reclaim, we are likely a - * brand new block group and we don't want to relocate new block groups. - */ - if (old_val < thresh) - return false; - if (new_val >= thresh) - return false; - return true; -} - int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { -- cgit From 879b2221983184cd61fe852f90f3c0d6dd647f67 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Oct 2022 11:36:25 +0100 Subject: btrfs: switch GFP_ATOMIC to GFP_NOFS when fixing up low keys When fixing up the first key of each node above the current level, at fixup_low_keys(), we are doing a GFP_ATOMIC allocation for inserting an operation record for the tree mod log. However we can do just fine with GFP_NOFS nowadays. The need for GFP_ATOMIC was for the old days when we had custom locks with spinning behaviour for extent buffers and we were in spinning mode while at fixup_low_keys(). 
Now we use rw semaphores for extent buffer locks, so we can safely use GFP_NOFS. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5f461e05cc9c..e6182d530377 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2418,7 +2418,7 @@ static void fixup_low_keys(struct btrfs_path *path, break; t = path->nodes[i]; ret = btrfs_tree_mod_log_insert_key(t, tslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); -- cgit From 33cff222faffefc57f57326547e68133a7816e29 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 14 Oct 2022 14:44:33 +0100 Subject: btrfs: remove gfp_t flag from btrfs_tree_mod_log_insert_key() All callers of btrfs_tree_mod_log_insert_key() are now passing a GFP_NOFS flag to it, so remove the flag from it and from alloc_tree_mod_elem() and use it directly within alloc_tree_mod_elem(). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/tree-mod-log.c | 19 +++++++++---------- fs/btrfs/tree-mod-log.h | 2 +- 3 files changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e6182d530377..d645933ef135 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -489,7 +489,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -1018,7 +1018,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1064,7 +1064,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1166,7 +1166,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1220,7 +1220,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2418,7 +2418,7 @@ static void fixup_low_keys(struct btrfs_path *path, break; t = path->nodes[i]; ret = btrfs_tree_mod_log_insert_key(t, tslot, - 
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); @@ -2779,7 +2779,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, } if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -4219,7 +4219,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot - 1)); } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); } diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 8a3a14686d3e..18b4699bcb11 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -197,12 +197,11 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, - gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) return NULL; @@ -220,7 +219,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, } int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; int ret; @@ -228,7 +227,7 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, if (!tree_mod_need_log(eb->fs_info, eb)) return 0; - tm = alloc_tree_mod_elem(eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op); if (!tm) return -ENOMEM; @@ -276,7 +275,7 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -364,7 +363,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -502,14 +501,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); if (!tm_list_rem[i]) { ret = -ENOMEM; goto free_tms; } tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); if (!tm_list_add[i]) { ret = -ENOMEM; goto free_tms; @@ -564,7 +563,7 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 12605d19621b..8cffe0bc2a39 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -32,7 +32,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); int btrfs_tree_mod_log_insert_key(struct 
extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags); + enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, -- cgit From 5565b8e0adcdd8ee32d02d82d98cdf6d966793c7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 12 Oct 2022 17:22:35 +0800 Subject: btrfs: make module init/exit match their sequence [BACKGROUND] In theory init_btrfs_fs() and exit_btrfs_fs() should match their sequence, thus normally they should look like this:

  init_btrfs_fs()       | exit_btrfs_fs()
  ----------------------+------------------------
  init_A();             |
  init_B();             |
  init_C();             |
                        | exit_C();
                        | exit_B();
                        | exit_A();

The same should hold for the error path of init_btrfs_fs(). But that's not the case: some exit functions don't match the sequence of their init functions in init_btrfs_fs(). Furthermore in init_btrfs_fs(), we need to have a new error label for each new init function we add. This does not scale, especially as we may soon add several new functions to init_btrfs_fs(). [ENHANCEMENT] The patch introduces the following to improve the situation: - struct init_sequence Just a wrapper around a pair of init and exit function pointers. The init function must return int, thus some init functions need to be updated to return 0. The exit function can be NULL, as some init steps only output a message. - struct mod_init_seq[] array This is a const array, recording all the initialization we need to do in init_btrfs_fs(), in the same order as the old init_btrfs_fs(). - bool mod_init_result[] array This is a bool array, recording whether the corresponding entry in mod_init_seq[] has been initialized. The reason to split mod_init_seq[] and mod_init_result[] is to avoid section mismatch warnings. All init functions are in .init.text, but if mod_init_seq[] recorded an @initialized member it could no longer be const, would be placed in the .data section, and would trigger a modpost warning. For init_btrfs_fs() we just call all init functions in their order in the mod_init_seq[] array and, after each call, set the corresponding mod_init_result[] entry to true. For exit_btrfs_fs() and the error handling path of init_btrfs_fs(), we just iterate mod_init_seq[] in reverse order and skip all uninitialized entries. With this patch, init_btrfs_fs()/exit_btrfs_fs() will be much easier to expand and will always follow the strict order.
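The mechanism is easy to demonstrate outside the kernel. A minimal standalone sketch (hypothetical subsystems a and b, with b failing on purpose; only the table-driven pattern matches the patch below) looks like this:

  #include <stdbool.h>
  #include <stdio.h>

  struct init_sequence {
          int (*init_func)(void);
          void (*exit_func)(void);        /* may be NULL */
  };

  static int init_a(void) { puts("init a"); return 0; }
  static void exit_a(void) { puts("exit a"); }
  static int init_b(void) { puts("init b"); return -1; /* simulated failure */ }
  static void exit_b(void) { puts("exit b"); }

  static const struct init_sequence seq[] = {
          { .init_func = init_a, .exit_func = exit_a },
          { .init_func = init_b, .exit_func = exit_b },
  };
  static bool done[sizeof(seq) / sizeof(seq[0])];

  /* Tear down in reverse order, touching only what succeeded. */
  static void teardown(void)
  {
          for (int i = sizeof(seq) / sizeof(seq[0]) - 1; i >= 0; i--) {
                  if (!done[i])
                          continue;
                  if (seq[i].exit_func)
                          seq[i].exit_func();
                  done[i] = false;
          }
  }

  int main(void)
  {
          for (size_t i = 0; i < sizeof(seq) / sizeof(seq[0]); i++) {
                  if (seq[i].init_func() < 0) {
                          teardown();
                          return 1;
                  }
                  done[i] = true;
          }
          teardown();
          return 0;
  }

Running it prints "init a", "init b", "exit a": the failed step is never torn down, the unwind happens in the reverse of the init order, and there is no error-label chain to maintain.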
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 +- fs/btrfs/compression.h | 2 +- fs/btrfs/props.c | 3 +- fs/btrfs/props.h | 2 +- fs/btrfs/super.c | 242 ++++++++++++++++++++++++------------------------- 5 files changed, 122 insertions(+), 130 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e6635fe70067..65a4e5087215 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1243,12 +1243,13 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void __init btrfs_init_compress(void) +int __init btrfs_init_compress(void) { btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); zstd_init_workspace_manager(); + return 0; } void __cold btrfs_exit_compress(void) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1aa02903de69..9da2343eeff5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -77,7 +77,7 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } -void __init btrfs_init_compress(void); +int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 07f62e3ba6a5..e04289347775 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -454,7 +454,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return 0; } -void __init btrfs_props_init(void) +int __init btrfs_props_init(void) { int i; @@ -464,5 +464,6 @@ void __init btrfs_props_init(void) hash_add(prop_handlers_ht, &p->node, h); } + return 0; } diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index ca9dd3df129b..6e283196e38a 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -8,7 +8,7 @@ #include "ctree.h" -void __init btrfs_props_init(void); +int __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a93fbd29494..3a33dd9847d9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2694,7 +2694,7 @@ static __cold void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void __init btrfs_print_mod_info(void) +static int __init btrfs_print_mod_info(void) { static const char options[] = "" #ifdef CONFIG_BTRFS_DEBUG @@ -2721,143 +2721,133 @@ static void __init btrfs_print_mod_info(void) #endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); + return 0; } -static int __init init_btrfs_fs(void) +static int register_btrfs(void) { - int err; - - btrfs_props_init(); - - err = btrfs_init_sysfs(); - if (err) - return err; - - btrfs_init_compress(); - - err = btrfs_init_cachep(); - if (err) - goto free_compress; - - err = btrfs_transaction_init(); - if (err) - goto free_cachep; - - err = btrfs_ctree_init(); - if (err) - goto free_transaction; - - err = btrfs_free_space_init(); - if (err) - goto free_ctree; - - err = extent_state_init_cachep(); - if (err) - goto free_free_space; - - err = extent_buffer_init_cachep(); - if (err) - goto free_extent_cachep; - - err = btrfs_bioset_init(); - if (err) - goto free_eb_cachep; - - err = extent_map_init(); - if (err) - goto free_bioset; - - err = ordered_data_init(); - if (err) - goto free_extent_map; - - err = 
btrfs_delayed_inode_init(); - if (err) - goto free_ordered_data; - - err = btrfs_auto_defrag_init(); - if (err) - goto free_delayed_inode; - - err = btrfs_delayed_ref_init(); - if (err) - goto free_auto_defrag; - - err = btrfs_prelim_ref_init(); - if (err) - goto free_delayed_ref; - - err = btrfs_interface_init(); - if (err) - goto free_prelim_ref; + return register_filesystem(&btrfs_fs_type); +} - btrfs_print_mod_info(); +static void unregister_btrfs(void) +{ + unregister_filesystem(&btrfs_fs_type); +} - err = btrfs_run_sanity_tests(); - if (err) - goto unregister_ioctl; +/* Helper structure for long init/exit functions. */ +struct init_sequence { + int (*init_func)(void); + /* Can be NULL if the init_func doesn't need cleanup. */ + void (*exit_func)(void); +}; - err = register_filesystem(&btrfs_fs_type); - if (err) - goto unregister_ioctl; +static const struct init_sequence mod_init_seq[] = { + { + .init_func = btrfs_props_init, + .exit_func = NULL, + }, { + .init_func = btrfs_init_sysfs, + .exit_func = btrfs_exit_sysfs, + }, { + .init_func = btrfs_init_compress, + .exit_func = btrfs_exit_compress, + }, { + .init_func = btrfs_init_cachep, + .exit_func = btrfs_destroy_cachep, + }, { + .init_func = btrfs_transaction_init, + .exit_func = btrfs_transaction_exit, + }, { + .init_func = btrfs_ctree_init, + .exit_func = btrfs_ctree_exit, + }, { + .init_func = btrfs_free_space_init, + .exit_func = btrfs_free_space_exit, + }, { + .init_func = extent_state_init_cachep, + .exit_func = extent_state_free_cachep, + }, { + .init_func = extent_buffer_init_cachep, + .exit_func = extent_buffer_free_cachep, + }, { + .init_func = btrfs_bioset_init, + .exit_func = btrfs_bioset_exit, + }, { + .init_func = extent_map_init, + .exit_func = extent_map_exit, + }, { + .init_func = ordered_data_init, + .exit_func = ordered_data_exit, + }, { + .init_func = btrfs_delayed_inode_init, + .exit_func = btrfs_delayed_inode_exit, + }, { + .init_func = btrfs_auto_defrag_init, + .exit_func = btrfs_auto_defrag_exit, + }, { + .init_func = btrfs_delayed_ref_init, + .exit_func = btrfs_delayed_ref_exit, + }, { + .init_func = btrfs_prelim_ref_init, + .exit_func = btrfs_prelim_ref_exit, + }, { + .init_func = btrfs_interface_init, + .exit_func = btrfs_interface_exit, + }, { + .init_func = btrfs_print_mod_info, + .exit_func = NULL, + }, { + .init_func = btrfs_run_sanity_tests, + .exit_func = NULL, + }, { + .init_func = register_btrfs, + .exit_func = unregister_btrfs, + } +}; - return 0; +static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; -unregister_ioctl: - btrfs_interface_exit(); -free_prelim_ref: - btrfs_prelim_ref_exit(); -free_delayed_ref: - btrfs_delayed_ref_exit(); -free_auto_defrag: - btrfs_auto_defrag_exit(); -free_delayed_inode: - btrfs_delayed_inode_exit(); -free_ordered_data: - ordered_data_exit(); -free_extent_map: - extent_map_exit(); -free_bioset: - btrfs_bioset_exit(); -free_eb_cachep: - extent_buffer_free_cachep(); -free_extent_cachep: - extent_state_free_cachep(); -free_free_space: - btrfs_free_space_exit(); -free_ctree: - btrfs_ctree_exit(); -free_transaction: - btrfs_transaction_exit(); -free_cachep: - btrfs_destroy_cachep(); -free_compress: - btrfs_exit_compress(); - btrfs_exit_sysfs(); +static void __exit exit_btrfs_fs(void) +{ + int i; - return err; + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } } -static void __exit exit_btrfs_fs(void) +static int __init 
init_btrfs_fs(void) { - btrfs_free_space_exit(); - btrfs_ctree_exit(); - btrfs_transaction_exit(); - btrfs_destroy_cachep(); - btrfs_delayed_ref_exit(); - btrfs_auto_defrag_exit(); - btrfs_delayed_inode_exit(); - btrfs_prelim_ref_exit(); - ordered_data_exit(); - extent_map_exit(); - btrfs_bioset_exit(); - extent_state_free_cachep(); - extent_buffer_free_cachep(); - btrfs_interface_exit(); - unregister_filesystem(&btrfs_fs_type); - btrfs_exit_sysfs(); - btrfs_cleanup_fs_uuids(); - btrfs_exit_compress(); + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { + ASSERT(!mod_init_result[i]); + ret = mod_init_seq[i].init_func(); + if (ret < 0) + goto error; + mod_init_result[i] = true; + } + return 0; + +error: + /* + * If we call exit_btrfs_fs() it would cause section mismatch. + * As init_btrfs_fs() belongs to .init.text, while exit_btrfs_fs() + * belongs to .exit.text. + */ + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } + return ret; } late_initcall(init_btrfs_fs); -- cgit From d549ff7bdbe72f5e0e934895090fed2647e9813b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:20:25 +0200 Subject: btrfs: add helper for bit enumeration Define helper macro that can be used in enum {} to utilize the automatic increment to define all bits without directly defining the values or using additional linear bits. 1. capture the sequence value, N 2. use the value to define the given enum with N-th bit set 3. reset the sequence back to N Use for enums that do not require fixed values for symbolic names (like for on-disk structures): enum { ENUM_BIT(FIRST), ENUM_BIT(SECOND), ENUM_BIT(THIRD) }; Where the values would be 0x1, 0x2 and 0x4. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/misc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index f9850edfd726..69826ccd6644 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -10,6 +10,14 @@ #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +/* + * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. 
+ */ +#define ENUM_BIT(name) \ + __ ## name ## _BIT, \ + name = (1U << __ ## name ## _BIT), \ + __ ## name ## _SEQ = __ ## name ## _BIT + static inline void cond_wake_up(struct wait_queue_head *wq) { /* -- cgit From c7321b76dfcc8a1b8d7dad3eff3595bad28252e2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:27:45 +0200 Subject: btrfs: convert BTRFS_ILOCK-* defines to enum bit Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c0c0dc8fa99f..5adf76d89f9b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -34,6 +34,7 @@ #include "async-thread.h" #include "block-rsv.h" #include "locking.h" +#include "misc.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -3131,9 +3132,11 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, extern const struct dentry_operations btrfs_dentry_operations; /* Inode locking type flags, by default the exclusive lock is taken */ -#define BTRFS_ILOCK_SHARED (1U << 0) -#define BTRFS_ILOCK_TRY (1U << 1) -#define BTRFS_ILOCK_MMAP (1U << 2) +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); -- cgit From fd8d2951f478c095f5e65c5f14f2be4d4724c5ea Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:31:38 +0200 Subject: btrfs: convert extent_io page op defines to enum bits Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7929f054dda3..a5ec1475988f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -9,6 +9,7 @@ #include #include "compression.h" #include "ulist.h" +#include "misc.h" enum { EXTENT_BUFFER_UPTODATE, @@ -29,13 +30,15 @@ enum { }; /* these are flags for __process_pages_contig */ -#define PAGE_UNLOCK (1 << 0) -/* Page starts writeback, clear dirty bit and set writeback bit */ -#define PAGE_START_WRITEBACK (1 << 1) -#define PAGE_END_WRITEBACK (1 << 2) -#define PAGE_SET_ORDERED (1 << 3) -#define PAGE_SET_ERROR (1 << 4) -#define PAGE_LOCK (1 << 5) +enum { + ENUM_BIT(PAGE_UNLOCK), + /* Page starts writeback, clear dirty bit and set writeback bit */ + ENUM_BIT(PAGE_START_WRITEBACK), + ENUM_BIT(PAGE_END_WRITEBACK), + ENUM_BIT(PAGE_SET_ORDERED), + ENUM_BIT(PAGE_SET_ERROR), + ENUM_BIT(PAGE_LOCK), +}; /* * page->private values. 
Every page that is controlled by the extent -- cgit From d3b4d0fd5518fd000938c6cbd4817085c210dcbe Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:34:58 +0200 Subject: btrfs: convert EXTENT_* bits to enums Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 71 +++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index c71aa29f719d..673637ed2fa9 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,43 +3,48 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include "misc.h" + struct extent_changeset; struct io_failure_record; /* Bits for the extent state */ -#define EXTENT_DIRTY (1U << 0) -#define EXTENT_UPTODATE (1U << 1) -#define EXTENT_LOCKED (1U << 2) -#define EXTENT_NEW (1U << 3) -#define EXTENT_DELALLOC (1U << 4) -#define EXTENT_DEFRAG (1U << 5) -#define EXTENT_BOUNDARY (1U << 6) -#define EXTENT_NODATASUM (1U << 7) -#define EXTENT_CLEAR_META_RESV (1U << 8) -#define EXTENT_NEED_WAIT (1U << 9) -#define EXTENT_NORESERVE (1U << 11) -#define EXTENT_QGROUP_RESERVED (1U << 12) -#define EXTENT_CLEAR_DATA_RESV (1U << 13) -/* - * Must be cleared only during ordered extent completion or on error paths if we - * did not manage to submit bios and create the ordered extents for the range. - * Should not be cleared during page release and page invalidation (if there is - * an ordered extent in flight), that is left for the ordered extent completion. - */ -#define EXTENT_DELALLOC_NEW (1U << 14) -/* - * When an ordered extent successfully completes for a region marked as a new - * delalloc range, use this flag when clearing a new delalloc range to indicate - * that the VFS' inode number of bytes should be incremented and the inode's new - * delalloc bytes decremented, in an atomic way to prevent races with stat(2). - */ -#define EXTENT_ADD_INODE_BYTES (1U << 15) - -/* - * Set during truncate when we're clearing an entire range and we just want the - * extent states to go away. - */ -#define EXTENT_CLEAR_ALL_BITS (1U << 16) +enum { + ENUM_BIT(EXTENT_DIRTY), + ENUM_BIT(EXTENT_UPTODATE), + ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DELALLOC), + ENUM_BIT(EXTENT_DEFRAG), + ENUM_BIT(EXTENT_BOUNDARY), + ENUM_BIT(EXTENT_NODATASUM), + ENUM_BIT(EXTENT_CLEAR_META_RESV), + ENUM_BIT(EXTENT_NEED_WAIT), + ENUM_BIT(EXTENT_NORESERVE), + ENUM_BIT(EXTENT_QGROUP_RESERVED), + ENUM_BIT(EXTENT_CLEAR_DATA_RESV), + /* + * Must be cleared only during ordered extent completion or on error + * paths if we did not manage to submit bios and create the ordered + * extents for the range. Should not be cleared during page release + * and page invalidation (if there is an ordered extent in flight), + * that is left for the ordered extent completion. + */ + ENUM_BIT(EXTENT_DELALLOC_NEW), + /* + * When an ordered extent successfully completes for a region marked as + * a new delalloc range, use this flag when clearing a new delalloc + * range to indicate that the VFS' inode number of bytes should be + * incremented and the inode's new delalloc bytes decremented, in an + * atomic way to prevent races with stat(2). + */ + ENUM_BIT(EXTENT_ADD_INODE_BYTES), + /* + * Set during truncate when we're clearing an entire range and we just + * want the extent states to go away. 
+ */ + ENUM_BIT(EXTENT_CLEAR_ALL_BITS), +}; #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) -- cgit From e0a8b9a74767f748881b1e2408ed9afe7a26d9fd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:37:35 +0200 Subject: btrfs: convert QGROUP_* defines to enum bits The defines/enums are used only for tracepoints and are not part of the on-disk format. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/qgroup.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 578c77e94200..3fb5459c9309 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -11,6 +11,7 @@ #include #include "ulist.h" #include "delayed-ref.h" +#include "misc.h" /* * Btrfs qgroup overview @@ -242,9 +243,11 @@ static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) /* * For qgroup event trace points only */ -#define QGROUP_RESERVE (1<<0) -#define QGROUP_RELEASE (1<<1) -#define QGROUP_FREE (1<<2) +enum { + ENUM_BIT(QGROUP_RESERVE), + ENUM_BIT(QGROUP_RELEASE), + ENUM_BIT(QGROUP_FREE), +}; int btrfs_quota_enable(struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); -- cgit From cc37ea61920eca87316be1ae255e5e5e7b12f7b6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:43:06 +0200 Subject: btrfs: convert __TRANS_* defines to enum bits The base transaction bits can be defined as bits in a contiguous sequence, although right now there's a hole from bit 1 to 8. The bits are used for btrfs_trans_handle::type, and there's another set of TRANS_STATE_* defines that are for btrfs_transaction::state. They are mutually exclusive, though the hole in the sequence looks like it was made for the states. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1332f193b800..b39ebf98af60 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -10,6 +10,7 @@ #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" +#include "misc.h" enum btrfs_trans_state { TRANS_STATE_RUNNING, @@ -98,14 +99,15 @@ struct btrfs_transaction { struct list_head releasing_ebs; }; -#define __TRANS_FREEZABLE (1U << 0) - -#define __TRANS_START (1U << 9) -#define __TRANS_ATTACH (1U << 10) -#define __TRANS_JOIN (1U << 11) -#define __TRANS_JOIN_NOLOCK (1U << 12) -#define __TRANS_DUMMY (1U << 13) -#define __TRANS_JOIN_NOSTART (1U << 14) +enum { + ENUM_BIT(__TRANS_FREEZABLE), + ENUM_BIT(__TRANS_START), + ENUM_BIT(__TRANS_ATTACH), + ENUM_BIT(__TRANS_JOIN), + ENUM_BIT(__TRANS_JOIN_NOLOCK), + ENUM_BIT(__TRANS_DUMMY), + ENUM_BIT(__TRANS_JOIN_NOSTART), +}; #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) -- cgit From 7248e0cebbefaba94c0c37f708a134dad3acba0e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 9 Sep 2022 14:45:22 +0800 Subject: btrfs: skip update of block group item if used bytes are the same [BACKGROUND] When committing a transaction, we will update block group items for all dirty block groups. But in fact, dirty block groups don't always need to update their block group items. It's pretty common to have a metadata block group which experienced several COW operations, but still has the same amount of used bytes. In that case, we may unnecessarily COW a tree block while changing nothing.
[ENHANCEMENT] This patch introduces a btrfs_block_group::commit_used member to remember the last committed used bytes, and uses that new member to skip unnecessary block group item updates. This would be more common for large filesystems, where a metadata block group can be as large as 1GiB, containing at most 64K metadata items. In that case, if COW added and then deleted one metadata item near the end of the block group, then it's completely possible we don't need to touch the block group item at all. [BENCHMARK] The change itself can have quite a high chance (20~80%) to skip block group item updates in a lot of workloads. As a result, it leads to shorter time spent on btrfs_write_dirty_block_groups(), and overall reduces the execution time of the critical section of btrfs_commit_transaction(). Here comes a fio command, which will do random writes in 4K block size, causing very heavy metadata updates. fio --filename=$mnt/file --size=512M --rw=randwrite --direct=1 --bs=4k \ --ioengine=libaio --iodepth=64 --runtime=300 --numjobs=4 \ --name=random_write --fallocate=none --time_based --fsync_on_close=1 The file size (512M) and number of threads (4) mean 2GiB of file size in total, but during the full 300s run time, my dedicated SATA SSD is able to write around 20~25GiB, which is over 10 times the file size. Thus after we fill the initial 2G, we should not cause many block group item updates. Please note, the fio numbers by themselves don't change much, but if we look deeper, there is some reduced execution time, especially for the critical section of btrfs_commit_transaction(). I added extra trace_printk() to measure the following per-transaction execution times: - Critical section of btrfs_commit_transaction() By re-using the existing update_commit_stats() function, which has already calculated the interval correctly. - The while() loop for btrfs_write_dirty_block_groups() Although this includes the execution time of btrfs_run_delayed_refs(), it should still be representative overall. Both results involve transids 7~30, i.e. the same number of committed transactions. The result looks like this:

                        | Before      | After       | Diff
  ----------------------+-------------+-------------+--------
  Transaction interval  | 229247198.5 | 215016933.6 | -6.2%
  Block group interval  | 23133.33333 | 18970.83333 | -18.0%

The change in block group item updates is more obvious, as skipped block group item updates also mean fewer delayed refs. And the overall execution time for that block group update loop is pretty small, thus we can assume the extent tree is already mostly cached. If we could skip an uncached tree block, it would cause a more obvious change. Unfortunately the overall reduction in the commit transaction critical section is much smaller, as the block group item updates loop is not really the major part, at least not for the above fio script. But still we have an observable reduction in the critical section.
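The idea reduces to "cache what was last written and compare before writing again". A minimal userspace model of it follows (the structure and function names mirror the patch below, but the locking and the b+tree update are stubbed out, so this is a sketch, not the kernel code):

  #include <stdint.h>
  #include <stdio.h>

  struct block_group {
          uint64_t used;          /* current used bytes */
          uint64_t commit_used;   /* used bytes at the last written item */
  };

  /* Stand-in for the real b+tree update, which may fail. */
  static int write_item(const struct block_group *bg)
  {
          printf("writing block group item, used=%llu\n",
                 (unsigned long long)bg->used);
          return 0;
  }

  static int update_block_group_item(struct block_group *bg)
  {
          uint64_t old_commit_used = bg->commit_used;
          int ret;

          /* No change in used bytes since the last commit: skip the COW. */
          if (bg->commit_used == bg->used)
                  return 0;
          bg->commit_used = bg->used;

          ret = write_item(bg);
          if (ret < 0)
                  bg->commit_used = old_commit_used;      /* revert on error */
          return ret;
  }

  int main(void)
  {
          struct block_group bg = { .used = 4096, .commit_used = 0 };

          update_block_group_item(&bg);   /* 0 != 4096: writes the item */
          update_block_group_item(&bg);   /* unchanged: skipped */
          return 0;
  }

In the kernel variant the comparison and the commit_used update happen under the block group's spinlock, since @used can change concurrently; the model above leaves that out for brevity.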
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 28 +++++++++++++++++++++++++++- fs/btrfs/block-group.h | 6 ++++++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b1f4acdeafb9..47461fd21975 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2071,6 +2071,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); + cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); @@ -2762,6 +2763,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_block_group_item bgi; struct btrfs_key key; + u64 old_commit_used; + u64 used; + + /* + * Block group items update can be triggered out of commit transaction + * critical section, thus we need a consistent view of used bytes. + * We cannot use cache->used directly outside of the spin lock, as it + * may be changed. + */ + spin_lock(&cache->lock); + old_commit_used = cache->commit_used; + used = cache->used; + /* No change in used bytes, can safely skip it. */ + if (cache->commit_used == used) { + spin_unlock(&cache->lock); + return 0; + } + cache->commit_used = used; + spin_unlock(&cache->lock); key.objectid = cache->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2776,7 +2796,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, cache->used); + btrfs_set_stack_block_group_used(&bgi, used); btrfs_set_stack_block_group_chunk_objectid(&bgi, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); @@ -2784,6 +2804,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); + /* We didn't update the block group item, need to revert @commit_used. */ + if (ret < 0) { + spin_lock(&cache->lock); + cache->commit_used = old_commit_used; + spin_unlock(&cache->lock); + } return ret; } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 39e79bed10ae..6c970a486b68 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -99,6 +99,12 @@ struct btrfs_block_group { u64 cache_generation; u64 global_root_id; + /* + * The last committed used bytes of this block group, if the above @used + * is still the same as @commit_used, we don't need to update block + * group item of this block group. + */ + u64 commit_used; /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. -- cgit From 48acc47d7813a0e650754845161f04b0b27ff8ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:39 -0400 Subject: btrfs: do not use GFP_ATOMIC in the read endio We have done read endio in an async thread for a very, very long time, which makes the use of GFP_ATOMIC and unlock_extent_atomic() unneeded in our read endio path. We've noticed under heavy memory pressure in our fleet that we can fail these allocations, and then often trip a BUG_ON(!allocation), which isn't an ideal outcome. Begin to address this by simply not using GFP_ATOMIC, which will allow us to do things like actually allocate a extent state when doing set_extent_bits(UPTODATE) in the endio handler. 
End io handlers are not called in atomic context, besides we have been allocating failrec with GFP_NOFS so we'd notice there's a problem. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4e4f28387ace..78d7ea10621d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -897,9 +897,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) end_page_read(page, uptodate, offset, sectorsize); if (uptodate) set_extent_uptodate(&inode->io_tree, offset, - offset + sectorsize - 1, &cached, GFP_ATOMIC); - unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1, - &cached); + offset + sectorsize - 1, &cached, GFP_NOFS); + unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, + &cached); } static void submit_data_read_repair(struct inode *inode, @@ -1103,7 +1103,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. */ - unlock_extent_atomic(tree, processed->start, processed->end, &cached); + unlock_extent(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ -- cgit From da2a071b6f142f47072ea7b37e03d3ef317b8a4e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:40 -0400 Subject: btrfs: remove unused unlock_extent_atomic As of "btrfs: do not use GFP_ATOMIC in the read endio" we no longer have any users of unlock_extent_atomic, remove it. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 673637ed2fa9..d73ef24bad2e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -145,13 +145,6 @@ static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, GFP_NOFS, NULL); } -static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, - GFP_ATOMIC, NULL); -} - static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { -- cgit From 5a75034e71ef5ec0fce983afcb6c9cb0147cd5b9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:41 -0400 Subject: btrfs: do not panic if we can't allocate a prealloc extent state We sometimes have to allocate new extent states when clearing or setting new bits in an extent io tree. Generally we preallocate this before taking the tree spin lock, but we can use this preallocated extent state sometimes and then need to try to do a GFP_ATOMIC allocation under the lock. Unfortunately sometimes this fails, and then we hit the BUG_ON() and bring the box down. This happens roughly 20 times a week in our fleet. However the vast majority of callers use GFP_NOFS, which means that if this GFP_ATOMIC allocation fails, we could simply drop the spin lock, go back and allocate a new extent state with our given gfp mask, and begin again from where we left off. For the remaining callers that do not use GFP_NOFS, they are generally using GFP_NOWAIT, which still allows for some reclaim. 
So allow these allocations to attempt to happen outside of the spin lock so we don't need to rely on GFP_ATOMIC allocations. This in essence creates an infinite loop for anything that isn't GFP_NOFS. To address this we may want to migrate to using mempools for extent states so that we will always have emergency reserves in order to make our allocations. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index a630c771d25c..599db7b15574 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -572,7 +572,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -636,7 +636,8 @@ hit_next: if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, start); if (err) extent_io_tree_panic(tree, err); @@ -657,7 +658,8 @@ hit_next: */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) extent_io_tree_panic(tree, err); @@ -987,7 +989,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, else ASSERT(failed_start == NULL && failed_state == NULL); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -1012,7 +1014,8 @@ again: state = tree_search_for_insert(tree, start, &p, &parent); if (!state) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; prealloc->start = start; prealloc->end = end; insert_state_fast(tree, prealloc, p, parent, bits, changeset); @@ -1085,7 +1088,8 @@ hit_next: } prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, start); if (err) extent_io_tree_panic(tree, err); @@ -1122,7 +1126,8 @@ hit_next: this_end = last_start - 1; prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; /* * Avoid to free 'prealloc' if it can be merged with the later @@ -1154,7 +1159,8 @@ hit_next: } prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) extent_io_tree_panic(tree, err); -- cgit From 467761f90405004e58795116aa7d81f3d298dcfb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 30 Sep 2022 17:23:01 +0200 Subject: btrfs: sysfs: convert remaining scnprintf to sysfs_emit The sysfs_emit is the safe API for writing to the sysfs files, previously converted from scnprintf, there's one left to do in btrfs_read_policy_show. 
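The conversion follows the usual pattern for multi-value sysfs attributes. A hedged sketch of such a show callback (the attribute and the names[] array here are hypothetical, not btrfs code; the real conversion is in the diff below):

  #include <linux/kernel.h>
  #include <linux/kobject.h>
  #include <linux/sysfs.h>

  static const char * const names[] = { "pid", "roundrobin" };

  static ssize_t policy_show(struct kobject *kobj,
                             struct kobj_attribute *attr, char *buf)
  {
          int active = 0;         /* assume the first policy is active */
          ssize_t ret = 0;
          int i;

          for (i = 0; i < ARRAY_SIZE(names); i++) {
                  if (i == active)
                          ret += sysfs_emit_at(buf, ret, "%s[%s]",
                                               (ret == 0 ? "" : " "),
                                               names[i]);
                  else
                          ret += sysfs_emit_at(buf, ret, "%s%s",
                                               (ret == 0 ? "" : " "),
                                               names[i]);
          }
          ret += sysfs_emit_at(buf, ret, "\n");
          return ret;
  }

Unlike the scnprintf() version, sysfs_emit_at() knows the buffer is a single page, bounds the output to PAGE_SIZE on its own and warns if the offset ever runs past it, so the bounds bookkeeping disappears from the caller.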
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 74fef1f49c35..910702df7767 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1160,16 +1160,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (fs_devices->read_policy == i) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]", + ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); else - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + ret += sysfs_emit_at(buf, ret, "%s%s", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } -- cgit From 63a7cb13071842966c1ce931edacbc23573aada5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 26 Jul 2022 20:54:10 +0200 Subject: btrfs: auto enable discard=async when possible There's a request to automatically enable async discard for capable devices. We can do that, the async mode is designed to wait for larger freed extents and is not intrusive, with limits to iops, kbps or latency. The status and tunables will be exported in /sys/fs/btrfs/FSID/discard . The automatic selection is done if there's at least one discard capable device in the filesystem (not capable devices are skipped). Mounting with any other discard option will honor that option, notably mounting with nodiscard will keep it disabled. Link: https://lore.kernel.org/linux-btrfs/CAEg-Je_b1YtdsCR0zS5XZ_SbvJgN70ezwvRwLiCZgDGLbeMB=w@mail.gmail.com/ Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 14 ++++++++++++++ fs/btrfs/super.c | 2 ++ fs/btrfs/volumes.c | 3 +++ fs/btrfs/volumes.h | 2 ++ 5 files changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5adf76d89f9b..3485f60efa33 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1396,6 +1396,7 @@ enum { BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), }; #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5705335c80c8..cf59d8f14a05 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3746,6 +3746,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); } + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. 
+ */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); + btrfs_clear_opt(fs_info->mount_opt, NODISCARD); + } + /* * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3a33dd9847d9..42e0d2fcc407 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -918,12 +918,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, ret = -EINVAL; goto out; } + btrfs_clear_opt(info->mount_opt, NODISCARD); break; case Opt_nodiscard: btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); btrfs_clear_and_info(info, DISCARD_ASYNC, "turning off async discard"); + btrfs_set_opt(info->mount_opt, NODISCARD); break; case Opt_space_cache: case Opt_space_cache_version: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 635f45f1a2ef..796e9f5ff8f8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -641,6 +641,9 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!bdev_nonrot(bdev)) fs_devices->rotating = true; + if (bdev_max_discard_sectors(bdev)) + fs_devices->discardable = true; + device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 099def5613b8..a20ee7d57831 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -354,6 +354,8 @@ struct btrfs_fs_devices { * nonrot flag set */ bool rotating; + /* Devices support TRIM/discard commands */ + bool discardable; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ -- cgit From b307f06d37ca12495e4cfff8d456cd92f045f947 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Oct 2022 16:06:38 +0200 Subject: btrfs: simplify generation check in btrfs_get_dentry Callers that pass non-zero generation always want to perform the generation check, we can simply encode that in one parameter and drop check_generation. Add function documentation. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/export.c | 23 +++++++++++++++++------ fs/btrfs/export.h | 3 +-- fs/btrfs/ioctl.c | 2 +- 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index fab7eb76e53b..a51a5dfa737c 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -57,9 +57,20 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } +/* + * Read dentry of inode with @objectid from filesystem root @root_objectid. + * + * @sb: the filesystem super block + * @objectid: inode objectid + * @root_objectid: object id of the subvolume root where to look up the inode + * @generation: optional, if not zero, verify that the found inode + * generation matches + * + * Return dentry alias for the inode, otherwise an error. In case the + * generation does not match return ESTALE. 
+ */ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u64 generation, - int check_generation) + u64 root_objectid, u64 generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; @@ -77,7 +88,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (check_generation && generation != inode->i_generation) { + if (generation != 0 && generation != inode->i_generation) { iput(inode); return ERR_PTR(-ESTALE); } @@ -106,7 +117,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, objectid = fid->parent_objectid; generation = fid->parent_gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, @@ -128,7 +139,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, root_objectid = fid->root_objectid; generation = fid->gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } struct dentry *btrfs_get_parent(struct dentry *child) @@ -188,7 +199,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { return btrfs_get_dentry(fs_info->sb, key.objectid, - found_key.offset, 0, 0); + found_key.offset, 0); } return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 5afb7ca42828..eba6bc4f5a61 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,8 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u64 generation, - int check_generation); + u64 root_objectid, u64 generation); struct dentry *btrfs_get_parent(struct dentry *child); #endif diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5ba2e810dc6e..a482739c1d82 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3274,7 +3274,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - vol_args2->subvolid, 0, 0); + vol_args2->subvolid, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_drop_write; -- cgit From 875c627c5f2045e2b5f0ad6692799e8fb931f3cb Mon Sep 17 00:00:00 2001 From: Wang Yugui Date: Wed, 19 Oct 2022 16:10:01 +0800 Subject: btrfs: send: add define for v2 buffer size Add a define, BTRFS_SEND_BUF_SIZE_V2, for the data buffer size (though the maximum size is not limited by it) so it's more visible.
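To make the arithmetic concrete, a compile-time sanity check; this is a sketch assuming the current BTRFS_MAX_COMPRESSED of SZ_128K and 4K pages, and the DEMO_* names mirror rather than reuse the real defines:

#include <linux/align.h>
#include <linux/build_bug.h>
#include <linux/sizes.h>

/* Assumed values: 128K maximum compressed extent, 4K page size */
#define DEMO_MAX_COMPRESSED	SZ_128K
#define DEMO_SEND_BUF_SIZE_V2	ALIGN(SZ_16K + DEMO_MAX_COMPRESSED, SZ_4K)

/* 16K of room for the command header plus one full compressed extent */
static_assert(DEMO_SEND_BUF_SIZE_V2 == 144 * SZ_1K);

Under those assumptions the v2 buffer is 144K, which is already a multiple of the page size, so the ALIGN() is a no-op today and only matters if the constants change.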
Signed-off-by: Wang Yugui Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- fs/btrfs/send.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1c4b693ee4a3..80104b5af32f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7901,7 +7901,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (sctx->proto >= 2) { u32 send_buf_num_pages; - sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2; sctx->send_buf = vmalloc(sctx->send_max_size); if (!sctx->send_buf) { ret = -ENOMEM; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index f7585cfa7e52..4f5509cb1803 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -18,10 +18,12 @@ #endif /* - * In send stream v1, no command is larger than 64K. In send stream v2, no limit - * should be assumed. + * In send stream v1, no command is larger than 64K. In send stream v2, no + * limit should be assumed, the buffer size is set to be a header with + * compressed extent size. */ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K +#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) struct inode; struct btrfs_ioctl_send_args; -- cgit From c7f13d428ea1bfe883f2741a9b5a5352d595eb09 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:47 -0400 Subject: btrfs: move fs wide helpers out of ctree.h We have several fs wide related helpers in ctree.h. The bulk of these are the incompat flag test helpers, but there are things such as btrfs_fs_closing() and the read only helpers that also aren't directly related to the ctree code. Move these into a fs.h header, which will serve as the location for file system wide related helpers. 
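As a rough sketch of the kind of filesystem-wide helper this header is meant to host: only btrfs_fs_closing() and btrfs_need_cleaner_sleep() are real interfaces (visible in the diff below); the loop itself is invented for illustration:

#include <linux/sched.h>
#include "fs.h"

/* Hypothetical background worker built on the moved helpers */
static int demo_cleaner_loop(struct btrfs_fs_info *fs_info)
{
	/* Run until the filesystem starts tearing down */
	while (!btrfs_fs_closing(fs_info)) {
		if (btrfs_need_cleaner_sleep(fs_info)) {
			/* R/O remount or unmount in flight: just sleep */
			schedule_timeout_interruptible(HZ);
			continue;
		}
		/* ... one round of background cleaning work ... */
	}
	return 0;
}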
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.h | 164 --------------------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 92 +++++++++++++++++++++++++ fs/btrfs/fs.h | 85 +++++++++++++++++++++++ fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + fs/btrfs/zoned.c | 1 + 26 files changed, 200 insertions(+), 165 deletions(-) create mode 100644 fs/btrfs/fs.c create mode 100644 fs/btrfs/fs.h (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index fa9ddcc9eb0b..eebb45c06485 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 232f49415ab9..f76db317c437 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -15,6 +15,7 @@ #include "locking.h" #include "misc.h" #include "tree-mod-log.h" +#include "fs.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 47461fd21975..bf8d1e110136 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -17,6 +17,7 @@ #include "discard.h" #include "raid56.h" #include "zoned.h" +#include "fs.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3485f60efa33..1b2a1dcb60dd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2857,44 +2857,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); -static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) -{ - /* - * Do it this way so we only ever do one test_bit in the normal case. - */ - if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { - if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) - return 2; - return 1; - } - return 0; -} - -/* - * If we remount the fs to be R/O or umount the fs, the cleaner needn't do - * anything except sleeping. This function is used to check the status of - * the fs. - * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, - * since setting and checking for SB_RDONLY in the superblock's flags is not - * atomic. 
- */ -static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || - btrfs_fs_closing(fs_info); -} - -static inline void btrfs_set_sb_rdonly(struct super_block *sb) -{ - sb->s_flags |= SB_RDONLY; - set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -static inline void btrfs_clear_sb_rdonly(struct super_block *sb) -{ - sb->s_flags &= ~SB_RDONLY; - clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, @@ -3529,132 +3491,6 @@ do { \ } while (0) -/* compatibility and incompatibility defines */ - -#define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "setting incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "clearing incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_incompat(fs_info, opt) \ - __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) - -static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_incompat_flags(disk_super) & flag); -} - -#define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "setting compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - 
disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "clearing compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_compat_ro(fs_info, opt) \ - __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) - -static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_compat_ro_flags(disk_super) & flag); -} - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cf59d8f14a05..5e20b191b108 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,6 +43,7 @@ #include "space-info.h" #include "zoned.h" #include "subpage.h" +#include "fs.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2801c991814f..bc010dbcb6b1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -36,6 +36,7 @@ #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" +#include "fs.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6bb9fa961a6a..824ff54d8155 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "print-tree.h" #include "compression.h" +#include "fs.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 19b41b5fe6c0..a352c7cacc99 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,6 +30,7 @@ #include "delalloc-space.h" #include "reflink.h" #include "subpage.h" +#include "fs.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 367bcfcf68f5..bfc21eb8ec63 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -11,6 +11,7 @@ #include "free-space-tree.h" #include "transaction.h" #include "block-group.h" +#include "fs.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c new file mode 100644 index 000000000000..d4ba948eba56 --- /dev/null +++ b/fs/btrfs/fs.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "fs.h" + +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 
features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h new file mode 100644 index 000000000000..8eda9ce0a904 --- /dev/null +++ b/fs/btrfs/fs.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FS_H +#define BTRFS_FS_H + +/* Compatibility and incompatibility defines */ +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); + +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_incompat_flags(disk_super) & flag); +} + +static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_compat_ro_flags(disk_super) & flag); +} + +static inline int btrfs_fs_closing(struct 
btrfs_fs_info *fs_info) +{ + /* Do it this way so we only ever do one test_bit in the normal case. */ + if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { + if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) + return 2; + return 1; + } + return 0; +} + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, + * since setting and checking for SB_RDONLY in the superblock's flags is not + * atomic. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || + btrfs_fs_closing(fs_info); +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50584b93a66f..60b3162c035d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -55,6 +55,7 @@ #include "zoned.h" #include "subpage.h" #include "inode-item.h" +#include "fs.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a482739c1d82..564a4ae9c285 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,6 +50,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "subpage.h" +#include "fs.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index e04289347775..d2c699c9aa51 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -11,6 +11,7 @@ #include "xattr.h" #include "compression.h" #include "space-info.h" +#include "fs.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b74105a10f16..e87a2f066f4d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -24,6 +24,7 @@ #include "block-group.h" #include "sysfs.h" #include "tree-mod-log.h" +#include "fs.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 216a4485d914..977afbb4cd45 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -28,6 +28,7 @@ #include "zoned.h" #include "inode-item.h" #include "space-info.h" +#include "fs.h" /* * Relocation overview diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cdda4b2f20f2..6871c1c54f8c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -21,6 +21,7 @@ #include "raid56.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" /* * This is only the first step towards a full-features scrub. 
It reads all diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index af2e133aaa5c..af1538c2685d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,7 @@ #include "transaction.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 42e0d2fcc407..ef646a1d9bc2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -49,6 +49,7 @@ #include "discard.h" #include "qgroup.h" #include "raid56.h" +#include "fs.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ae7d4aca771d..bae77fb05e2b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,6 +23,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "fs.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43f905ab0a18..862d67798de5 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -25,6 +25,7 @@ #include "volumes.h" #include "misc.h" #include "btrfs_inode.h" +#include "fs.h" /* * Error message should follow the following format: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c3cf3dabe0b1..e294c38f9b19 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "zoned.h" #include "inode-item.h" +#include "fs.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index b458452a1aaf..2d7eb290fb9c 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "disk-io.h" #include "print-tree.h" +#include "fs.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index ee00e33c309e..ab0b39badbbe 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "disk-io.h" #include "locking.h" +#include "fs.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 796e9f5ff8f8..d65d5d7835fe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -33,6 +33,7 @@ #include "block-group.h" #include "discard.h" #include "zoned.h" +#include "fs.h" static struct bio_set btrfs_bioset; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c9e2b0c85309..2a7d856c232c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "dev-replace.h" #include "space-info.h" +#include "fs.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 -- cgit From e118578a8df7941a9bbc568851997852e5bc7338 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:48 -0400 Subject: btrfs: move assert helpers out of ctree.h These call functions that aren't defined in, or will be moved out of, ctree.h. Move them to super.c where the other assert/error message code is defined.
Drop the __noreturn attribute for btrfs_assertfail as objtool does not like it and fails with warnings like fs/btrfs/dir-item.o: warning: objtool: .text.unlikely: unexpected end of section fs/btrfs/xattr.o: warning: objtool: btrfs_setxattr() falls through to next function btrfs_setxattr_trans.cold() fs/btrfs/xattr.o: warning: objtool: .text.unlikely: unexpected end of section Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 18 +++--------------- fs/btrfs/super.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1b2a1dcb60dd..e5e4079c92d4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3333,18 +3333,11 @@ do { \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -__cold __noreturn -static inline void assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} +void __cold btrfs_assertfail(const char *expr, const char *file, int line); #define ASSERT(expr) \ - (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__)) - + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) #else -static inline void assertfail(const char *expr, const char* file, int line) { } #define ASSERT(expr) (void)(expr) #endif @@ -3404,12 +3397,7 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -__cold -static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); __printf(5, 6) __cold diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef646a1d9bc2..06661bcb991b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -305,6 +305,20 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line) +{ + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +} +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + #if BITS_PER_LONG == 32 void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) { -- cgit From 9b569ea0be6fb27a4985acc9325896a3edc95ede Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:49 -0400 Subject: btrfs: move the printk helpers out of ctree.h We have a bunch of printk helpers that are in ctree.h. These have nothing to do with ctree.c, so move them into their own header. Subsequent patches will clean up the printk helpers.
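For reference, a short sketch of how the relocated wrappers are consumed; demo_report() and its messages are invented, while btrfs_err() and btrfs_debug() are the real macros being moved:

#include "messages.h"

static void demo_report(struct btrfs_fs_info *fs_info, u64 bytenr, int err)
{
	if (err)
		/* Emits "BTRFS error (device ...): ..." at KERN_ERR */
		btrfs_err(fs_info, "demo: block %llu failed: %d", bytenr, err);
	else
		/* Compiled away unless DEBUG or dynamic debug is enabled */
		btrfs_debug(fs_info, "demo: block %llu ok", bytenr);
}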
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.h | 1 + fs/btrfs/check-integrity.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 249 ------------------------------------------ fs/btrfs/delalloc-space.c | 1 + fs/btrfs/delayed-inode.c | 1 + fs/btrfs/delayed-ref.c | 1 + fs/btrfs/dir-item.c | 1 + fs/btrfs/extent-io-tree.c | 1 + fs/btrfs/extent_map.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/lzo.c | 1 + fs/btrfs/messages.h | 259 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.c | 1 + fs/btrfs/print-tree.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/raid56.c | 1 + fs/btrfs/ref-verify.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/struct-funcs.c | 1 + fs/btrfs/subpage.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/sysfs.c | 2 +- fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-log.h | 1 + fs/btrfs/tree-mod-log.c | 1 + fs/btrfs/ulist.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/xattr.c | 1 + fs/btrfs/zoned.h | 1 + 36 files changed, 293 insertions(+), 250 deletions(-) create mode 100644 fs/btrfs/messages.h (limited to 'fs') diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 6dac462430b0..fa3c9cbf9ae7 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -7,6 +7,7 @@ #define BTRFS_BACKREF_H #include +#include "messages.h" #include "ulist.h" #include "disk-io.h" #include "extent_io.h" diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 98c6e5feab19..e8e1a92b30ac 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -82,6 +82,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d645933ef135..f423b86f5262 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -8,6 +8,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e5e4079c92d4..d27d059e4d59 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3168,179 +3168,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); -static inline __printf(2, 3) __cold -void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ -} - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_printk(fs_info, fmt, args...) \ -do { \ - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ - _btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#elif defined(CONFIG_PRINTK) - -#define btrfs_printk(fs_info, fmt, args...) \ - _btrfs_printk(fs_info, fmt, ##args) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#else - -#define btrfs_printk(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, fmt, ##args) -#endif - -#define btrfs_emerg(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err(fs_info, fmt, args...) 
\ - btrfs_printk(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use printk_in_rcu - */ -#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk_in_rcu - */ -#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk - */ -#define btrfs_emerg_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) -#define btrfs_debug(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ - fs_info, KERN_DEBUG fmt, ##args) -#elif defined(DEBUG) -#define btrfs_debug(fs_info, fmt, args...) 
\ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) -#else -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#endif - -#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_no_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - if (__ratelimit(&_rs)) \ - btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk_ratelimited(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line); - -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) -#else -#define ASSERT(expr) (void)(expr) -#endif - #if BITS_PER_LONG == 32 #define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) /* @@ -3397,88 +3224,12 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); - -__printf(5, 6) -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -const char * __attribute_const__ btrfs_decode_error(int errno); - -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ -do { \ - printk_index_subsys_emit( \ - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ - KERN_CRIT, fmt); \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -#else - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) - -#endif - #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 118b2e20b2e1..045545145a2b 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "ctree.h" #include "delalloc-space.h" #include "block-rsv.h" diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a411f04a7b97..8cf5ee646147 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,6 +6,7 @@ #include #include +#include "messages.h" #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 36a3debe9493..c775ff4f1cb1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -6,6 +6,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 72fb2c518a2b..be5c1c2a8da5 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. 
*/ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 599db7b15574..2cdff74ff033 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -2,6 +2,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "extent-io-tree.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 715979807ae1..f97508afb659 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "volumes.h" #include "extent_map.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 824ff54d8155..675987e2d652 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -9,6 +9,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bd68fafcccc7..83d866f5ab6c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "free-space-cache.h" diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index bfc21eb8ec63..026214d74a02 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index d4ba948eba56..a59504b59435 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "ctree.h" #include "fs.h" diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 366f3a788c6a..b301d8e3df87 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "inode-item.h" #include "disk-io.h" diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 89bc5f825e0a..6751874a3e69 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -13,6 +13,7 @@ #include #include #include +#include "messages.h" #include "compression.h" #include "ctree.h" diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h new file mode 100644 index 000000000000..ace5bb02820a --- /dev/null +++ b/fs/btrfs/messages.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MESSAGES_H +#define BTRFS_MESSAGES_H + +#include + +struct btrfs_fs_info; +struct btrfs_trans_handle; + +static inline __printf(2, 3) __cold +void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} + +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_printk(fs_info, fmt, args...) \ +do { \ + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ + _btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#elif defined(CONFIG_PRINTK) + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#else + +#define btrfs_printk(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) +#endif + +#define btrfs_emerg(fs_info, fmt, args...) 
\ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) 
\ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + fs_info, KERN_DEBUG fmt, ##args) +#elif defined(DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#endif + +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_no_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line); + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) (void)(expr) +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); + +__printf(5, 6) +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +const char * __attribute_const__ btrfs_decode_error(int errno); + +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); + +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ +do { \ + printk_index_subsys_emit( \ + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ + KERN_CRIT, fmt); \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +#else + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +#endif + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index de2b716d3e7b..1cbaacdc50da 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -7,6 +7,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "transaction.h" diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index dd8777872143..708facaede2c 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "print-tree.h" diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index d2c699c9aa51..6d88bf0a9b17 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include +#include "messages.h" #include "props.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c009c0a2081e..349270537c2e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,6 +13,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index a248f46cfe72..f7535b8b62f5 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f50586ff85c8..6179864de6e7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -2,6 +2,7 @@ #include #include +#include "messages.h" #include "compression.h" #include "ctree.h" #include "delalloc-space.h" diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index e1f599d7a916..44c8c8ad0a16 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 12455b2b41de..6ba16c018d7f 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -5,6 +5,7 @@ #include +#include "messages.h" #include "ctree.h" static bool check_setget_bounds(const struct extent_buffer *eb, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 9a176af847d7..dd46b978ac2c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "messages.h" #include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 06661bcb991b..7debbcfc5d47 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,6 +26,7 @@ #include #include #include +#include "messages.h" 
#include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 910702df7767..0b6bea00742c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -10,7 +10,7 @@ #include #include #include - +#include "messages.h" #include "ctree.h" #include "discard.h" #include "disk-io.h" diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 862d67798de5..fa9536110d69 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -18,6 +18,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "tree-checker.h" #include "disk-io.h" diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index aed1e05e9879..f5770829d075 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,6 +6,7 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include "messages.h" #include "ctree.h" #include "transaction.h" diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 18b4699bcb11..41e5d9f789dc 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "tree-mod-log.h" #include "disk-io.h" diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 3374c9e9be67..f2f20c8d84aa 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,6 +5,7 @@ */ #include +#include "messages.h" #include "ulist.h" #include "ctree.h" diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 2d7eb290fb9c..190f752a2e10 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index ab0b39badbbe..35445855df4d 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -10,6 +10,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5bb8d8c86311..d12903f01f83 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -12,6 +12,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 8bd16d40b7c6..f43990985d80 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" -- cgit From bbde07a40a13cc5a3483eaeb52e9bb3b61d2cf57 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:50 -0400 Subject: btrfs: push printk index code into their respective helpers The printk index work can be pushed into the printk helpers themselves, this allows us to further sanitize messages.h, removing the last include in the header itself. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/messages.h | 29 +---------------------------- fs/btrfs/super.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index ace5bb02820a..3772358f8d30 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -13,19 +13,7 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_printk(fs_info, fmt, args...) 
\ -do { \ - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ - _btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#elif defined(CONFIG_PRINTK) +#ifdef CONFIG_PRINTK #define btrfs_printk(fs_info, fmt, args...) \ _btrfs_printk(fs_info, fmt, ##args) @@ -223,25 +211,10 @@ do { \ __LINE__, (errno), first); \ } while (0) -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - printk_index_subsys_emit( \ - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ - KERN_CRIT, fmt); \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -#else - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) -#endif - __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7debbcfc5d47..3aeb2106e119 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -181,6 +181,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char *errstr; #endif +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", + KERN_CRIT, fmt); +#endif + /* * Special case: if the error is EROFS, and we're already * under SB_RDONLY, then it is safe here. @@ -273,6 +279,10 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, const char *type = logtypes[4]; struct ratelimit_state *ratelimit = &printk_limits[4]; +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + va_start(args, fmt); while ((kern_level = printk_get_level(fmt)) != 0) { -- cgit From ec8eb376e271ed2b8724bf488f4c74d2746d5446 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:51 -0400 Subject: btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h We're going to use fs.h to hold fs wide related helpers and definitions, move the FS_STATE enum and related helpers to fs.h, and then update all files that need these definitions to include fs.h. 
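To illustrate the intended layering (a minimal sketch, not part of this patch; the function and its error code are hypothetical), a caller that only needs the fs-wide state test can now get it from fs.h rather than dragging in all of ctree.h:

#include "fs.h"

static int hypothetical_op(struct btrfs_fs_info *fs_info)
{
	/* BTRFS_FS_ERROR() wraps test_bit(BTRFS_FS_STATE_ERROR, ...) */
	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;
	/* ... do the actual work ... */
	return 0;
}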
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 46 ----------------------------------------- fs/btrfs/delalloc-space.c | 1 + fs/btrfs/delayed-inode.c | 3 ++- fs/btrfs/dev-replace.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/free-space-cache.c | 3 ++- fs/btrfs/fs.h | 49 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode-item.c | 3 ++- fs/btrfs/reflink.c | 3 ++- fs/btrfs/root-tree.c | 3 ++- fs/btrfs/sysfs.c | 1 + fs/btrfs/tests/btrfs-tests.c | 1 + fs/btrfs/xattr.c | 3 ++- 15 files changed, 68 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 65a4e5087215..c0615af0434f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -23,6 +23,7 @@ #include #include "misc.h" #include "ctree.h" +#include "fs.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f423b86f5262..7ecb658500ce 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@ #include "qgroup.h" #include "tree-mod-log.h" #include "tree-checker.h" +#include "fs.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d27d059e4d59..6f1eefbe3691 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -67,37 +67,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -/* - * Runtime (in-memory) states of filesystem - */ -enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, - /* - * Filesystem is being remounted, allow to skip some operations, like - * defrag - */ - BTRFS_FS_STATE_REMOUNTING, - /* Filesystem in RO mode */ - BTRFS_FS_STATE_RO, - /* Track if a transaction abort has been reported on this filesystem */ - BTRFS_FS_STATE_TRANS_ABORTED, - /* - * Bio operations should be blocked on this filesystem because a source - * or target device is being destroyed as part of a device replace - */ - BTRFS_FS_STATE_DEV_REPLACING, - /* The btrfs_fs_info created for self-tests */ - BTRFS_FS_STATE_DUMMY_FS_INFO, - - BTRFS_FS_STATE_NO_CSUMS, - - /* Indicates there was an error cleaning up a log tree. 
*/ - BTRFS_FS_STATE_LOG_CLEANUP_ERROR, - - BTRFS_FS_STATE_COUNT -}; - #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); @@ -3224,12 +3193,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) -#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ - (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ - &(fs_info)->fs_state))) - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); @@ -3327,15 +3290,6 @@ static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); -} -#else -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) -{ - return 0; -} #endif static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 045545145a2b..605d8874a446 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "qgroup.h" #include "block-group.h" +#include "fs.h" /* * HOW DOES THIS WORK diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 8cf5ee646147..2f68570fbb53 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,12 +6,13 @@ #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" -#include "ctree.h" #include "qgroup.h" #include "locking.h" #include "inode-item.h" diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 61e58066b5fd..348aef453e69 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -23,6 +23,7 @@ #include "sysfs.h" #include "zoned.h" #include "block-group.h" +#include "fs.h" /* * Device replace overview diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 78d7ea10621d..aa2d60f683bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -30,6 +30,7 @@ #include "zoned.h" #include "block-group.h" #include "compression.h" +#include "fs.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 83d866f5ab6c..703902156f97 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,9 +11,10 @@ #include #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "misc.h" -#include "ctree.h" #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8eda9ce0a904..25487af14717 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,37 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +/* + * Runtime (in-memory) states of filesystem + */ +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Filesystem in RO mode */ + BTRFS_FS_STATE_RO, + /* Track if a transaction abort has been reported on this filesystem */ + 
BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT +}; + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -82,4 +113,22 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} +#else +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return 0; +} +#endif + #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b301d8e3df87..25e9f1d65067 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -3,8 +3,9 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "inode-item.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 6179864de6e7..daf65bfad30e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -2,9 +2,10 @@ #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "compression.h" -#include "ctree.h" #include "delalloc-space.h" #include "disk-io.h" #include "reflink.h" diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 44c8c8ad0a16..112b4bf3c3b8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -5,8 +5,9 @@ #include #include -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 0b6bea00742c..31fb6cb389b1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,6 +22,7 @@ #include "block-group.h" #include "qgroup.h" #include "misc.h" +#include "fs.h" /* * Structure name Path diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d43cb5242fec..669fa7133a2f 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -16,6 +16,7 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../block-group.h" +#include "../fs.h" static struct vfsmount *test_mnt = NULL; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index d12903f01f83..b26c869f0226 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -12,8 +12,9 @@ #include #include #include -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" -- cgit From 0d3a9cf8c3068136145b90841778e3f1ac13f22a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:52 -0400 Subject: btrfs: convert incompat and compat flag test helpers to macros These helpers use functions not defined in fs.h, they're simply accessors of the super block in 
fs_info, convert them to macros so that we don't have a weird dependency between fs.h and accessors.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 25487af14717..682542ed964f 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -44,6 +44,12 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, const char *name); +#define __btrfs_fs_incompat(fs_info, flags) \ + (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags))) + +#define __btrfs_fs_compat_ro(fs_info, flags) \ + (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags))) + #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) @@ -62,20 +68,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) -static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_incompat_flags(disk_super) & flag); -} - -static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_compat_ro_flags(disk_super) & flag); -} - static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ -- cgit From fc97a410bd781d00634194bfcddfd2be4185d954 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:53 -0400 Subject: btrfs: move mount option definitions to fs.h These are fs wide definitions and helpers, move them out of ctree.h and into fs.h. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 1 + fs/btrfs/ctree.h | 63 -------------------------------------------------- fs/btrfs/delayed-ref.c | 1 + fs/btrfs/discard.c | 1 + fs/btrfs/fs.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ref-verify.c | 1 + 6 files changed, 67 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 89e3e7d1bff6..a48fa7ac74cc 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -7,6 +7,7 @@ #include "transaction.h" #include "block-group.h" #include "disk-io.h" +#include "fs.h" /* * HOW DO BLOCK RESERVES WORK diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f1eefbe3691..86af9d36fdf7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1328,69 +1328,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* - * Flags for mount options. 
- * - * Note: don't forget to add new options to btrfs_show_options() - */ -enum { - BTRFS_MOUNT_NODATASUM = (1UL << 0), - BTRFS_MOUNT_NODATACOW = (1UL << 1), - BTRFS_MOUNT_NOBARRIER = (1UL << 2), - BTRFS_MOUNT_SSD = (1UL << 3), - BTRFS_MOUNT_DEGRADED = (1UL << 4), - BTRFS_MOUNT_COMPRESS = (1UL << 5), - BTRFS_MOUNT_NOTREELOG = (1UL << 6), - BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), - BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), - BTRFS_MOUNT_NOSSD = (1UL << 9), - BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), - BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), - BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), - BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), - BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), - BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), - BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), - BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), - BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), - BTRFS_MOUNT_NODISCARD = (1UL << 31), -}; - -#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) -#define BTRFS_DEFAULT_MAX_INLINE (2048) - -#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) -#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) -#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ - BTRFS_MOUNT_##opt) - -#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - /* * Requests for changes that need to be done during transaction commit. * diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c775ff4f1cb1..010cc16297d8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -13,6 +13,7 @@ #include "qgroup.h" #include "space-info.h" #include "tree-mod-log.h" +#include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e1b7bd927d69..51f0ef386046 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -11,6 +11,7 @@ #include "block-group.h" #include "discard.h" #include "free-space-cache.h" +#include "fs.h" /* * This contains the logic to handle async discard. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 682542ed964f..4799812a7351 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -34,6 +34,49 @@ enum { BTRFS_FS_STATE_COUNT }; +/* + * Flags for mount options. 
+ * + * Note: don't forget to add new options to btrfs_show_options() + */ +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), +}; + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (2048) + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -68,6 +111,26 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ + BTRFS_MOUNT_##opt) + +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ +} while (0) + +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ +} while (0) + static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index f7535b8b62f5..9b09dc50ba14 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -11,6 +11,7 @@ #include "locking.h" #include "delayed-ref.h" #include "ref-verify.h" +#include "fs.h" /* * Used to keep track the roots and number of refs each root has for a given -- cgit From 7966a6b5959bbeb38b35d12b7a533c1dee8c432c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:54 -0400 Subject: btrfs: move fs_info::flags enum to fs.h These definitions are fs wide, take them out of ctree.h and put them in fs.h. 
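As context for how these bits are used, a rough sketch of the set/wait/wake pattern around BTRFS_FS_UNFINISHED_DROPS follows; the waiter fragment is hypothetical, while btrfs_wake_unfinished_drop() is the real helper this patch moves:

	/* Producer: record that half-completed snapshot drops remain. */
	set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);

	/* Hypothetical waiter: sleep until the bit is cleared. */
	wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS,
		    TASK_UNINTERRUPTIBLE);

	/* Cleanup side: clear the bit and wake all waiters. */
	btrfs_wake_unfinished_drop(fs_info);	/* clear_and_wake_up_bit() */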
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 68 ------------------------------------------- fs/btrfs/fs.h | 68 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tests/qgroup-tests.c | 1 + fs/btrfs/tree-mod-log.c | 1 + 4 files changed, 70 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 86af9d36fdf7..149c185f4314 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -280,69 +280,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -enum { - BTRFS_FS_CLOSING_START, - BTRFS_FS_CLOSING_DONE, - BTRFS_FS_LOG_RECOVERING, - BTRFS_FS_OPEN, - BTRFS_FS_QUOTA_ENABLED, - BTRFS_FS_UPDATE_UUID_TREE_GEN, - BTRFS_FS_CREATING_FREE_SPACE_TREE, - BTRFS_FS_BTREE_ERR, - BTRFS_FS_LOG1_ERR, - BTRFS_FS_LOG2_ERR, - BTRFS_FS_QUOTA_OVERRIDE, - /* Used to record internally whether fs has been frozen */ - BTRFS_FS_FROZEN, - /* - * Indicate that balance has been set up from the ioctl and is in the - * main phase. The fs_info::balance_ctl is initialized. - */ - BTRFS_FS_BALANCE_RUNNING, - - /* - * Indicate that relocation of a chunk has started, it's set per chunk - * and is toggled between chunks. - */ - BTRFS_FS_RELOC_RUNNING, - - /* Indicate that the cleaner thread is awake and doing something. */ - BTRFS_FS_CLEANER_RUNNING, - - /* - * The checksumming has an optimized version and is considered fast, - * so we don't need to offload checksums to workqueues. - */ - BTRFS_FS_CSUM_IMPL_FAST, - - /* Indicate that the discard workqueue can service discards. */ - BTRFS_FS_DISCARD_RUNNING, - - /* Indicate that we need to cleanup space cache v1 */ - BTRFS_FS_CLEANUP_SPACE_CACHE_V1, - - /* Indicate that we can't trust the free space tree for caching yet */ - BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, - - /* Indicate whether there are any tree modification log users */ - BTRFS_FS_TREE_MOD_LOG_USERS, - - /* Indicate that we want the transaction kthread to commit right now. */ - BTRFS_FS_COMMIT_TRANS, - - /* Indicate we have half completed snapshot deletions pending. */ - BTRFS_FS_UNFINISHED_DROPS, - - /* Indicate we have to finish a zone to do next allocation. */ - BTRFS_FS_NEED_ZONE_FINISH, - -#if BITS_PER_LONG == 32 - /* Indicate if we have error/warn message printed on 32bit systems */ - BTRFS_FS_32BIT_ERROR, - BTRFS_FS_32BIT_WARN, -#endif -}; - /* * Exclusive operations (device replace, resize, device add/remove, balance) */ @@ -1037,11 +974,6 @@ enum btrfs_lockdep_trans_states { &lock##_key, 0); \ } while (0) -static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) -{ - clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); -} - /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 4799812a7351..7337707d3939 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -34,6 +34,69 @@ enum { BTRFS_FS_STATE_COUNT }; +enum { + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. 
The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, + + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + */ + BTRFS_FS_RELOC_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, + + /* Indicate that we need to cleanup space cache v1 */ + BTRFS_FS_CLEANUP_SPACE_CACHE_V1, + + /* Indicate that we can't trust the free space tree for caching yet */ + BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + + /* Indicate that we want the transaction kthread to commit right now. */ + BTRFS_FS_COMMIT_TRANS, + + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif +}; + /* * Flags for mount options. * @@ -168,6 +231,11 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 63676ea19f29..fd0e0ade8154 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -10,6 +10,7 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../backref.h" +#include "../fs.h" static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 41e5d9f789dc..69083889a9b8 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -3,6 +3,7 @@ #include "messages.h" #include "tree-mod-log.h" #include "disk-io.h" +#include "fs.h" struct tree_mod_root { u64 logical; -- cgit From c52cc7b7acfb3290a0268538b82ac7cf18df7ca4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:55 -0400 Subject: btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag Currently we are only using fs_info->pending_changes to indicate that we need a transaction commit. The original users for this were removed years ago and we don't have more usage in sight, so this is the only remaining reason to have this field. Add a flag so we can remove this code. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 3 +++ fs/btrfs/super.c | 3 ++- fs/btrfs/sysfs.c | 4 ++-- fs/btrfs/transaction.c | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 7337707d3939..bc5de4d598ad 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -90,6 +90,9 @@ enum { /* Indicate we have to finish a zone to do next allocation. 
*/ BTRFS_FS_NEED_ZONE_FINISH, + /* Indicate that we want to commit the transaction. */ + BTRFS_FS_NEED_TRANS_COMMIT, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3aeb2106e119..0839e4a1cfac 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1535,7 +1535,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * Exit unless we have some pending changes * that need to go through commit */ - if (fs_info->pending_changes == 0) + if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, + &fs_info->flags)) return 0; /* * A non-blocking test if the fs is frozen. We must not diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 31fb6cb389b1..47b221372dcf 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -249,7 +249,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return count; @@ -960,7 +960,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return len; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bae77fb05e2b..7b6b68ab089a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2104,6 +2104,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); + /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; -- cgit From 55e5cfd36da5d71e21b72a5922c9b6c349744c2a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:56 -0400 Subject: btrfs: remove fs_info::pending_changes and related code Now that we're not using this code anywhere we can remove it as well as the member from fs_info. We don't have any mount options or on/off features that would utilize the pending infrastructure, the last one was inode_cache. There was a patchset [1] to enable some features from sysfs that would break things if it would be set immediately. In case we'll need that kind of logic again the patch can be reverted, but for the current use it can be replaced by the single state bit to do the commit. [1] https://lore.kernel.org/linux-btrfs/1422609654-19519-1-git-send-email-quwenruo@cn.fujitsu.com/ Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 23 +---------------------- fs/btrfs/disk-io.c | 6 ------ fs/btrfs/transaction.c | 25 ------------------------- fs/btrfs/transaction.h | 1 - 4 files changed, 1 insertion(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 149c185f4314..c5f2f387989e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -373,11 +373,7 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; unsigned long mount_opt; - /* - * Track requests for actions that need to be done during transaction - * commit (like for some mount options). 
- */ - unsigned long pending_changes; + unsigned long compress_type:4; unsigned int compress_level; u32 commit_interval; @@ -1260,23 +1256,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* - * Requests for changes that need to be done during transaction commit. - * - * Internal mount options that are used for special handling of the real - * mount options (eg. cannot be set during remount and have to be set during - * transaction commit) - */ - -#define BTRFS_PENDING_COMMIT (0) - -#define btrfs_test_pending(info, opt) \ - test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_set_pending(info, opt) \ - set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_clear_pending(info, opt) \ - clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) - struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5e20b191b108..2c38b4eebd8f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3761,12 +3761,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_clear_opt(fs_info->mount_opt, NODISCARD); } - /* - * Mount does not set all options immediately, we can do it now and do - * not have to wait for transaction commit - */ - btrfs_apply_pending_changes(fs_info); - #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(fs_info, fs_devices, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7b6b68ab089a..37d0baaa41d8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2359,12 +2359,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - /* - * Since the transaction is done, we can apply the pending changes - * before the next transaction. - */ - btrfs_apply_pending_changes(fs_info); - /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -2587,25 +2581,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 
0 : 1; } -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) -{ - unsigned long prev; - unsigned long bit; - - prev = xchg(&fs_info->pending_changes, 0); - if (!prev) - return; - - bit = 1 << BTRFS_PENDING_COMMIT; - if (prev & bit) - btrfs_debug(fs_info, "pending commit done"); - prev &= ~bit; - - if (prev) - btrfs_warn(fs_info, - "unknown pending changes left 0x%lx, ignoring", prev); -} - int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b39ebf98af60..97f6c39f59c8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -233,7 +233,6 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); -- cgit From d83eb482b727e7d9fdf6f729db6da3642c4d8a58 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:57 -0400 Subject: btrfs: move the compat/incompat flag masks to fs.h This is fs wide information, move it out of ctree.h into fs.h. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 57 -------------------------------------------------------- fs/btrfs/fs.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5f2f387989e..af6b72c7e33f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -78,63 +78,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); */ #define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) -/* - * Compat flags that we support. 
If any incompat flags are set other than the - * ones specified below then we will fail to mount - */ -#define BTRFS_FEATURE_COMPAT_SUPP 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL - -#define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ - BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ - BTRFS_FEATURE_COMPAT_RO_VERITY | \ - BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) - -#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL - -#ifdef CONFIG_BTRFS_DEBUG -/* - * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG - */ -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) -#else -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) -#endif - -#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ - (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) -#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index bc5de4d598ad..af356feec0c7 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -140,6 +140,63 @@ enum { BTRFS_MOUNT_NODISCARD = (1UL << 31), }; +/* + * Compat flags that we support. 
If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) + +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) +#endif + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL + #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) -- cgit From 818fe33aed42ddd5052171328a3f708e98357e10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:58 -0400 Subject: btrfs: rename struct-funcs.c to accessors.c Rename struct-funcs.c to accessors.c so we can move the item accessors out of ctree.h. accessors.c is a better description of the code that is contained in these files. 
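For orientation while reading the move, each DEFINE_BTRFS_SETGET_BITS(n) instantiation in this file expands to four generic helpers; their signatures, reconstructed from the macro template below for n = 32, are:

u32 btrfs_get_32(const struct extent_buffer *eb, const void *ptr,
		 unsigned long off);
void btrfs_set_32(const struct extent_buffer *eb, void *ptr,
		  unsigned long off, u32 val);
u32 btrfs_get_token_32(struct btrfs_map_token *token, const void *ptr,
		       unsigned long off);
void btrfs_set_token_32(struct btrfs_map_token *token, const void *ptr,
			unsigned long off, u32 val);

The token variants cache the address of the most recently touched extent buffer page so that clustered accesses skip the page lookup.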
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/accessors.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/struct-funcs.c | 167 ------------------------------------------------ 3 files changed, 167 insertions(+), 168 deletions(-) create mode 100644 fs/btrfs/accessors.c delete mode 100644 fs/btrfs/struct-funcs.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index eebb45c06485..76f90dcfb14d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -24,7 +24,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c new file mode 100644 index 000000000000..118bfd1c0e3e --- /dev/null +++ b/fs/btrfs/accessors.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include "messages.h" +#include "ctree.h" + +static bool check_setget_bounds(const struct extent_buffer *eb, + const void *ptr, unsigned off, int size) +{ + const unsigned long member_offset = (unsigned long)ptr + off; + + if (unlikely(member_offset + size > eb->len)) { + btrfs_warn(eb->fs_info, + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), + (unsigned long)ptr, eb->start, member_offset, size); + return false; + } + + return true; +} + +/* + * Macro templates that define helpers to read/write extent buffer data of a + * given size, that are also used via ctree.h for access to item members by + * specialized helpers. + * + * Generic helpers: + * - btrfs_set_8 (for 8/16/32/64) + * - btrfs_get_8 (for 8/16/32/64) + * + * Generic helpers with a token (cached address of the most recently accessed + * page): + * - btrfs_set_token_8 (for 8/16/32/64) + * - btrfs_get_token_8 (for 8/16/32/64) + * + * The set/get functions handle data spanning two pages transparently, in case + * metadata block size is larger than page. Every pointer to metadata items is + * an offset into the extent buffer page array, cast to a specific type. This + * gives us all the type checking. + * + * The extent buffer pages stored in the array pages do not form a contiguous + * physical range, but the API functions assume the linear offset to the range + * from 0 to metadata node size.
+ */ + +#define DEFINE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ + const int size = sizeof(u##bits); \ + u8 lebytes[sizeof(u##bits)]; \ + const int part = PAGE_SIZE - oip; \ + \ + ASSERT(token); \ + ASSERT(token->kaddr); \ + ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ + if (token->offset <= member_offset && \ + member_offset + size <= token->offset + PAGE_SIZE) { \ + return get_unaligned_le##bits(token->kaddr + oip); \ + } \ + token->kaddr = page_address(token->eb->pages[idx]); \ + token->offset = idx << PAGE_SHIFT; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ + return get_unaligned_le##bits(token->kaddr + oip); \ + \ + memcpy(lebytes, token->kaddr + oip, part); \ + token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(lebytes + part, token->kaddr, size - part); \ + return get_unaligned_le##bits(lebytes); \ +} \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ + char *kaddr = page_address(eb->pages[idx]); \ + const int size = sizeof(u##bits); \ + const int part = PAGE_SIZE - oip; \ + u8 lebytes[sizeof(u##bits)]; \ + \ + ASSERT(check_setget_bounds(eb, ptr, off, size)); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ + return get_unaligned_le##bits(kaddr + oip); \ + \ + memcpy(lebytes, kaddr + oip, part); \ + kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(lebytes + part, kaddr, size - part); \ + return get_unaligned_le##bits(lebytes); \ +} \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ + const int size = sizeof(u##bits); \ + u8 lebytes[sizeof(u##bits)]; \ + const int part = PAGE_SIZE - oip; \ + \ + ASSERT(token); \ + ASSERT(token->kaddr); \ + ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ + if (token->offset <= member_offset && \ + member_offset + size <= token->offset + PAGE_SIZE) { \ + put_unaligned_le##bits(val, token->kaddr + oip); \ + return; \ + } \ + token->kaddr = page_address(token->eb->pages[idx]); \ + token->offset = idx << PAGE_SHIFT; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ + put_unaligned_le##bits(val, token->kaddr + oip); \ + return; \ + } \ + put_unaligned_le##bits(val, lebytes); \ + memcpy(token->kaddr + oip, lebytes, part); \ + token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(token->kaddr, lebytes + part, size - part); \ +} \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = 
get_eb_page_index(member_offset); \ + char *kaddr = page_address(eb->pages[idx]); \ + const int size = sizeof(u##bits); \ + const int part = PAGE_SIZE - oip; \ + u8 lebytes[sizeof(u##bits)]; \ + \ + ASSERT(check_setget_bounds(eb, ptr, off, size)); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ + put_unaligned_le##bits(val, kaddr + oip); \ + return; \ + } \ + \ + put_unaligned_le##bits(val, lebytes); \ + memcpy(kaddr + oip, lebytes, part); \ + kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(kaddr, lebytes + part, size - part); \ +} + +DEFINE_BTRFS_SETGET_BITS(8) +DEFINE_BTRFS_SETGET_BITS(16) +DEFINE_BTRFS_SETGET_BITS(32) +DEFINE_BTRFS_SETGET_BITS(64) + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c deleted file mode 100644 index 6ba16c018d7f..000000000000 --- a/fs/btrfs/struct-funcs.c +++ /dev/null @@ -1,167 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Oracle. All rights reserved. - */ - -#include - -#include "messages.h" -#include "ctree.h" - -static bool check_setget_bounds(const struct extent_buffer *eb, - const void *ptr, unsigned off, int size) -{ - const unsigned long member_offset = (unsigned long)ptr + off; - - if (unlikely(member_offset + size > eb->len)) { - btrfs_warn(eb->fs_info, - "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", - (member_offset > eb->len ? "start" : "end"), - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - - return true; -} - -/* - * Macro templates that define helpers to read/write extent buffer data of a - * given size, that are also used via ctree.h for access to item members by - * specialized helpers. - * - * Generic helpers: - * - btrfs_set_8 (for 8/16/32/64) - * - btrfs_get_8 (for 8/16/32/64) - * - * Generic helpers with a token (cached address of the most recently accessed - * page): - * - btrfs_set_token_8 (for 8/16/32/64) - * - btrfs_get_token_8 (for 8/16/32/64) - * - * The set/get functions handle data spanning two pages transparently, in case - * metadata block size is larger than page. Every pointer to metadata items is - * an offset into the extent buffer page array, cast to a specific type. This - * gives us all the type checking. - * - * The extent buffer pages stored in the array pages do not form a contiguous - * phyusical range, but the API functions assume the linear offset to the range - * from 0 to metadata node size. 
- */ - -#define DEFINE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ - const int size = sizeof(u##bits); \ - u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ - \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - return get_unaligned_le##bits(token->kaddr + oip); \ - } \ - token->kaddr = page_address(token->eb->pages[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ - return get_unaligned_le##bits(token->kaddr + oip); \ - \ - memcpy(lebytes, token->kaddr + oip, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ - memcpy(lebytes + part, token->kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ -u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ - const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ - u8 lebytes[sizeof(u##bits)]; \ - \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ - return get_unaligned_le##bits(kaddr + oip); \ - \ - memcpy(lebytes, kaddr + oip, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ - memcpy(lebytes + part, kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ - const int size = sizeof(u##bits); \ - u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ - \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ - return; \ - } \ - token->kaddr = page_address(token->eb->pages[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ - return; \ - } \ - put_unaligned_le##bits(val, lebytes); \ - memcpy(token->kaddr + oip, lebytes, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ - memcpy(token->kaddr, lebytes + part, size - part); \ -} \ -void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ - unsigned long off, u##bits val) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = 
get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ - const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ - u8 lebytes[sizeof(u##bits)]; \ - \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, kaddr + oip); \ - return; \ - } \ - \ - put_unaligned_le##bits(val, lebytes); \ - memcpy(kaddr + oip, lebytes, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ - memcpy(kaddr, lebytes + part, size - part); \ -} - -DEFINE_BTRFS_SETGET_BITS(8) -DEFINE_BTRFS_SETGET_BITS(16) -DEFINE_BTRFS_SETGET_BITS(32) -DEFINE_BTRFS_SETGET_BITS(64) - -void btrfs_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr = btrfs_node_key_ptr_offset(nr); - read_eb_member(eb, (struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} -- cgit From ad1ac5012c2beccc9de4d3e2cc1382fd250e5540 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:59 -0400 Subject: btrfs: move btrfs_map_token to accessors This is specific to the item-accessor code, move it out of ctree.h into accessor.h/.c and then update the users to include the new header file. This un-inlines btrfs_init_map_token, however this is only called once per function so it's not critical to be inlined. This also saves 904 bytes of code on a release build. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 8 ++++++++ fs/btrfs/accessors.h | 14 ++++++++++++++ fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 16 ++-------------- fs/btrfs/inode.c | 1 + fs/btrfs/tree-log.c | 1 + 6 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 fs/btrfs/accessors.h (limited to 'fs') diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 118bfd1c0e3e..7a7b7d263102 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -6,6 +6,7 @@ #include #include "messages.h" #include "ctree.h" +#include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) @@ -23,6 +24,13 @@ static bool check_setget_bounds(const struct extent_buffer *eb, return true; } +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +{ + token->eb = eb; + token->kaddr = page_address(eb->pages[0]); + token->offset = 0; +} + /* * Macro templates that define helpers to read/write extent buffer data of a * given size, that are also used via ctree.h for access to item members by diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h new file mode 100644 index 000000000000..a47b0f9d30ca --- /dev/null +++ b/fs/btrfs/accessors.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACCESSORS_H +#define BTRFS_ACCESSORS_H + +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); + +#endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7ecb658500ce..1283ca46c5bc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,6 +19,7 @@ #include "tree-mod-log.h" #include "tree-checker.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af6b72c7e33f..296ca4de8d52 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -52,6 +52,8 @@ 
struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; +struct btrfs_map_token; + #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1199,23 +1201,9 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline void btrfs_init_map_token(struct btrfs_map_token *token, - struct extent_buffer *eb) -{ - token->eb = eb; - token->kaddr = page_address(eb->pages[0]); - token->offset = 0; -} - /* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 60b3162c035d..cb2a6d7f6252 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -56,6 +56,7 @@ #include "subpage.h" #include "inode-item.h" #include "fs.h" +#include "accessors.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e294c38f9b19..dc49f0aae1fb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -22,6 +22,7 @@ #include "zoned.h" #include "inode-item.h" #include "fs.h" +#include "accessors.h" #define MAX_CONFLICT_INODES 10 -- cgit From 07e81dc94474eb62705c6f96d9ab1a5a797b8703 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:51:00 -0400 Subject: btrfs: move accessor helpers into accessors.h This is a large patch, but because the helpers are all macros it is impossible to split it up meaningfully. Simply copy all of the item accessors from ctree.h, paste them into accessors.h, and then update the files that use them to include the new header so everything compiles.
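For context, the pattern being moved is a family of macros that stamp out little-endian get/set helpers, one pair per on-disk struct member. Below is a minimal userspace sketch of the same BTRFS_SETGET_STACK_FUNCS-style pattern; all demo_* names and the struct layout are illustrative stand-ins, not kernel code, and the real kernel helpers additionally handle members that live in (or span) extent buffer pages:

#include <stdint.h>
#include <stdio.h>

/* Byte-wise stand-ins for the kernel's get/put_unaligned_le64(). */
static inline uint64_t demo_get_unaligned_le64(const void *p)
{
	const uint8_t *b = p;
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v |= (uint64_t)b[i] << (8 * i);
	return v;
}

static inline void demo_put_unaligned_le64(uint64_t val, void *p)
{
	uint8_t *b = p;
	int i;

	for (i = 0; i < 8; i++)
		b[i] = (uint8_t)(val >> (8 * i));
}

/*
 * An on-disk style item with little-endian members, loosely modeled on
 * btrfs_dev_item; byte arrays make the LE storage explicit and unaligned.
 */
struct demo_dev_item {
	uint8_t type[8];
	uint8_t total_bytes[8];
};

/*
 * Same shape as BTRFS_SETGET_STACK_FUNCS: one macro invocation generates
 * a typed getter/setter pair for one struct member.
 */
#define DEMO_SETGET_STACK_FUNCS(name, type, member)			\
static inline uint64_t demo_##name(const type *s)			\
{									\
	return demo_get_unaligned_le64(&s->member);			\
}									\
static inline void demo_set_##name(type *s, uint64_t val)		\
{									\
	demo_put_unaligned_le64(val, &s->member);			\
}

DEMO_SETGET_STACK_FUNCS(device_total_bytes, struct demo_dev_item, total_bytes)

int main(void)
{
	struct demo_dev_item item = {{0}};

	demo_set_device_total_bytes(&item, 0x123456789abcULL);
	printf("total_bytes = 0x%llx\n",
	       (unsigned long long)demo_device_total_bytes(&item));
	return 0;
}

Generating the helpers this way keeps every accessor type-checked at its call site while the endianness conversion lives in exactly one place, which is why the whole block has to move between headers as a unit.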
Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments, style fixups ] Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 1068 +++++++++++++++++++++++++++++++ fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/block-rsv.c | 1 + fs/btrfs/check-integrity.c | 1 + fs/btrfs/ctree.h | 1090 -------------------------------- fs/btrfs/delayed-inode.c | 1 + fs/btrfs/dev-replace.c | 1 + fs/btrfs/dir-item.c | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/export.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/locking.c | 1 + fs/btrfs/print-tree.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/ref-verify.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/sysfs.c | 1 + fs/btrfs/tests/extent-buffer-tests.c | 1 + fs/btrfs/tests/free-space-tree-tests.c | 1 + fs/btrfs/tests/inode-tests.c | 1 + fs/btrfs/tests/qgroup-tests.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-defrag.c | 1 + fs/btrfs/tree-mod-log.c | 1 + fs/btrfs/uuid-tree.c | 2 +- fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + fs/btrfs/xattr.c | 1 + fs/btrfs/zoned.c | 1 + 46 files changed, 1112 insertions(+), 1091 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index a47b0f9d30ca..e8bbd4d80161 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -9,6 +9,1074 @@ struct btrfs_map_token { unsigned long offset; }; +#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) + void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); +/* + * Some macros to generate set/get functions for the struct fields. 
This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple one + * for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +#define read_eb_member(eb, ptr, type, member, result) (\ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) (\ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off); \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val); \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ +} \ +static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ +} + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ +{ \ + const type *p = page_address(eb->pages[0]) + \ + offset_in_page(eb->start); \ + return get_unaligned_le##bits(&p->member); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + put_unaligned_le##bits(val, &p->member); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const type *s) \ +{ \ + return get_unaligned_le##bits(&s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + put_unaligned_le##bits(val, &s->member); \ +} + +static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item 
*)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); 
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) +{ + unsigned long offset = (unsigned long)c; + + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + return 0; +} + +/* struct 
btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); + +static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ +} + +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); + +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); +} + +static inline void 
btrfs_item_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(nr); + + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(nr); + + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* struct btrfs_root_ref */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(const struct extent_buffer *eb, + const struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +#ifdef __LITTLE_ENDIAN + +/* + * Optimized helpers for little-endian architectures where CPU and on-disk + * structures have the same endianness and we can skip conversions. 
+ */ + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, + const struct btrfs_disk_key *disk_key) +{ + memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, + const struct btrfs_key *cpu_key) +{ + memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_node_key(eb, disk_key, nr); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_item_key(eb, disk_key, nr); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *cpu_key) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_dir_item_key(eb, item, disk_key); +} + +#else + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + const struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + const struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +#endif + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); + +static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags | flag); +} + +static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, 
flags & ~flag); +} + +static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) +{ + u64 flags = btrfs_header_flags(eb); + + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline int btrfs_is_leaf(const struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); + +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + 
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static inline void btrfs_cpu_balance_args_to_disk( + struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = 
cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + +/* struct btrfs_super_block */ +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); + +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. 
+ */ +static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, + type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, + struct btrfs_file_extent_item, ram_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* + * Returns the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. 
+ */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +/* Cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f76db317c437..41caa6fc6301 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,6 +16,7 @@ #include "misc.h" #include "tree-mod-log.h" #include "fs.h" +#include "accessors.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bf8d1e110136..d1f8d792b184 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -18,6 +18,7 @@ #include "raid56.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index a48fa7ac74cc..069ea9d6a8d4 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -8,6 +8,7 @@ #include "block-group.h" #include "disk-io.h" #include "fs.h" +#include "accessors.h" /* * HOW DO BLOCK RESERVES WORK diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e8e1a92b30ac..7ff0703ef3e4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -93,6 +93,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "compression.h" +#include "accessors.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 296ca4de8d52..20034c23abb2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1176,8 +1176,6 @@ static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) return info->nodesize - sizeof(struct btrfs_header); } -#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) - static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1204,1094 +1202,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -/* some macros to generate set/get functions for the struct fields. 
This - * assumes there is a lefoo_to_cpu for every type, so lets make a simple - * one for u8: - */ -#define le8_to_cpu(v) (v) -#define cpu_to_le8(v) (v) -#define __le8 u8 - -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - -#define read_eb_member(eb, ptr, type, member, result) (\ - read_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define write_eb_member(eb, ptr, type, member, result) (\ - write_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off); \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val); \ -u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off); \ -void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ - unsigned long off, u##bits val); - -DECLARE_BTRFS_SETGET_BITS(8) -DECLARE_BTRFS_SETGET_BITS(16) -DECLARE_BTRFS_SETGET_BITS(32) -DECLARE_BTRFS_SETGET_BITS(64) - -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - return btrfs_get_##bits(eb, s, offsetof(type, member)); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ - u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - btrfs_set_##bits(eb, s, offsetof(type, member), val); \ -} \ -static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - return btrfs_get_token_##bits(token, s, offsetof(type, member));\ -} \ -static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ - type *s, u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ -} - -#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ -{ \ - const type *p = page_address(eb->pages[0]) + \ - offset_in_page(eb->start); \ - return get_unaligned_le##bits(&p->member); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, \ - u##bits val) \ -{ \ - type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ - put_unaligned_le##bits(val, &p->member); \ -} - -#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const type *s) \ -{ \ - return get_unaligned_le##bits(&s->member); \ -} \ -static inline void btrfs_set_##name(type *s, u##bits val) \ -{ \ - put_unaligned_le##bits(val, &s->member); \ -} - -static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); - return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, - total_bytes)); -} -static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s, - u64 val) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item 
*)0))->total_bytes); - WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); - btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); -} - - -BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); -BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); -BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); -BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, - start_offset, 64); -BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); -BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); -BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); -BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); -BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, - dev_group, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, - seek_speed, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, - bandwidth, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, - generation, 64); - -static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); -} - -static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); -} - -BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); -BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); -BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); -BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); -BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); -BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); -BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); - -static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) -{ - return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); -} - -BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, - stripe_len, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, - io_align, 32); 
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, - num_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, - sub_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); - -static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, - int nr) -{ - unsigned long offset = (unsigned long)c; - offset += offsetof(struct btrfs_chunk, stripe); - offset += nr * sizeof(struct btrfs_stripe); - return (struct btrfs_stripe *)offset; -} - -static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); -} - -/* struct btrfs_block_group_item */ -BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); - -BTRFS_SETGET_FUNCS(block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(block_group_flags, - struct btrfs_block_group_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, - struct btrfs_block_group_item, flags, 64); - -/* struct btrfs_free_space_info */ -BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, - extent_count, 32); -BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); - -/* struct btrfs_inode_ref */ -BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); -BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); - -/* struct btrfs_inode_extref */ -BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, - parent_objectid, 64); -BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, - name_len, 16); -BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); - -/* struct btrfs_inode_item */ -BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); -BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); -BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); -BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); -BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); -BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); 
-BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, - sequence, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, - transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, - nbytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, - block_group, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); -BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); - -/* struct btrfs_dev_extent */ -BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, - chunk_tree, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, - chunk_objectid, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, - chunk_offset, 64); -BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); -BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); - -BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); - -static inline void btrfs_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, - root, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, - objectid, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, - offset, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, - type, 8); -BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, - offset, 64); - -static inline u32 btrfs_extent_inline_ref_size(int type) -{ - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) - return sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_SHARED_DATA_REF_KEY) - return sizeof(struct btrfs_shared_data_ref) + - sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_EXTENT_DATA_REF_KEY) - return sizeof(struct btrfs_extent_data_ref) + - offsetof(struct btrfs_extent_inline_ref, offset); - return 0; -} - -/* 
struct btrfs_node */ -BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); -BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, - blockptr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, - generation, 64); - -static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline unsigned long btrfs_node_key_ptr_offset(int nr) -{ - return offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; -} - -void btrfs_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr); - -static inline void btrfs_set_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); - write_eb_member(eb, (struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} - -/* struct btrfs_item */ -BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); - -static inline unsigned long btrfs_item_nr_offset(int nr) -{ - return offsetof(struct btrfs_leaf, items) + - sizeof(struct btrfs_item) * nr; -} - -static inline struct btrfs_item *btrfs_item_nr(int nr) -{ - return (struct btrfs_item *)btrfs_item_nr_offset(nr); -} - -#define BTRFS_ITEM_SETGET_FUNCS(member) \ -static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ - int slot) \ -{ \ - return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ -} \ -static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ - int slot, u32 val) \ -{ \ - btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ -} \ -static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ - int slot) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - return btrfs_token_raw_item_##member(token, item); \ -} \ -static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ - int slot, u32 val) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - btrfs_set_token_raw_item_##member(token, item, val); \ -} - -BTRFS_ITEM_SETGET_FUNCS(offset) -BTRFS_ITEM_SETGET_FUNCS(size); - -static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); -} - -static inline void 
btrfs_item_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - read_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -static inline void btrfs_set_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - write_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); - -/* - * struct btrfs_root_ref - */ -BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); -BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); -BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); - -/* struct btrfs_dir_item */ -BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); -BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, - data_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, - name_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, - transid, 64); - -static inline void btrfs_dir_item_key(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, - struct btrfs_dir_item *item, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, - num_entries, 64); -BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, - num_bitmaps, 64); -BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, - generation, 64); - -static inline void btrfs_free_space_key(const struct extent_buffer *eb, - const struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -static inline void btrfs_set_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -/* struct btrfs_disk_key */ -BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); - -#ifdef __LITTLE_ENDIAN - -/* - * Optimized helpers for little-endian architectures where CPU and on-disk - * structures have the same endianness and we can skip conversions. 
- */ - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, - const struct btrfs_disk_key *disk_key) -{ - memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, - const struct btrfs_key *cpu_key) -{ - memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_node_key(eb, disk_key, nr); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_item_key(eb, disk_key, nr); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *cpu_key) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_dir_item_key(eb, item, disk_key); -} - -#else - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, - const struct btrfs_disk_key *disk) -{ - cpu->offset = le64_to_cpu(disk->offset); - cpu->type = disk->type; - cpu->objectid = le64_to_cpu(disk->objectid); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, - const struct btrfs_key *cpu) -{ - disk->offset = cpu_to_le64(cpu->offset); - disk->type = cpu->type; - disk->objectid = cpu_to_le64(cpu->objectid); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_node_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_item_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *key) -{ - struct btrfs_disk_key disk_key; - btrfs_dir_item_key(eb, item, &disk_key); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -#endif - -/* struct btrfs_header */ -BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); -BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); -BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); -BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); -BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, - nritems, 32); -BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); - -static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) -{ - return (btrfs_header_flags(eb) & flag) == flag; -} - -static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags | flag); -} - -static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags & 
~flag); -} - -static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) -{ - u64 flags = btrfs_header_flags(eb); - return flags >> BTRFS_BACKREF_REV_SHIFT; -} - -static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, - int rev) -{ - u64 flags = btrfs_header_flags(eb); - flags &= ~BTRFS_BACKREF_REV_MASK; - flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; - btrfs_set_header_flags(eb, flags); -} - -static inline int btrfs_is_leaf(const struct extent_buffer *eb) -{ - return btrfs_header_level(eb) == 0; -} - -/* struct btrfs_root_item */ -BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); - -BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); -BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); -BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); -BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); -BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, - last_snapshot, 64); -BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, - generation_v2, 64); -BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, - ctransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, - otransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, - stransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, - rtransid, 64); - -static inline bool btrfs_root_readonly(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -static inline bool btrfs_root_dead(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; -} - -static inline u64 btrfs_root_id(const struct btrfs_root *root) -{ - return root->root_key.objectid; -} - -/* struct btrfs_root_backup */ -BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, - tree_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, - tree_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, - tree_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, - chunk_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, - chunk_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, - extent_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, - extent_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, - extent_root_level, 8); - 
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, - fs_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, - fs_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, - fs_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, - dev_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, - dev_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, - dev_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, - csum_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, - csum_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, - csum_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, - num_devices, 64); - -/* struct btrfs_balance_item */ -BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); - -static inline void btrfs_balance_data(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_balance_meta(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_balance_sys(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void -btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - const struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); - cpu->limit = le64_to_cpu(disk->limit); - cpu->stripes_min = le32_to_cpu(disk->stripes_min); - cpu->stripes_max = le32_to_cpu(disk->stripes_max); -} - -static inline void -btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - const struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = 
cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); - disk->limit = cpu_to_le64(cpu->limit); - disk->stripes_min = cpu_to_le32(cpu->stripes_min); - disk->stripes_max = cpu_to_le32(cpu->stripes_max); -} - -/* struct btrfs_super_block */ -BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); -BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, - struct btrfs_super_block, sys_chunk_array_size, 32); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, - struct btrfs_super_block, chunk_root_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, - root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, - chunk_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, - log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, - log_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, - sectorsize, 32); -BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, - nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, - stripesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, - root_dir_objectid, 64); -BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, - num_devices, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, - compat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, - compat_ro_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, - incompat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, - csum_type, 16); -BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, - cache_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); -BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, - uuid_tree_generation, 64); - -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - - -/* - * The leaf data grows from end-to-front in the node. 
- * this returns the address of the start of the last item, - * which is the stop of the leaf data stack - */ -static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset(leaf, nr - 1); -} - -/* struct btrfs_file_extent_item */ -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, - type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, - struct btrfs_file_extent_item, disk_bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, - struct btrfs_file_extent_item, offset, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, - struct btrfs_file_extent_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, - struct btrfs_file_extent_item, num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, - struct btrfs_file_extent_item, ram_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, - struct btrfs_file_extent_item, disk_num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, - struct btrfs_file_extent_item, compression, 8); - -static inline unsigned long -btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) -{ - return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; -} - -BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); -BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, - disk_bytenr, 64); -BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, - disk_num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, - offset, 64); -BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, - num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, - ram_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, - compression, 8); -BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, - encryption, 8); -BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, - other_encoding, 16); - -/* - * this returns the number of bytes used by the item on disk, minus the - * size of any extent headers. 
If a file is compressed on disk, this is - * the compressed size - */ -static inline u32 btrfs_file_extent_inline_item_len( - const struct extent_buffer *eb, - int nr) -{ - return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -/* btrfs_qgroup_status_item */ -BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, - version, 64); -BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, - rescan, 64); - -/* btrfs_qgroup_info_item */ -BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, - rfer_cmpr, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, - excl_cmpr, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, - struct btrfs_qgroup_info_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, - rfer, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, - struct btrfs_qgroup_info_item, rfer_cmpr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, - excl, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, - struct btrfs_qgroup_info_item, excl_cmpr, 64); - -/* btrfs_qgroup_limit_item */ -BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, - max_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, - max_excl, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, - rsv_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, - rsv_excl, 64); - -/* btrfs_dev_replace_item */ -BTRFS_SETGET_FUNCS(dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, - 64); -BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, - replace_state, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, - time_started, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, - time_stopped, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, - num_write_errors, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, num_uncorrectable_read_errors, - 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, - cursor_left, 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, - cursor_right, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, - cont_reading_from_srcdev_mode, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, - struct btrfs_dev_replace_item, replace_state, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, - struct btrfs_dev_replace_item, 
time_started, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, - struct btrfs_dev_replace_item, time_stopped, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, - struct btrfs_dev_replace_item, num_write_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, - num_uncorrectable_read_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, - struct btrfs_dev_replace_item, cursor_left, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, - struct btrfs_dev_replace_item, cursor_right, 64); - -/* btrfs_verity_descriptor_item */ -BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, - encryption, 8); -BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, - size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, - struct btrfs_verity_descriptor_item, encryption, 8); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, - struct btrfs_verity_descriptor_item, size, 64); - -/* helper function to cast into the data area of the leaf. */ -#define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - -#define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { return crc32c(crc, address, length); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 2f68570fbb53..012c96de4701 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -17,6 +17,7 @@ #include "locking.h" #include "inode-item.h" #include "space-info.h" +#include "accessors.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 348aef453e69..94f8975034ce 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -24,6 +24,7 @@ #include "zoned.h" #include "block-group.h" #include "fs.h" +#include "accessors.h" /* * Device replace overview diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index be5c1c2a8da5..9fa37f245c43 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -7,6 +7,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "accessors.h" /* * insert a name into a directory, doing overflow properly if there is a hash diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2c38b4eebd8f..75475a213972 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,7 @@ #include "zoned.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index a51a5dfa737c..b6bc9684648f 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -7,6 +7,7 @@ #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" +#include "accessors.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bc010dbcb6b1..8d28ba360e9a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -37,6 +37,7 @@ #include "zoned.h" #include "dev-replace.h" #include "fs.h" +#include "accessors.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aa2d60f683bb..4b47bb8c590f 100644 --- a/fs/btrfs/extent_io.c 
+++ b/fs/btrfs/extent_io.c @@ -31,6 +31,7 @@ #include "block-group.h" #include "compression.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 675987e2d652..bce6c8d31bc0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -18,6 +18,7 @@ #include "print-tree.h" #include "compression.h" #include "fs.h" +#include "accessors.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a352c7cacc99..62360a554420 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -31,6 +31,7 @@ #include "reflink.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 703902156f97..26ff1a5100b9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -26,6 +26,7 @@ #include "discard.h" #include "subpage.h" #include "inode-item.h" +#include "accessors.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 026214d74a02..fc79d21e7b4f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -13,6 +13,7 @@ #include "transaction.h" #include "block-group.h" #include "fs.h" +#include "accessors.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index a59504b59435..5553e1f8afe8 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -3,6 +3,7 @@ #include "messages.h" #include "ctree.h" #include "fs.h" +#include "accessors.h" void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 25e9f1d65067..b8dbabfa8b31 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -11,6 +11,7 @@ #include "transaction.h" #include "print-tree.h" #include "space-info.h" +#include "accessors.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const char *name, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 564a4ae9c285..e33be4032fff 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@ #include "block-group.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0eab3cb274a1..870528d87526 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -12,6 +12,7 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" +#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. 
For a given diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 708facaede2c..aab7d30eed55 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,7 @@ #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "accessors.h" struct root_name_map { u64 id; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 6d88bf0a9b17..9ad15d69718c 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -13,6 +13,7 @@ #include "compression.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e87a2f066f4d..39f67ca4b4ca 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -25,6 +25,7 @@ #include "sysfs.h" #include "tree-mod-log.h" #include "fs.h" +#include "accessors.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 9b09dc50ba14..95d28497de7c 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -12,6 +12,7 @@ #include "delayed-ref.h" #include "ref-verify.h" #include "fs.h" +#include "accessors.h" /* * Used to keep track the roots and number of refs each root has for a given diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index daf65bfad30e..f0243eb33df7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -11,6 +11,7 @@ #include "reflink.h" #include "transaction.h" #include "subpage.h" +#include "accessors.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 977afbb4cd45..3d0a63465a74 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -29,6 +29,7 @@ #include "inode-item.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" /* * Relocation overview diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 112b4bf3c3b8..fc00dfb281d5 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -13,6 +13,7 @@ #include "print-tree.h" #include "qgroup.h" #include "space-info.h" +#include "accessors.h" /* * Read a root item from the tree. In case we detect a root item smaller then diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6871c1c54f8c..5c6c4aa2fb09 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -22,6 +22,7 @@ #include "block-group.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" /* * This is only the first step towards a full-features scrub. 
It reads all diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 80104b5af32f..31fb3292e816 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -27,6 +27,7 @@ #include "compression.h" #include "xattr.h" #include "print-tree.h" +#include "accessors.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index af1538c2685d..94d73485454a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -11,6 +11,7 @@ #include "block-group.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0839e4a1cfac..c02fcdb9c7c4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -51,6 +51,7 @@ #include "qgroup.h" #include "raid56.h" #include "fs.h" +#include "accessors.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 47b221372dcf..a14812f1254a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -23,6 +23,7 @@ #include "qgroup.h" #include "misc.h" #include "fs.h" +#include "accessors.h" /* * Structure name Path diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index b7d181a08eab..5ef0b90e25c3 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -8,6 +8,7 @@ #include "../ctree.h" #include "../extent_io.h" #include "../disk-io.h" +#include "../accessors.h" static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 13734ed43bfc..53a17b1d1744 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -10,6 +10,7 @@ #include "../free-space-tree.h" #include "../transaction.h" #include "../block-group.h" +#include "../accessors.h" struct free_space_extent { u64 start; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 625f7d398368..0a34a54ea9fd 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -11,6 +11,7 @@ #include "../extent_io.h" #include "../volumes.h" #include "../compression.h" +#include "../accessors.h" static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index fd0e0ade8154..65b65d55d1f6 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -11,6 +11,7 @@ #include "../qgroup.h" #include "../backref.h" #include "../fs.h" +#include "../accessors.h" static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 37d0baaa41d8..25e6b504edb4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -24,6 +24,7 @@ #include "space-info.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index fa9536110d69..11cafc520b47 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -27,6 +27,7 @@ #include "misc.h" #include "btrfs_inode.h" #include "fs.h" +#include "accessors.h" /* * Error message should follow the following format: diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b6cf39f4e7e4..5df604846de6 100644 --- a/fs/btrfs/tree-defrag.c +++ 
b/fs/btrfs/tree-defrag.c @@ -9,6 +9,7 @@ #include "print-tree.h" #include "transaction.h" #include "locking.h" +#include "accessors.h" /* * Defrag all the leaves in a given btree. diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 69083889a9b8..3bea19698a10 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -4,6 +4,7 @@ #include "tree-mod-log.h" #include "disk-io.h" #include "fs.h" +#include "accessors.h" struct tree_mod_root { u64 logical; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 190f752a2e10..70304b89f31f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -11,7 +11,7 @@ #include "disk-io.h" #include "print-tree.h" #include "fs.h" - +#include "accessors.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) { diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 35445855df4d..d02fa354fc2b 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -17,6 +17,7 @@ #include "disk-io.h" #include "locking.h" #include "fs.h" +#include "accessors.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d65d5d7835fe..767c56be6b96 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,7 @@ #include "discard.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" static struct bio_set btrfs_bioset; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b26c869f0226..fcf2d5f7e198 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -21,6 +21,7 @@ #include "disk-io.h" #include "props.h" #include "locking.h" +#include "accessors.h" int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 2a7d856c232c..fe66cb929af9 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -16,6 +16,7 @@ #include "dev-replace.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 -- cgit From e9c83077d2be815cb04f0b054e9494512b1657ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:51:01 -0400 Subject: btrfs: remove temporary btrfs_map_token declaration in ctree.h This was added while I was moving this code to its new home; it can be removed now. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 20034c23abb2..3dcfb62af68a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -52,8 +52,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -struct btrfs_map_token; - #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 -- cgit From d68194b238228ce470de1fc2453851fc06294739 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:45:37 +0200 Subject: btrfs: sink gfp_t parameter to btrfs_backref_iter_alloc There's only one caller that passes GFP_NOFS, so we can drop the parameter and use the flags directly.
Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/backref.c | 5 ++--- fs/btrfs/backref.h | 3 +-- fs/btrfs/relocation.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 41caa6fc6301..f556566b9c79 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2650,12 +2650,11 @@ void free_ipath(struct inode_fs_paths *ipath) kfree(ipath); } -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag) +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) { struct btrfs_backref_iter *ret; - ret = kzalloc(sizeof(*ret), gfp_flag); + ret = kzalloc(sizeof(*ret), GFP_NOFS); if (!ret) return NULL; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index fa3c9cbf9ae7..2dd68f87dca5 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -156,8 +156,7 @@ struct btrfs_backref_iter { u32 end_ptr; }; -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag); +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3d0a63465a74..77d03dce2521 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -473,7 +473,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( int ret; int err = 0; - iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS); + iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); -- cgit
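The same refactoring pattern drives the next several commits: once every caller of an allocation helper passes the same gfp_t value, the parameter carries no information and can be sunk into the helper. A minimal sketch of the before and after shapes, using hypothetical names rather than the actual btrfs functions (kzalloc() and GFP_NOFS come from <linux/slab.h> and <linux/gfp.h>):

	/* Before: every caller must thread the allocation flags through. */
	struct foo *foo_alloc(gfp_t gfp_flag)
	{
		return kzalloc(sizeof(struct foo), gfp_flag);	/* struct foo is a hypothetical type */
	}

	/* After: the single caller's GFP_NOFS is folded into the helper. */
	struct foo *foo_alloc(void)
	{
		return kzalloc(sizeof(struct foo), GFP_NOFS);
	}

Call sites shrink from foo_alloc(GFP_NOFS) to foo_alloc(), which is exactly the shape of the btrfs_backref_iter_alloc() change above.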
From e2896e791001b778e3bf8466dd8c6104245c08a2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:55:09 +0200 Subject: btrfs: sink gfp_t parameter to btrfs_qgroup_trace_extent All callers pass GFP_NOFS, so we can drop the parameter and use it directly. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 17 +++++++---------- fs/btrfs/qgroup.h | 2 +- fs/btrfs/tree-log.c | 3 +-- 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 39f67ca4b4ca..4786b59035a8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1842,7 +1842,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, } int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag) + u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; @@ -1852,7 +1852,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - record = kzalloc(sizeof(*record), gfp_flag); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM; @@ -1904,8 +1904,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, - GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); if (ret) return ret; } @@ -2104,12 +2103,11 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, * blocks for qgroup accounting. */ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, - nodesize, GFP_NOFS); + nodesize); if (ret < 0) goto out; - ret = btrfs_qgroup_trace_extent(trans, - dst_path->nodes[dst_level]->start, - nodesize, GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, + nodesize); if (ret < 0) goto out; @@ -2393,8 +2391,7 @@ walk_down: path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, - fs_info->nodesize, - GFP_NOFS); + fs_info->nodesize); if (ret) goto out; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 3fb5459c9309..7bffa10589d6 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -321,7 +321,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * (NULL trans) */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag); + u64 num_bytes); /* * Inform qgroup to trace all leaf items of data diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dc49f0aae1fb..8f377aca3409 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -749,8 +749,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item), - GFP_NOFS); + btrfs_file_extent_disk_num_bytes(eb, item)); if (ret < 0) goto out; -- cgit From fe10158c759cc978d0def69b5cb5efabecb42653 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:59:35 +0200 Subject: btrfs: switch GFP_NOFS to GFP_KERNEL in scrub_setup_recheck_block There's only one caller of scrub_setup_recheck_block, and it runs inside memalloc_nofs_save/_restore protection, so the allocation is effectively GFP_NOFS already and it's safe to use GFP_KERNEL. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5c6c4aa2fb09..6e70cc348ae2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1493,7 +1493,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, return -EIO; } - recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); + recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); if (!recover) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1516,7 +1516,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = alloc_scrub_sector(sblock, logical, GFP_NOFS); + sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; -- cgit
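The GFP_KERNEL switch above relies on scoped allocation constraints: inside a memalloc_nofs_save()/memalloc_nofs_restore() section the page allocator implicitly drops __GFP_FS from every allocation, so a GFP_KERNEL request behaves as GFP_NOFS anyway. A condensed sketch of that calling pattern, with do_recheck() as a hypothetical stand-in for the protected scrub path:

	#include <linux/sched/mm.h>

	unsigned int nofs_flag;
	int ret;

	/* All allocations in this section are implicitly GFP_NOFS. */
	nofs_flag = memalloc_nofs_save();
	ret = do_recheck();		/* hypothetical; may use GFP_KERNEL internally */
	memalloc_nofs_restore(nofs_flag);

Annotating the allocation site with GFP_KERNEL then records the function's real requirement, while the caller's scope still enforces NOFS behavior where it matters.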
From 02bc392798f9bf8105d50b2fec44b5b2e3c80d47 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 16:00:55 +0200 Subject: btrfs: sink gfp_t parameter to alloc_scrub_sector All callers pass GFP_KERNEL as the parameter, so we can use it directly in alloc_scrub_sector. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6e70cc348ae2..3aed760e1a80 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -297,7 +297,7 @@ static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, * Will also allocate new pages for @sblock if needed. */ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, - u64 logical, gfp_t gfp) + u64 logical) { const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; struct scrub_sector *ssector; @@ -305,7 +305,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, /* We must never have scrub_block exceed U32_MAX in size. */ ASSERT(logical - sblock->logical < U32_MAX); - ssector = kzalloc(sizeof(*ssector), gfp); + ssector = kzalloc(sizeof(*ssector), GFP_KERNEL); if (!ssector) return NULL; @@ -313,7 +313,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, if (!sblock->pages[page_index]) { int ret; - sblock->pages[page_index] = alloc_page(gfp); + sblock->pages[page_index] = alloc_page(GFP_KERNEL); if (!sblock->pages[page_index]) { kfree(ssector); return NULL; @@ -1516,7 +1516,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2438,7 +2438,7 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2769,7 +2769,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity, for (index = 0; len > 0; index++) { struct scrub_sector *sector; - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; -- cgit From 82c0efd3cd5d4ecce028da75d29e3345535b3389 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 19 Oct 2022 21:51:42 +0530 Subject: btrfs: merge module cleanup sequence to one helper The module exit function exit_btrfs_fs() is duplicating a section of code in init_btrfs_fs(). Add a helper to remove the duplicated code. Due to the init/exit section requirements, the function must be inline and not a plain static, as that could cause a section mismatch. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c02fcdb9c7c4..75b1c096e4f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2837,7 +2837,7 @@ static const struct init_sequence mod_init_seq[] = { static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; -static void __exit exit_btrfs_fs(void) +static __always_inline void btrfs_exit_btrfs_fs(void) { int i; @@ -2850,6 +2850,11 @@ static void __exit exit_btrfs_fs(void) } } +static void __exit exit_btrfs_fs(void) +{ + btrfs_exit_btrfs_fs(); +} + static int __init init_btrfs_fs(void) { int ret; @@ -2858,26 +2863,13 @@ static int __init init_btrfs_fs(void) for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { ASSERT(!mod_init_result[i]); ret = mod_init_seq[i].init_func(); - if (ret < 0) - goto error; + if (ret < 0) { + btrfs_exit_btrfs_fs(); + return ret; + } mod_init_result[i] = true; } return 0; - -error: - /* - * If we call exit_btrfs_fs() it would cause section mismatch. - * As init_btrfs_fs() belongs to .init.text, while exit_btrfs_fs() - * belongs to .exit.text. - */ - for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { - if (!mod_init_result[i]) - continue; - if (mod_init_seq[i].exit_func) - mod_init_seq[i].exit_func(); - mod_init_result[i] = false; - } - return ret; } late_initcall(init_btrfs_fs); -- cgit
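The consolidation above centers on a simple rollback pattern: module load walks a table of init/exit pairs forward, and teardown (whether from a failed load or from module exit) walks it backward, skipping stages that never initialized. A condensed sketch of the shared teardown walk, reconstructed from the names visible in the diff (the exact field layout of struct init_sequence is assumed):

	struct init_sequence {
		int (*init_func)(void);		/* assumed signature */
		void (*exit_func)(void);	/* NULL when a stage needs no teardown */
	};

	/* Undo in reverse order, only for stages that actually came up. */
	for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) {
		if (!mod_init_result[i])
			continue;
		if (mod_init_seq[i].exit_func)
			mod_init_seq[i].exit_func();
		mod_init_result[i] = false;
	}

Making this walk an __always_inline helper lets the __init error path and the __exit function share one copy without triggering modpost section-mismatch warnings.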
- */ - for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { - if (!mod_init_result[i]) - continue; - if (mod_init_seq[i].exit_func) - mod_init_seq[i].exit_func(); - mod_init_result[i] = false; - } - return ret; } late_initcall(init_btrfs_fs); -- cgit From e43eec81c5167b655b72c781b0e75e62a05e415e Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:25 -0400 Subject: btrfs: use struct qstr instead of name and namelen pairs Many functions throughout btrfs take a name buffer and a name length as separate arguments. At the highest level, most of these functions are called with these arguments extracted from a supplied dentry's name. But the entire name can be passed instead, making each function a little more elegant. Each function whose arguments are currently the name and length extracted from a dentry is herein converted to instead take a pointer to the name in the dentry. The couple of calls to these functions without a struct dentry are converted to create an appropriate qstr to pass in. Additionally, every function that is only called with a name/len pair extracted directly from a qstr is also converted. This change has a positive effect on stack consumption, as the frame of many functions is reduced, and it will also be used in the future for fscrypt-related structures. Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba
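To make the new calling convention concrete, here is a minimal sketch of the before/after signatures; the lookup_dir_*() helpers are hypothetical stand-ins for the converted btrfs functions, and QSTR_INIT comes from <linux/dcache.h>:

#include <linux/types.h>
#include <linux/dcache.h>

/* Before: callers unpack the name into a buffer/length pair. */
int lookup_dir_old(u64 dir, const char *name, int name_len);

/* After: callers pass one qstr pointer. */
int lookup_dir_new(u64 dir, const struct qstr *name);

static int example(struct dentry *dentry)
{
	/* A dentry already carries its name as a qstr... */
	int ret = lookup_dir_new(1234, &dentry->d_name);

	if (!ret) {
		/* ...and callers without a dentry build one on the stack. */
		struct qstr fixed = QSTR_INIT("default", 7);

		ret = lookup_dir_new(1234, &fixed);
	}
	return ret;
}

The on-stack QSTR_INIT("default", 7) pattern is exactly what the conversion below uses in may_destroy_subvol(), btrfs_ioctl_default_subvol() and get_default_subvol_objectid().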
--- fs/btrfs/ctree.h | 26 +++-- fs/btrfs/dir-item.c | 50 +++++---- fs/btrfs/inode-item.c | 73 +++++++------- fs/btrfs/inode-item.h | 20 ++-- fs/btrfs/inode.c | 130 ++++++++++-------------- fs/btrfs/ioctl.c | 7 +- fs/btrfs/root-tree.c | 19 ++-- fs/btrfs/send.c | 12 ++- fs/btrfs/super.c | 3 +- fs/btrfs/transaction.c | 11 +- fs/btrfs/tree-log.c | 267 ++++++++++++++++++++++--------------------- fs/btrfs/tree-log.h | 4 +- 12 files changed, 287 insertions(+), 335 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3dcfb62af68a..9e8c9f9bc4fb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1516,11 +1516,11 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len); + u64 ref_id, u64 dirid, u64 sequence, + const struct qstr *name); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len); + u64 ref_id, u64 dirid, u64 *sequence, + const struct qstr *name); int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1549,25 +1549,23 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, + const struct qstr *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct qstr *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const char *name, int name_len, - int mod); + const struct qstr *name, int mod); struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod); + u64 index, const struct qstr *name, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, - const char *name, int name_len); + const struct qstr *name); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1648,10 +1646,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len); + const struct qstr *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index); + const struct qstr *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 9fa37f245c43..48a15af4ec57 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -105,8 +105,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * to use for the second index (if one is created). * Will return 0 or -ENOMEM */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct qstr *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -122,7 +122,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -130,9 +130,9 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, btrfs_cpu_key_to_disk(&disk_key, location); - data_size = sizeof(*dir_item) + name_len; + data_size = sizeof(*dir_item) + name->len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, - name, name_len); + name->name, name->len); if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); if (ret == -EEXIST) @@ -144,11 +144,11 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_type(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); - btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); - write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, name->name, name_ptr, name->len); btrfs_mark_buffer_dirty(leaf); second_insert: @@ -159,7 +159,7 @@ second_insert: } btrfs_release_path(path); - ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir, + ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); @@ -208,7 +208,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, -
const char *name, int name_len, + const struct qstr *name, int mod) { struct btrfs_key key; @@ -216,9 +216,10 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; @@ -226,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len) + const struct qstr *name) { int ret; struct btrfs_key key; @@ -242,9 +243,10 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0); + di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, + name->len, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ @@ -264,11 +266,8 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, goto out; } - /* - * see if there is room in the item to insert this - * name - */ - data_size = sizeof(*di) + name_len; + /* See if there is room in the item to insert this name. */ + data_size = sizeof(*di) + name->len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + @@ -305,8 +304,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod) + u64 index, const struct qstr *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -315,7 +313,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DIR_INDEX_KEY; key.offset = index; - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (di == ERR_PTR(-ENOENT)) return NULL; @@ -323,9 +322,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, } struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len) +btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, + u64 dirid, const struct qstr *name) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -340,7 +338,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, break; di = btrfs_match_dir_item_name(root->fs_info, path, - name, name_len); + name->name, name->len); if (di) return di; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b8dbabfa8b31..577e39cfc411 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -14,8 +14,8 @@ #include "accessors.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len) + int slot, + const struct qstr *name) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -31,9 +31,10 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, len = btrfs_inode_ref_name_len(leaf, ref); name_ptr = (unsigned long)(ref + 1); 
cur_offset += len + sizeof(*ref); - if (len != name_len) + if (len != name->len) continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + if (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0) return ref; } return NULL; @@ -41,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len) + const struct qstr *name) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -64,9 +65,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( name_ptr = (unsigned long)(&extref->name); ref_name_len = btrfs_inode_extref_name_len(leaf, extref); - if (ref_name_len == name_len && + if (ref_name_len == name->len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && - (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) + (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0)) return extref; cur_offset += ref_name_len + sizeof(*extref); @@ -79,7 +81,7 @@ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { @@ -88,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) @@ -96,13 +98,13 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, if (ret > 0) return NULL; return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { @@ -111,14 +113,14 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_inode_extref *extref; struct extent_buffer *leaf; int ret; - int del_len = name_len + sizeof(*extref); + int del_len = name->len + sizeof(*extref); unsigned long ptr; unsigned long item_start; u32 item_size; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -136,7 +138,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * readonly. 
*/ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); if (!extref) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; @@ -172,8 +174,7 @@ out: } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; @@ -186,7 +187,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, u32 sub_item_len; int ret; int search_ext_refs = 0; - int del_len = name_len + sizeof(*ref); + int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -205,8 +206,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } - ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name, - name_len); + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); if (!ref) { ret = -ENOENT; search_ext_refs = 1; @@ -223,7 +223,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } ptr = (unsigned long)ref; - sub_item_len = name_len + sizeof(*ref); + sub_item_len = name->len + sizeof(*ref); item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); @@ -237,7 +237,7 @@ out: * name in our ref array. Find and remove the extended * inode ref then. */ - return btrfs_del_inode_extref(trans, root, name, name_len, + return btrfs_del_inode_extref(trans, root, name, inode_objectid, ref_objectid, index); } @@ -251,12 +251,13 @@ out: */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index) + const struct qstr *name, + u64 inode_objectid, u64 ref_objectid, + u64 index) { struct btrfs_inode_extref *extref; int ret; - int ins_len = name_len + sizeof(*extref); + int ins_len = name->len + sizeof(*extref); unsigned long ptr; struct btrfs_path *path; struct btrfs_key key; @@ -264,7 +265,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -276,7 +277,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, name_len)) + name)) goto out; btrfs_extend_item(path, ins_len); @@ -290,12 +291,12 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; extref = (struct btrfs_inode_extref *)ptr; - btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len); btrfs_set_inode_extref_index(path->nodes[0], extref, index); btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); ptr = (unsigned long)&extref->name; - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -305,8 +306,7 @@ out: /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct 
btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -315,7 +315,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; unsigned long ptr; int ret; - int ins_len = name_len + sizeof(*ref); + int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -331,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EEXIST) { u32 old_size; ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len); + name); if (ref) goto out; @@ -340,7 +340,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); ret = 0; @@ -348,7 +348,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len)) + name)) ret = -EEXIST; else ret = -EMLINK; @@ -357,11 +357,11 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); } - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -374,7 +374,6 @@ out: if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) ret = btrfs_insert_inode_extref(trans, root, name, - name_len, inode_objectid, ref_objectid, index); } diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index a8fc16d0147f..3c657c670cfd 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -64,33 +64,31 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); + struct btrfs_root *root, const struct qstr *name, + u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int 
name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len); + int slot, + const struct qstr *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len); + const struct qstr *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb2a6d7f6252..3162d8f0aaf8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3641,7 +3641,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delayed_iput_lock); } -/** +/* * Wait for flushing all delayed iputs * * @fs_info: the filesystem @@ -4286,7 +4286,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len, + const struct qstr *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; @@ -4304,8 +4304,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto out; } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto err; @@ -4333,12 +4332,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } } - ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, - dir_ino, &index); + ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { btrfs_info(fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", - name_len, name, ino, dir_ino); + name->len, name->name, ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } @@ -4359,10 +4357,8 @@ skip_backref: * operations on the log tree, increasing latency for applications. 
*/ if (!rename_ctx) { - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); + btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, dir, index); } /* @@ -4380,7 +4376,7 @@ err: if (ret) goto out; - btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); @@ -4393,10 +4389,11 @@ out: int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const struct qstr *name) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); + + ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4440,9 +4437,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &dentry->d_name); if (ret) goto out; @@ -4467,8 +4463,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - const char *name = dentry->d_name.name; - int name_len = dentry->d_name.len; + const struct qstr *name = &dentry->d_name; u64 index; int ret; u64 objectid; @@ -4487,8 +4482,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -4514,8 +4508,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. 
*/ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, - name, name_len); + di = btrfs_search_dir_index_item(root, path, dir_ino, name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4532,7 +4525,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, name, name_len); + &index, name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4545,7 +4538,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name->len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4567,6 +4560,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; + struct qstr name = QSTR_INIT("default", 7); u64 dir_id; int ret; @@ -4577,7 +4571,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) /* Make sure this root isn't set as the default subvol */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, - dir_id, "default", 7, 0); + dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { @@ -4844,9 +4838,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &dentry->d_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -5538,8 +5531,7 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const struct qstr *name = &dentry->d_name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -5550,7 +5542,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - name, namelen, 0); + name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -5562,7 +5554,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name, btrfs_ino(BTRFS_I(dir)), + __func__, name->name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) @@ -6321,8 +6313,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, { struct inode *dir = args->dir; struct inode *inode = args->inode; - const char *name = args->orphan ? NULL : args->dentry->d_name.name; - int name_len = args->orphan ? 0 : args->dentry->d_name.len; + const struct qstr *name = args->orphan ? 
NULL : &args->dentry->d_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; @@ -6423,7 +6414,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, sizes[1] = 2 + sizeof(*ref); } else { key[1].offset = btrfs_ino(BTRFS_I(dir)); - sizes[1] = name_len + sizeof(*ref); + sizes[1] = name->len + sizeof(*ref); } } @@ -6462,10 +6453,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_ref_index(path->nodes[0], ref, 0); write_extent_buffer(path->nodes[0], "..", ptr, 2); } else { - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, + name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->dir_index); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, + name->len); } } @@ -6526,7 +6519,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 0, BTRFS_I(inode)->dir_index); + 0, BTRFS_I(inode)->dir_index); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -6555,7 +6548,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index) + const struct qstr *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6574,17 +6567,17 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - index, name, name_len); + index, name); } else if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, - parent_ino, index); + ret = btrfs_insert_inode_ref(trans, root, name, + ino, parent_ino, index); } /* Nothing to clean up yet */ if (ret) return ret; - ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; @@ -6594,7 +6587,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + - name_len * 2); + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); /* * If we are replaying a log tree, we do not want to update the mtime @@ -6619,15 +6612,15 @@ fail_dir_item: int err; err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - &local_index, name, name_len); + &local_index, name); if (err) btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; - err = btrfs_del_inode_ref(trans, root, name, name_len, - ino, parent_ino, &local_index); + err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, + &local_index); if (err) btrfs_abort_transaction(trans, err); } @@ -6747,7 +6740,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, dentry->d_name.len, 1, index); + &dentry->d_name, 1, index); if (err) { drop_inode = 1; @@ -9088,9 +9081,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); @@ -9104,9 +9095,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, - old_dentry->d_name.name, - old_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, root, &old_dentry->d_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); @@ -9142,8 +9131,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, + &old_dentry->d_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); @@ -9159,8 +9147,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, + &new_dentry->d_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); @@ -9171,16 +9158,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, old_idx); + &new_dentry->d_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, 0, new_idx); + &old_dentry->d_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9281,8 +9266,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* check for collisions, even if the name isn't there */ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, - new_dentry->d_name.name, - new_dentry->d_name.len); + &new_dentry->d_name); if (ret) { if (ret == -EEXIST) { @@ -9376,9 +9360,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) @@ -9402,10 +9384,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), - BTRFS_I(d_inode(old_dentry)), - old_dentry->d_name.name, - old_dentry->d_name.len, - &rename_ctx); + BTRFS_I(d_inode(old_dentry)), + &old_dentry->d_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9424,8 +9404,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - new_dentry->d_name.name, - new_dentry->d_name.len); + &new_dentry->d_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9437,8 +9416,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, index); + &new_dentry->d_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e33be4032fff..fde55d0f61b4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -951,6 +951,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; + struct qstr name_str = QSTR_INIT(name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); @@ -971,8 +972,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, * check for them now when we can safely fail */ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, - dir->i_ino, name, - namelen); + dir->i_ino, &name_str); if (error) goto out_dput; @@ -3779,6 +3779,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; + struct qstr name = QSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; @@ -3822,7 +3823,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, - dir_id, "default", 7, 1); + dir_id, &name, 1); if (IS_ERR_OR_NULL(di)) { btrfs_release_path(path); btrfs_end_transaction(trans); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index fc00dfb281d5..848a720747af 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -330,9 +330,8 @@ out: } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len) - + u64 ref_id, u64 dirid, u64 *sequence, + const struct qstr *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; @@ -359,8 +358,8 @@ again: struct btrfs_root_ref); ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || - (btrfs_root_ref_name_len(leaf, ref) != name_len) || - memcmp_extent_buffer(leaf, name, ptr, name_len)) { + (btrfs_root_ref_name_len(leaf, ref) != name->len) || + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { ret = -ENOENT; goto out; } @@ 
-403,8 +402,8 @@ out: * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len) + u64 ref_id, u64 dirid, u64 sequence, + const struct qstr *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; @@ -423,7 +422,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, - sizeof(*ref) + name_len); + sizeof(*ref) + name->len); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); @@ -434,9 +433,9 @@ again: ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); btrfs_set_root_ref_dirid(leaf, ref, dirid); btrfs_set_root_ref_sequence(leaf, ref, sequence); - btrfs_set_root_ref_name_len(leaf, ref, name_len); + btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); - write_extent_buffer(leaf, name, ptr, name_len); + write_extent_buffer(leaf, name->name, ptr, name->len); btrfs_mark_buffer_dirty(leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 31fb3292e816..d0feeea2ba1c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1597,13 +1597,17 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { + struct qstr tmp_name; + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); ASSERT(len < sizeof(tmp)); + tmp_name.name = tmp; + tmp_name.len = strlen(tmp); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1623,7 +1627,7 @@ static int gen_unique_name(struct send_ctx *sctx, di = btrfs_lookup_dir_item(NULL, sctx->parent_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1753,13 +1757,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root, struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_path *path; + struct qstr name_str = QSTR_INIT(name, name_len); path = alloc_path_for_send(); if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, - dir, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 75b1c096e4f1..13d33e5707de 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1423,6 +1423,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_key location; + struct qstr name = QSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); @@ -1435,7 +1436,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * to mount. 
*/ dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { btrfs_free_path(path); return PTR_ERR(di); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 25e6b504edb4..7db9612f4d0e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1678,8 +1678,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - dentry->d_name.name, - dentry->d_name.len, 0); + &dentry->d_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1774,7 +1773,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - dentry->d_name.name, dentry->d_name.len); + &dentry->d_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1806,9 +1805,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, dentry->d_name.name, - dentry->d_name.len, BTRFS_I(parent_inode), - &key, BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, &dentry->d_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8f377aca3409..b14ac7017f41 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -597,6 +597,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans, return do_overwrite_item(trans, root, path, eb, slot, key); } +static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, + struct qstr *name) +{ + char *buf; + + buf = kmalloc(len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(eb, buf, (unsigned long)start, len); + name->name = buf; + name->len = len; + return 0; +} + /* * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log @@ -902,12 +917,11 @@ out: static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, - int name_len) + const struct qstr *name) { int ret; - ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = btrfs_unlink_inode(trans, dir, inode, name); if (ret) return ret; /* @@ -934,8 +948,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct inode *inode; - char *name; - int name_len; + struct qstr name; struct extent_buffer *leaf; struct btrfs_key location; int ret; @@ -943,12 +956,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - name_len = btrfs_dir_name_len(leaf, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) + ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); + if (ret) return -ENOMEM; - read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(path); inode = read_one_inode(root, location.objectid); @@ -961,10 +972,9 @@ static 
noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, - name_len); + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); out: - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -979,14 +989,14 @@ out: static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, - const char *name, int name_len) + struct qstr *name) { struct btrfs_dir_item *di; struct btrfs_key location; int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, - index, name, name_len, 0); + index, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -999,7 +1009,7 @@ static noinline int inode_in_dir(struct btrfs_root *root, } btrfs_release_path(path); - di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -1026,7 +1036,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - const char *name, int namelen) + const struct qstr *name) { struct btrfs_path *path; int ret; @@ -1046,12 +1056,10 @@ static noinline int backref_in_log(struct btrfs_root *log, if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, namelen); + ref_objectid, name); else ret = !!btrfs_find_name_in_backref(path->nodes[0], - path->slots[0], - name, namelen); + path->slots[0], name); out: btrfs_free_path(path); return ret; @@ -1064,11 +1072,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, char *name, int namelen) + u64 ref_index, struct qstr *name) { int ret; - char *victim_name; - int victim_name_len; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; @@ -1100,43 +1106,40 @@ again: ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { - victim_ref = (struct btrfs_inode_ref *)ptr; - victim_name_len = btrfs_inode_ref_name_len(leaf, - victim_ref); - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; + struct qstr victim_name; - read_extent_buffer(leaf, victim_name, - (unsigned long)(victim_ref + 1), - victim_name_len); + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, dir, inode, - victim_name, victim_name_len); - kfree(victim_name); + &victim_name); + kfree(victim_name.name); if (ret) return ret; goto again; } - kfree(victim_name); + kfree(victim_name.name); - ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; } } btrfs_release_path(path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + extref = btrfs_lookup_inode_extref(NULL, root, path, 
name, inode_objectid, parent_objectid, 0, 0); if (IS_ERR(extref)) { @@ -1153,29 +1156,28 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)(base + cur_offset); + struct qstr victim_name; - victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + extref = (struct btrfs_inode_extref *)(base + cur_offset); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; - read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, - victim_name_len); + ret = read_alloc_one_name(leaf, &extref->name, + btrfs_inode_extref_name_len(leaf, extref), + &victim_name); + if (ret) + return ret; search_key.objectid = inode_objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = btrfs_extref_hash(parent_objectid, - victim_name, - victim_name_len); + victim_name.name, + victim_name.len); ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { ret = -ENOENT; @@ -1187,26 +1189,24 @@ again: ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), - inode, - victim_name, - victim_name_len); + inode, &victim_name); } iput(victim_parent); - kfree(victim_name); + kfree(victim_name.name); if (ret) return ret; goto again; } - kfree(victim_name); + kfree(victim_name.name); next: - cur_offset += victim_name_len + sizeof(*extref); + cur_offset += victim_name.len + sizeof(*extref); } } btrfs_release_path(path); /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - ref_index, name, namelen, 0); + ref_index, name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1217,8 +1217,7 @@ next: btrfs_release_path(path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), - name, namelen, 0); + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1232,20 +1231,18 @@ next: } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, + struct qstr *name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; + int ret; extref = (struct btrfs_inode_extref *)ref_ptr; - *namelen = btrfs_inode_extref_name_len(eb, extref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)&extref->name, - *namelen); + ret = read_alloc_one_name(eb, &extref->name, + btrfs_inode_extref_name_len(eb, extref), name); + if (ret) + return ret; if (index) *index = btrfs_inode_extref_index(eb, extref); @@ -1256,18 +1253,17 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) + struct qstr *name, u64 *index) { struct btrfs_inode_ref *ref; + int ret; ref = (struct btrfs_inode_ref *)ref_ptr; - *namelen = btrfs_inode_ref_name_len(eb, ref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref), + name); + if (ret) + return ret; 
if (index) *index = btrfs_inode_ref_index(eb, ref); @@ -1309,28 +1305,24 @@ again: ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { - char *name = NULL; - int namelen; + struct qstr name; u64 parent_id; if (key->type == BTRFS_INODE_EXTREF_KEY) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); } else { parent_id = key->offset; - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - NULL); + ret = ref_get_fields(eb, ref_ptr, &name, NULL); } if (ret) goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen); + parent_id, &name); else - ret = !!btrfs_find_name_in_backref(log_eb, log_slot, - name, namelen); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { struct inode *dir; @@ -1339,20 +1331,20 @@ again: dir = read_one_inode(root, parent_id); if (!dir) { ret = -ENOENT; - kfree(name); + kfree(name.name); goto out; } ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, name, namelen); - kfree(name); + inode, &name); + kfree(name.name); iput(dir); if (ret) goto out; goto again; } - kfree(name); - ref_ptr += namelen; + kfree(name.name); + ref_ptr += name.len; if (key->type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else @@ -1381,8 +1373,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - char *name = NULL; - int namelen; + struct qstr name; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1426,7 +1417,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); /* * parent object can change from one array @@ -1439,15 +1430,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); } if (ret) goto out; ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen); + btrfs_ino(BTRFS_I(inode)), ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1461,7 +1450,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = __add_inode_ref(trans, root, path, log, BTRFS_I(dir), BTRFS_I(inode), inode_objectid, parent_objectid, - ref_index, name, namelen); + ref_index, &name); if (ret) { if (ret == 1) ret = 0; @@ -1470,7 +1459,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - name, namelen, 0, ref_index); + &name, 0, ref_index); if (ret) goto out; @@ -1480,9 +1469,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* Else, ret == 1, we already have a perfect match, we're done. 
*/ - ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; - kfree(name); - name = NULL; + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; + kfree(name.name); + name.name = NULL; if (log_ref_ver) { iput(dir); dir = NULL; @@ -1506,7 +1495,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, root, path, eb, slot, key); out: btrfs_release_path(path); - kfree(name); + kfree(name.name); iput(dir); iput(inode); return ret; @@ -1778,7 +1767,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, - char *name, int name_len, + const struct qstr *name, struct btrfs_key *location) { struct inode *inode; @@ -1796,7 +1785,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 1, index); + 1, index); /* FIXME, put inode into FIXUP list */ @@ -1856,8 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - char *name; - int name_len; + struct qstr name; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -1875,17 +1863,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!dir) return -EIO; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } log_type = btrfs_dir_type(eb, di); - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); @@ -1895,7 +1877,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = 0; dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); + &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); goto out; @@ -1912,7 +1894,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, key->objectid, key->offset, - name, name_len, 1); + &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); goto out; @@ -1940,7 +1922,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); + ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { goto out; } else if (ret) { @@ -1953,8 +1935,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, key->objectid, name, - name_len); + ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out; } else if (ret) { @@ -1965,7 +1946,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } btrfs_release_path(path); ret = insert_one_name(trans, root, key->objectid, key->offset, - name, name_len, &log_key); + &name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; if (!ret) @@ 
-1975,10 +1956,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } - kfree(name); + kfree(name.name); iput(dir); if (!ret && name_added) ret = 1; @@ -2144,8 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - int name_len; - char *name; + struct qstr name; struct inode *inode = NULL; struct btrfs_key location; @@ -2160,22 +2140,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, eb = path->nodes[0]; slot = path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } - - read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); if (log) { struct btrfs_dir_item *log_di; log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, - dir_key->offset, - name, name_len, 0); + dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); goto out; @@ -2201,7 +2175,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, inc_nlink(inode); ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - name, name_len); + &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2210,7 +2184,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_release_path(log_path); - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -3449,7 +3423,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, - const char *name, int name_len, + const struct qstr *name, u64 index) { struct btrfs_dir_item *di; @@ -3459,7 +3433,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * for dir item keys. 
*/ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); + index, name, -1); if (IS_ERR(di)) return PTR_ERR(di); else if (!di) @@ -3496,7 +3470,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *dir, u64 index) { struct btrfs_path *path; @@ -3523,7 +3497,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), - name, name_len, index); + name, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); @@ -3535,7 +3509,7 @@ out_unlock: /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; @@ -3556,7 +3530,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) @@ -5270,6 +5244,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u32 this_len; unsigned long name_ptr; struct btrfs_dir_item *di; + struct qstr name_str; if (key->type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -5303,8 +5278,11 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } read_extent_buffer(eb, name, name_ptr, this_name_len); + + name_str.name = name; + name_str.len = this_name_len; di = btrfs_lookup_dir_item(NULL, inode->root, search_path, - parent, name, this_name_len, 0); + parent, &name_str, 0); if (di && !IS_ERR(di)) { struct btrfs_key di_key; @@ -7505,8 +7483,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - old_dentry->d_name.name, - old_dentry->d_name.len, old_dir_index); + &old_dentry->d_name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f5770829d075..8c9a387151b0 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -87,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); -- cgit From ab3c5c18e8fa3f8ea116016095d25adab466cd39 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:26 -0400 Subject: btrfs: setup qstr from dentrys using fscrypt helper Most places where we get a struct qstr, we are doing so from a dentry. With fscrypt, the dentry's name may be encrypted on-disk, so fscrypt provides a helper to convert a dentry name to the appropriate disk name if necessary. 
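In code form, the pattern this patch applies looks roughly like the sketch below (the wrapper and its locals are illustrative only, not part of the patch; it assumes <linux/fscrypt.h> and <linux/dcache.h>, and the third argument to fscrypt_setup_filename() is 1 on lookup/deletion paths and 0 otherwise):

	/*
	 * Minimal sketch of the conversion pattern; example_setup_name()
	 * and its locals are hypothetical names for illustration.
	 */
	static int example_setup_name(struct inode *dir, struct dentry *dentry)
	{
		struct fscrypt_name fname;
		struct qstr name;
		int ret;

		/* May allocate; returns 0 on success or a negative errno. */
		ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
		if (ret)
			return ret;

		/* View the (possibly encrypted) on-disk name as a qstr. */
		name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name);

		/* ... hand &name to the btrfs helpers that take a qstr ... */

		fscrypt_free_filename(&fname);
		return 0;
	}
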
Convert each of the dentry name accesses to use fscrypt_setup_filename(), then convert the resulting fscrypt_name back to an unencrypted qstr. This does not work for nokey names, but the specific locations that could spawn nokey names are noted. At present, since there are no encrypted directories, nothing goes down the filename encryption paths. Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 + fs/btrfs/inode.c | 192 +++++++++++++++++++++++++++++++++++++------------ fs/btrfs/transaction.c | 40 +++++++---- fs/btrfs/tree-log.c | 11 ++- 4 files changed, 189 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9e8c9f9bc4fb..b2016c14488b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" @@ -1674,6 +1675,8 @@ struct btrfs_new_inode_args { */ struct posix_acl *default_acl; struct posix_acl *acl; + struct fscrypt_name fname; + struct qstr name; }; int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3162d8f0aaf8..fa4b101bc1eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4429,28 +4429,41 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; + struct fscrypt_name fname; + struct qstr name; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fscrypt_free; + } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &dentry->d_name); + &name); if (ret) - goto out; + goto end_trans; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) - goto out; + goto end_trans; } -out: +end_trans: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); +fscrypt_free: + fscrypt_free_filename(&fname); return ret; } @@ -4463,11 +4476,19 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - const struct qstr *name = &dentry->d_name; + struct qstr name; u64 index; int ret; u64 objectid; u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = inode->root->root_key.objectid; @@ -4475,14 +4496,17 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, objectid = inode->location.objectid; } else { WARN_ON(1); + fscrypt_free_filename(&fname); return -EINVAL; } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, &name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -4508,7 +4532,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, name); + di = btrfs_search_dir_index_item(root, path, dir_ino, &name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4525,7 +4549,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, name); + &index, &name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4538,7 +4562,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name->len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name.len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4547,6 +4571,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return ret; } @@ -4810,6 +4835,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; + struct fscrypt_name fname; + struct qstr name; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4822,9 +4849,18 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return btrfs_delete_subvolume(dir, dentry); } + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (err) + return err; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ + trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_notrans; + } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, dir, dentry); @@ -4839,7 +4875,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* now the directory is empty */ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &dentry->d_name); + &name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -4858,7 +4894,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); +out_notrans: btrfs_btree_balance_dirty(fs_info); + fscrypt_free_filename(&fname); return err; } @@ -5531,18 +5569,27 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - const struct qstr *name = &dentry->d_name; + struct qstr name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; int ret = 0; + struct fscrypt_name fname; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + goto out; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - name, 0); + &name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -5554,12 +5601,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name->name, btrfs_ino(BTRFS_I(dir)), + __func__, name.name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) *type = btrfs_dir_type(path->nodes[0], di); out: + fscrypt_free_filename(&fname); btrfs_free_path(path); return ret; } @@ -5582,6 +5630,14 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key key; int ret; int err = 0; + struct fscrypt_name fname; + struct qstr name; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (ret) + return ret; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); path = btrfs_alloc_path(); if (!path) { @@ -5604,12 +5660,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || - btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + btrfs_root_ref_name_len(leaf, ref) != name.len) goto out; - ret = memcmp_extent_buffer(leaf, dentry->d_name.name, - (unsigned long)(ref + 1), - dentry->d_name.len); + ret = memcmp_extent_buffer(leaf, name.name, (unsigned long)(ref + 1), + name.len); if (ret) goto out; @@ -5628,6 +5683,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, err = 0; out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return err; } @@ -6236,9 +6292,19 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, struct inode *inode = args->inode; int ret; + if (!args->orphan) { + ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, + &args->fname); + if (ret) + return ret; + args->name = (struct qstr)FSTR_TO_QSTR(&args->fname.disk_name); + } + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); - if (ret) + if (ret) { + fscrypt_free_filename(&args->fname); return ret; + } /* 1 to add inode item */ *trans_num_items = 1; @@ -6278,6 +6344,7 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) { posix_acl_release(args->acl); posix_acl_release(args->default_acl); + fscrypt_free_filename(&args->fname); } /* @@ -6703,6 +6770,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct fscrypt_name fname; + struct qstr name; u64 index; int err; int drop_inode = 0; @@ -6714,6 +6783,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto fail; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6740,7 +6815,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &dentry->d_name, 1, index); + &name, 1, index); if (err) { drop_inode = 1; @@ -6764,6 +6839,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } fail: + fscrypt_free_filename(&fname); if (trans) 
btrfs_end_transaction(trans); if (drop_inode) { @@ -9003,6 +9079,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + struct fscrypt_name old_fname, new_fname; + struct qstr old_name, new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9014,6 +9092,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); + new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); + /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9081,8 +9172,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, - old_ino, + ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) @@ -9095,8 +9185,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, &old_dentry->d_name, - new_ino, + ret = btrfs_insert_inode_ref(trans, root, &old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { @@ -9131,8 +9220,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - &old_dentry->d_name, - &old_rename_ctx); + &old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9147,8 +9235,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - &new_dentry->d_name, - &new_rename_ctx); + &new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9158,14 +9245,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_dentry->d_name, 0, old_idx); + &new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - &old_dentry->d_name, 0, new_idx); + &old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9208,6 +9295,8 @@ out_notrans: old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + fscrypt_free_filename(&new_fname); + fscrypt_free_filename(&old_fname); return ret; } @@ -9247,6 +9336,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + struct fscrypt_name old_fname, new_fname; + struct qstr old_name, new_name; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9263,21 +9354,32 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, 
&old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); + new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, - &new_dentry->d_name); + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ if (WARN_ON(!new_inode)) { - return ret; + goto out_fscrypt_names; } } else { /* maybe -EOVERFLOW */ - return ret; + goto out_fscrypt_names; } } ret = 0; @@ -9360,8 +9462,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, - old_ino, + ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) goto out_fail; @@ -9385,7 +9486,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), - &old_dentry->d_name, &rename_ctx); + &old_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9404,7 +9505,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - &new_dentry->d_name); + &new_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9416,7 +9517,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_dentry->d_name, 0, index); + &new_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9451,6 +9552,9 @@ out_notrans: out_whiteout_inode: if (flags & RENAME_WHITEOUT) iput(whiteout_args.inode); +out_fscrypt_names: + fscrypt_free_filename(&old_fname); + fscrypt_free_filename(&new_fname); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7db9612f4d0e..3751b155e361 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -1611,10 +1612,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode; + struct inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; struct timespec64 cur_time; @@ -1623,6 +1623,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 index = 0; u64 objectid; u64 root_flags; + unsigned int nofs_flags; + struct fscrypt_name fname; + struct qstr name; ASSERT(pending->path); path = pending->path; @@ -1630,9 +1633,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; + /* + * We're inside a transaction and must make sure that any potential + * allocations with GFP_KERNEL in fscrypt won't recurse back to + * filesystem. 
+ */ + nofs_flags = memalloc_nofs_save(); + pending->error = fscrypt_setup_filename(parent_inode, + &pending->dentry->d_name, 0, + &fname); + memalloc_nofs_restore(nofs_flags); + if (pending->error) + goto free_pending; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) - goto no_free_objectid; + goto free_fname; /* * Make qgroup to skip current new snapshot's qgroupid, as it is @@ -1661,8 +1678,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - dentry = pending->dentry; - parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) @@ -1678,7 +1693,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - &dentry->d_name, 0); + &name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1773,7 +1788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - &dentry->d_name); + &name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1805,9 +1820,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, &dentry->d_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, - index); + ret = btrfs_insert_dir_item(trans, &name, BTRFS_I(parent_inode), &key, + BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. 
*/ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1816,7 +1830,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + - dentry->d_name.len * 2); + name.len * 2); parent_inode->i_mtime = current_time(parent_inode); parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); @@ -1848,7 +1862,9 @@ dir_item_existed: trans->bytes_reserved = 0; clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); -no_free_objectid: +free_fname: + fscrypt_free_filename(&fname); +free_pending: kfree(new_root_item); pending->root_item = NULL; btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b14ac7017f41..3f3d10a2dd0e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7446,9 +7446,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, if (old_dir && old_dir->logged_trans == trans->transid) { struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; + struct fscrypt_name fname; + struct qstr name; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + ret = fscrypt_setup_filename(&old_dir->vfs_inode, + &old_dentry->d_name, 0, &fname); + if (ret) + goto out; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7468,6 +7475,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + fscrypt_free_filename(&fname); goto out; } @@ -7483,7 +7491,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - &old_dentry->d_name, old_dir_index); + &name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its @@ -7497,6 +7505,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, mutex_unlock(&old_dir->log_mutex); btrfs_free_path(path); + fscrypt_free_filename(&fname); if (ret < 0) goto out; } -- cgit From 6db75318823a169e836a478ca57d6a7c0a156b77 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:27 -0400 Subject: btrfs: use struct fscrypt_str instead of struct qstr While struct qstr is more natural without fscrypt, since it's provided by dentries, struct fscrypt_str is provided by the fscrypt handlers processing dentries, and is thus more natural in the fscrypt world. Replace all of the struct qstr uses with struct fscrypt_str. 
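For context, the two structures differ only slightly; the abridged layouts below (simplified from include/linux/dcache.h and include/linux/fscrypt.h, with several fscrypt_name fields omitted) show why passing the disk name through directly is natural:

	struct qstr {				/* from the dcache */
		u64 hash_len;			/* dentry hash and length, packed */
		const unsigned char *name;
	};

	struct fscrypt_str {			/* what the fscrypt handlers produce */
		unsigned char *name;
		u32 len;
	};

	struct fscrypt_name {
		const struct qstr *usr_fname;	/* name as given by the user */
		struct fscrypt_str disk_name;	/* name as stored on disk */
		/* ... hash and crypto buffer fields omitted ... */
	};

With the btrfs helpers taking struct fscrypt_str, callers can pass &fname.disk_name straight through and drop the intermediate FSTR_TO_QSTR conversions, which accounts for the bulk of the diff below.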
Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 ++++++----- fs/btrfs/dir-item.c | 10 +++--- fs/btrfs/inode-item.c | 14 ++++---- fs/btrfs/inode-item.h | 10 +++--- fs/btrfs/inode.c | 87 ++++++++++++++++++++------------------------------ fs/btrfs/ioctl.c | 4 +-- fs/btrfs/root-tree.c | 4 +-- fs/btrfs/send.c | 4 +-- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 13 ++++---- fs/btrfs/tree-log.c | 42 ++++++++++++------------ fs/btrfs/tree-log.h | 4 +-- 12 files changed, 95 insertions(+), 118 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2016c14488b..7c710893f4c6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1518,10 +1518,10 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1550,23 +1550,23 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct qstr *name, struct btrfs_inode *dir, + const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const struct qstr *name, int mod); + const struct fscrypt_str *name, int mod); struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const struct qstr *name, int mod); + u64 index, const struct fscrypt_str *name, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1647,10 +1647,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct qstr *name, int add_backref, u64 index); + const struct fscrypt_str *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); @@ -1676,7 +1676,6 @@ struct btrfs_new_inode_args { struct posix_acl *default_acl; struct posix_acl *acl; struct fscrypt_name fname; - struct qstr name; }; int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items); diff --git a/fs/btrfs/dir-item.c 
b/fs/btrfs/dir-item.c index 48a15af4ec57..23777021b331 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -106,7 +106,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct qstr *name, struct btrfs_inode *dir, + const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -208,7 +208,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const struct qstr *name, + const struct fscrypt_str *name, int mod) { struct btrfs_key key; @@ -227,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; struct btrfs_key key; @@ -304,7 +304,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const struct qstr *name, int mod) + u64 index, const struct fscrypt_str *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -323,7 +323,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, - u64 dirid, const struct qstr *name) + u64 dirid, const struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key key; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 577e39cfc411..d66bdbae5585 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -15,7 +15,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -81,7 +81,7 @@ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { @@ -104,7 +104,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { @@ -174,7 +174,7 @@ out: } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; @@ -251,7 +251,7 @@ out: */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { @@ -306,7 +306,7 @@ out: /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int 
btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index 3c657c670cfd..b80aeb715701 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -64,10 +64,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -80,15 +80,15 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const struct qstr *name); + const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const struct qstr *name); + const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa4b101bc1eb..7e76d5e91786 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4286,7 +4286,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; @@ -4389,7 +4389,7 @@ out: int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; @@ -4430,12 +4430,10 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); int ret; struct fscrypt_name fname; - struct qstr name; ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4449,7 +4447,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 0); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &name); + &fname.disk_name); if (ret) goto end_trans; @@ -4476,7 +4474,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - struct qstr name; u64 index; int ret; u64 objectid; @@ -4486,7 +4483,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4506,7 +4502,8 @@ static int 
btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, &name, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + &fname.disk_name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -4532,7 +4529,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, &name); + di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4549,7 +4546,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, &name); + &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4562,7 +4559,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name.len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - fname.disk_name.len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4585,7 +4582,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; int ret; @@ -4836,7 +4833,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; - struct qstr name; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4852,7 +4848,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (err) return err; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4875,7 +4870,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* now the directory is empty */ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &name); + &fname.disk_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -5569,7 +5564,6 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - struct qstr name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -5584,12 +5578,10 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, if (ret) goto out; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - /* This needs to handle no-key deletions later on */ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - &name, 0); + &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -5601,7 +5593,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name.name, btrfs_ino(BTRFS_I(dir)), + __func__, fname.disk_name.name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) @@ -5631,14 +5623,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, int ret; int err = 0; struct fscrypt_name fname; - struct qstr name; ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -5660,11 +5649,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || - btrfs_root_ref_name_len(leaf, ref) != name.len) + btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; - ret = memcmp_extent_buffer(leaf, name.name, (unsigned long)(ref + 1), - name.len); + ret = memcmp_extent_buffer(leaf, fname.disk_name.name, + (unsigned long)(ref + 1), fname.disk_name.len); if (ret) goto out; @@ -6297,7 +6286,6 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, &args->fname); if (ret) return ret; - args->name = (struct qstr)FSTR_TO_QSTR(&args->fname.disk_name); } ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); @@ -6380,7 +6368,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, { struct inode *dir = args->dir; struct inode *inode = args->inode; - const struct qstr *name = args->orphan ? NULL : &args->dentry->d_name; + const struct fscrypt_str *name = args->orphan ? 
NULL : &args->fname.disk_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; @@ -6615,7 +6603,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct qstr *name, int add_backref, u64 index) + const struct fscrypt_str *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6771,7 +6759,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct fscrypt_name fname; - struct qstr name; u64 index; int err; int drop_inode = 0; @@ -6787,8 +6774,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6815,7 +6800,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 1, index); + &fname.disk_name, 1, index); if (err) { drop_inode = 1; @@ -9080,7 +9065,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret2; bool need_abort = false; struct fscrypt_name old_fname, new_fname; - struct qstr old_name, new_name; + struct fscrypt_str *old_name, *new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9102,8 +9087,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, return ret; } - old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); - new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); + old_name = &old_fname.disk_name; + new_name = &new_fname.disk_name; /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || @@ -9172,7 +9157,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, + ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) @@ -9185,7 +9170,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, &old_name, new_ino, + ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { @@ -9220,7 +9205,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - &old_name, &old_rename_ctx); + old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9235,7 +9220,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - &new_name, &new_rename_ctx); + new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9245,14 +9230,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_name, 0, old_idx); + new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - &old_name, 0, new_idx); + old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9337,7 +9322,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; - struct qstr old_name, new_name; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9364,12 +9348,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, return ret; } - old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); - new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); - /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_name); - + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get @@ -9462,8 +9442,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, - btrfs_ino(BTRFS_I(new_dir)), index); + ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, + old_ino, btrfs_ino(BTRFS_I(new_dir)), + index); if (ret) goto out_fail; } @@ -9486,7 +9467,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), - &old_name, &rename_ctx); + &old_fname.disk_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9505,7 +9486,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - &new_name); + &new_fname.disk_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9517,7 +9498,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_name, 0, index); + &new_fname.disk_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fde55d0f61b4..2132e3e15986 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -951,7 +951,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; - struct qstr name_str = QSTR_INIT(name, namelen); + struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); @@ -3779,7 +3779,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 848a720747af..6aab98114253 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -331,7 +331,7 @@ out: int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; @@ -403,7 +403,7 @@ out: */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d0feeea2ba1c..0ebca9dba5ab 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1597,7 +1597,7 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { - struct qstr tmp_name; + struct fscrypt_str tmp_name; len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); @@ -1757,7 +1757,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root, struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_path *path; - struct qstr name_str = QSTR_INIT(name, name_len); + struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); if (!path) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 13d33e5707de..33bf211699f8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1423,7 +1423,7 @@ 
static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_key location; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3751b155e361..0d44d50dc3be 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1625,7 +1625,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; unsigned int nofs_flags; struct fscrypt_name fname; - struct qstr name; ASSERT(pending->path); path = pending->path; @@ -1645,7 +1644,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memalloc_nofs_restore(nofs_flags); if (pending->error) goto free_pending; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) @@ -1693,7 +1691,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - &name, 0); + &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1788,7 +1786,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - &name); + &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1820,8 +1818,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, &name, BTRFS_I(parent_inode), &key, - BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, &fname.disk_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); /* We have check then name at the beginning, so it is impossible. 
*/ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1830,7 +1829,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + - name.len * 2); + fname.disk_name.len * 2); parent_inode->i_mtime = current_time(parent_inode); parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3f3d10a2dd0e..7002cc3315da 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -598,7 +598,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, } static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, - struct qstr *name) + struct fscrypt_str *name) { char *buf; @@ -917,7 +917,7 @@ out: static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; @@ -948,7 +948,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct inode *inode; - struct qstr name; + struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; int ret; @@ -989,7 +989,7 @@ out: static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, - struct qstr *name) + struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key location; @@ -1036,7 +1036,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_path *path; int ret; @@ -1072,7 +1072,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, struct qstr *name) + u64 ref_index, struct fscrypt_str *name) { int ret; struct extent_buffer *leaf; @@ -1106,7 +1106,7 @@ again: ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { - struct qstr victim_name; + struct fscrypt_str victim_name; victim_ref = (struct btrfs_inode_ref *)ptr; ret = read_alloc_one_name(leaf, (victim_ref + 1), @@ -1156,7 +1156,7 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - struct qstr victim_name; + struct fscrypt_str victim_name; extref = (struct btrfs_inode_extref *)(base + cur_offset); @@ -1231,7 +1231,7 @@ next: } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - struct qstr *name, u64 *index, + struct fscrypt_str *name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; @@ -1253,7 +1253,7 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - struct qstr *name, u64 *index) + struct fscrypt_str *name, u64 *index) { struct btrfs_inode_ref *ref; int ret; @@ -1305,7 +1305,7 @@ again: ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { - struct qstr name; + struct fscrypt_str name; u64 parent_id; if (key->type == BTRFS_INODE_EXTREF_KEY) { @@ -1373,7 +1373,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned 
long ref_ptr; unsigned long ref_end; - struct qstr name; + struct fscrypt_str name; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1767,7 +1767,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_key *location) { struct inode *inode; @@ -1845,7 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct qstr name; + struct fscrypt_str name; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2125,7 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct qstr name; + struct fscrypt_str name; struct inode *inode = NULL; struct btrfs_key location; @@ -3423,7 +3423,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, - const struct qstr *name, + const struct fscrypt_str *name, u64 index) { struct btrfs_dir_item *di; @@ -3470,7 +3470,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { struct btrfs_path *path; @@ -3509,7 +3509,7 @@ out_unlock: /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; @@ -5244,7 +5244,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u32 this_len; unsigned long name_ptr; struct btrfs_dir_item *di; - struct qstr name_str; + struct fscrypt_str name_str; if (key->type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -7447,7 +7447,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; struct fscrypt_name fname; - struct qstr name; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); @@ -7455,7 +7454,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, &old_dentry->d_name, 0, &fname); if (ret) goto out; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7491,7 +7489,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - &name, old_dir_index); + &fname.disk_name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 8c9a387151b0..85b43075ac58 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -87,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root 
*root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); -- cgit From 94a48aef49f235cc1efc74dc18e7708ca3b8d698 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 20 Oct 2022 12:58:28 -0400 Subject: btrfs: extend btrfs_dir_item type to store encryption status For directories with encrypted files/filenames, we need to store a flag indicating this fact. There's no room in other fields, so we'll need to borrow a bit from dir_type. Since it's now a combination of type and flags, we rename it to dir_flags to reflect its new usage. The new flag, FT_ENCRYPTED, indicates a directory containing encrypted data, which is orthogonal to file type; therefore, add the new flag, and make conversion from directory type to file type strip the flag. As the file types almost never change we can afford to use the bits. Actual usage will be guarded behind an incompat bit, this patch only adds the support for later use by fscrypt. Signed-off-by: Omar Sandoval Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 15 +++++++++++++-- fs/btrfs/delayed-inode.c | 6 +++--- fs/btrfs/delayed-inode.h | 2 +- fs/btrfs/dir-item.c | 7 +++++-- fs/btrfs/inode.c | 13 +++++++------ fs/btrfs/print-tree.c | 4 ++-- fs/btrfs/send.c | 2 +- fs/btrfs/tree-checker.c | 2 +- fs/btrfs/tree-log.c | 20 ++++++++++---------- 9 files changed, 43 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index e8bbd4d80161..cb59b69d2af1 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -484,14 +484,25 @@ BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); +static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb, + const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item)); +} + +static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item)); +} + static inline void btrfs_dir_item_key(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_disk_key *key) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 012c96de4701..f93d2695e423 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1416,7 +1416,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index) { struct btrfs_fs_info *fs_info = 
trans->fs_info; @@ -1447,7 +1447,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_transid(dir_item, trans->transid); btrfs_set_stack_dir_data_len(dir_item, 0); btrfs_set_stack_dir_name_len(dir_item, name_len); - btrfs_set_stack_dir_type(dir_item, type); + btrfs_set_stack_dir_flags(dir_item, flags); memcpy((char *)(dir_item + 1), name, name_len); data_len = delayed_item->data_len + sizeof(struct btrfs_item); @@ -1757,7 +1757,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, name = (char *)(di + 1); name_len = btrfs_stack_dir_name_len(di); - d_type = fs_ftype_to_dtype(di->type); + d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); btrfs_disk_key_to_cpu(&location, &di->location); over = !dir_emit(ctx, name, name_len, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0163ca637a96..4f21daa3dbc7 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -113,7 +113,7 @@ static inline void btrfs_init_delayed_root( int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 23777021b331..ca69fb35a2cc 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -83,7 +83,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, &location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); @@ -140,9 +140,12 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, goto out_free; } + if (IS_ENCRYPTED(&dir->vfs_inode)) + type |= BTRFS_FT_ENCRYPTED; + leaf = path->nodes[0]; btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_flags(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7e76d5e91786..78867a084428 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5597,7 +5597,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, location->objectid, location->type, location->offset); } if (!ret) - *type = btrfs_dir_type(path->nodes[0], di); + *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); btrfs_free_path(path); @@ -6046,6 +6046,7 @@ again: btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; struct extent_buffer *leaf = path->nodes[0]; + u8 ftype; if (found_key.objectid != key.objectid) break; @@ -6069,13 +6070,13 @@ again: goto again; } + ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); entry = addr; - put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); - read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), - name_len); - put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), - &entry->type); + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + put_unaligned(name_len, &entry->name_len); + 
put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); put_unaligned(found_key.offset, &entry->offset); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aab7d30eed55..1a2350fd68be 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -242,9 +242,9 @@ void btrfs_print_leaf(struct extent_buffer *l) case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu type %u\n", + pr_info("\t\tdir oid %llu flags %u\n", found_key.objectid, - btrfs_dir_type(l, di)); + btrfs_dir_flags(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0ebca9dba5ab..2ece3a030a66 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1094,7 +1094,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, data_len = btrfs_dir_data_len(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) { + if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) { ret = -ENAMETOOLONG; goto out; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 11cafc520b47..1c2d418dda6a 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -531,7 +531,7 @@ static int check_dir_item(struct extent_buffer *leaf, } /* dir type check */ - dir_type = btrfs_dir_type(leaf, di); + dir_type = btrfs_dir_ftype(leaf, di); if (unlikely(dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7002cc3315da..a5e56a678af2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1799,7 +1799,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, - u8 log_type, + u8 log_flags, bool exists) { struct btrfs_key found_key; @@ -1809,7 +1809,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) + btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1853,7 +1853,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct btrfs_key search_key; struct inode *dir; - u8 log_type; + u8 log_flags; bool exists; int ret; bool update_size = true; @@ -1867,7 +1867,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (ret) goto out; - log_type = btrfs_dir_type(eb, di); + log_flags = btrfs_dir_flags(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); @@ -1883,8 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, goto out; } else if (dir_dst_di) { ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, log_type, - exists); + dir_dst_di, &log_key, + log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1901,7 +1901,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } else if (index_dst_di) { ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, index_dst_di, 
&log_key, - log_type, exists); + log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -2010,7 +2010,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, * to ever delete the parent directory has it would result in stale * dentries that can never be deleted. */ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { struct btrfs_path *fixup_path; struct btrfs_key di_key; @@ -5452,7 +5452,7 @@ again: } di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); - type = btrfs_dir_type(leaf, di); + type = btrfs_dir_ftype(leaf, di); if (btrfs_dir_transid(leaf, di) < trans->transid) continue; btrfs_dir_item_key_to_cpu(leaf, di, &di_key); @@ -6292,7 +6292,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, continue; } - if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR) + if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; -- cgit From a56159d4080b793ee8dc3d3d315579801ad9096c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:52 -0400 Subject: btrfs: move btrfs_fs_info declarations into fs.h Now that we have a lot of the fs_info related helpers and stuff isolated, copy these over to fs.h out of ctree.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 659 +----------------------------------------------------- fs/btrfs/fs.h | 660 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 661 insertions(+), 658 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7c710893f4c6..ace6f41612e5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -36,6 +36,7 @@ #include "block-rsv.h" #include "locking.h" #include "misc.h" +#include "fs.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -53,14 +54,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -#define BTRFS_OLDEST_GENERATION 0ULL - -#define BTRFS_EMPTY_DIR_SIZE 0 - -#define BTRFS_DIRTY_METADATA_THRESH SZ_32M - -#define BTRFS_MAX_EXTENT_SIZE SZ_128M - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -68,17 +61,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_SUPER_INFO_OFFSET SZ_64K -#define BTRFS_SUPER_INFO_SIZE 4096 -static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); - -/* - * The reserved space at the beginning of each device. - * It covers the primary super block and leaves space for potential use by other - * tools like bootloaders or to lower potential damage of accidental overwrite. 
- */ -#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, @@ -137,645 +119,6 @@ struct btrfs_path { unsigned int nowait:1; }; -struct btrfs_dev_replace { - u64 replace_state; /* see #define above */ - time64_t time_started; /* seconds since 1-Jan-1970 */ - time64_t time_stopped; /* seconds since 1-Jan-1970 */ - atomic64_t num_write_errors; - atomic64_t num_uncorrectable_read_errors; - - u64 cursor_left; - u64 committed_cursor_left; - u64 cursor_left_last_write_of_item; - u64 cursor_right; - - u64 cont_reading_from_srcdev_mode; /* see #define above */ - - int is_valid; - int item_needs_writeback; - struct btrfs_device *srcdev; - struct btrfs_device *tgtdev; - - struct mutex lock_finishing_cancel_unmount; - struct rw_semaphore rwsem; - - struct btrfs_scrub_progress scrub_progress; - - struct percpu_counter bio_counter; - wait_queue_head_t replace_wait; -}; - -/* - * free clusters are used to claim free space in relatively large chunks, - * allowing us to do less seeky writes. They are used for all metadata - * allocations. In ssd_spread mode they are also used for data allocations. - */ -struct btrfs_free_cluster { - spinlock_t lock; - spinlock_t refill_lock; - struct rb_root root; - - /* largest extent in this cluster */ - u64 max_size; - - /* first extent starting offset */ - u64 window_start; - - /* We did a full search and couldn't create a cluster */ - bool fragmented; - - struct btrfs_block_group *block_group; - /* - * when a cluster is allocated from a block group, we put the - * cluster onto a list in the block group so that it can - * be freed before the block group is freed. - */ - struct list_head block_group_list; -}; - -/* Discard control. */ -/* - * Async discard uses multiple lists to differentiate the discard filter - * parameters. Index 0 is for completely free block groups where we need to - * ensure the entire block group is trimmed without being lossy. Indices - * afterwards represent monotonically decreasing discard filter sizes to - * prioritize what should be discarded next. - */ -#define BTRFS_NR_DISCARD_LISTS 3 -#define BTRFS_DISCARD_INDEX_UNUSED 0 -#define BTRFS_DISCARD_INDEX_START 1 - -struct btrfs_discard_ctl { - struct workqueue_struct *discard_workers; - struct delayed_work work; - spinlock_t lock; - struct btrfs_block_group *block_group; - struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; - u64 prev_discard; - u64 prev_discard_time; - atomic_t discardable_extents; - atomic64_t discardable_bytes; - u64 max_discard_size; - u64 delay_ms; - u32 iops_limit; - u32 kbps_limit; - u64 discard_extent_bytes; - u64 discard_bitmap_bytes; - atomic64_t discard_bytes_saved; -}; - -/* - * Exclusive operations (device replace, resize, device add/remove, balance) - */ -enum btrfs_exclusive_operation { - BTRFS_EXCLOP_NONE, - BTRFS_EXCLOP_BALANCE_PAUSED, - BTRFS_EXCLOP_BALANCE, - BTRFS_EXCLOP_DEV_ADD, - BTRFS_EXCLOP_DEV_REMOVE, - BTRFS_EXCLOP_DEV_REPLACE, - BTRFS_EXCLOP_RESIZE, - BTRFS_EXCLOP_SWAP_ACTIVATE, -}; - -/* Store data about transaction commits, exported via sysfs. 
*/ -struct btrfs_commit_stats { - /* Total number of commits */ - u64 commit_count; - /* The maximum commit duration so far in ns */ - u64 max_commit_dur; - /* The last commit duration in ns */ - u64 last_commit_dur; - /* The total commit duration in ns */ - u64 total_commit_dur; -}; - -struct btrfs_fs_info { - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - unsigned long flags; - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *fs_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *data_reloc_root; - struct btrfs_root *block_group_root; - - /* the log root tree is a directory of all the other log roots */ - struct btrfs_root *log_root_tree; - - /* The tree that holds the global roots (csum, extent, etc) */ - rwlock_t global_root_lock; - struct rb_root global_root_tree; - - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; - - /* block group cache stuff */ - rwlock_t block_group_cache_lock; - struct rb_root_cached block_group_cache_tree; - - /* keep track of unallocated space */ - atomic64_t free_chunk_space; - - /* Track ranges which are used by log trees blocks/logged data extents */ - struct extent_io_tree excluded_extents; - - /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; - - /* - * block reservation for extent, checksum, root tree and - * delayed dir index item - */ - struct btrfs_block_rsv global_block_rsv; - /* block reservation for metadata operations */ - struct btrfs_block_rsv trans_block_rsv; - /* block reservation for chunk tree */ - struct btrfs_block_rsv chunk_block_rsv; - /* block reservation for delayed operations */ - struct btrfs_block_rsv delayed_block_rsv; - /* block reservation for delayed refs */ - struct btrfs_block_rsv delayed_refs_rsv; - - struct btrfs_block_rsv empty_block_rsv; - - u64 generation; - u64 last_trans_committed; - /* - * Generation of the last transaction used for block group relocation - * since the filesystem was last mounted (or 0 if none happened yet). - * Must be written and read while holding btrfs_fs_info::commit_root_sem. - */ - u64 last_reloc_trans; - u64 avg_delayed_ref_runtime; - - /* - * this is updated to the current trans every time a full commit - * is required instead of the faster short fsync log commits - */ - u64 last_trans_log_full_commit; - unsigned long mount_opt; - - unsigned long compress_type:4; - unsigned int compress_level; - u32 commit_interval; - /* - * It is a suggestive number, the read side is safe even it gets a - * wrong number because we will write out the data into a regular - * extent. The write side(mount/remount) is under ->s_umount lock, - * so it is also safe. - */ - u64 max_inline; - - struct btrfs_transaction *running_transaction; - wait_queue_head_t transaction_throttle; - wait_queue_head_t transaction_wait; - wait_queue_head_t transaction_blocked_wait; - wait_queue_head_t async_submit_wait; - - /* - * Used to protect the incompat_flags, compat_flags, compat_ro_flags - * when they are updated. - * - * Because we do not clear the flags for ever, so we needn't use - * the lock on the read side. - * - * We also needn't use the lock when we mount the fs, because - * there is no other task which will update the flag. 
- */ - spinlock_t super_lock; - struct btrfs_super_block *super_copy; - struct btrfs_super_block *super_for_commit; - struct super_block *sb; - struct inode *btree_inode; - struct mutex tree_log_mutex; - struct mutex transaction_kthread_mutex; - struct mutex cleaner_mutex; - struct mutex chunk_mutex; - - /* - * this is taken to make sure we don't set block groups ro after - * the free space cache has been allocated on them - */ - struct mutex ro_block_group_mutex; - - /* this is used during read/modify/write to make sure - * no two ios are trying to mod the same stripe at the same - * time - */ - struct btrfs_stripe_hash_table *stripe_hash_table; - - /* - * this protects the ordered operations list only while we are - * processing all of the entries on it. This way we make - * sure the commit code doesn't find the list temporarily empty - * because another function happens to be doing non-waiting preflush - * before jumping into the main commit. - */ - struct mutex ordered_operations_mutex; - - struct rw_semaphore commit_root_sem; - - struct rw_semaphore cleanup_work_sem; - - struct rw_semaphore subvol_sem; - - spinlock_t trans_lock; - /* - * the reloc mutex goes with the trans lock, it is taken - * during commit to protect us from the relocation code - */ - struct mutex reloc_mutex; - - struct list_head trans_list; - struct list_head dead_roots; - struct list_head caching_block_groups; - - spinlock_t delayed_iput_lock; - struct list_head delayed_iputs; - atomic_t nr_delayed_iputs; - wait_queue_head_t delayed_iputs_wait; - - atomic64_t tree_mod_seq; - - /* this protects tree_mod_log and tree_mod_seq_list */ - rwlock_t tree_mod_log_lock; - struct rb_root tree_mod_log; - struct list_head tree_mod_seq_list; - - atomic_t async_delalloc_pages; - - /* - * this is used to protect the following list -- ordered_roots. - */ - spinlock_t ordered_root_lock; - - /* - * all fs/file tree roots in which there are data=ordered extents - * pending writeback are added into this list. - * - * these can span multiple transactions and basically include - * every dirty data page that isn't from nodatacow - */ - struct list_head ordered_roots; - - struct mutex delalloc_root_mutex; - spinlock_t delalloc_root_lock; - /* all fs/file tree roots that have delalloc inodes. */ - struct list_head delalloc_roots; - - /* - * there is a pool of worker threads for checksumming during writes - * and a pool for checksumming after reads. This is because readers - * can run with FS locks held, and the writers may be waiting for - * those locks. We don't want ordering in the pending list to cause - * deadlocks, and so the two are serviced separately. - * - * A third pool does submit_bio to avoid deadlocking with the other - * two - */ - struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; - struct btrfs_workqueue *delalloc_workers; - struct btrfs_workqueue *flush_workers; - struct workqueue_struct *endio_workers; - struct workqueue_struct *endio_meta_workers; - struct workqueue_struct *endio_raid56_workers; - struct workqueue_struct *rmw_workers; - struct workqueue_struct *compressed_write_workers; - struct btrfs_workqueue *endio_write_workers; - struct btrfs_workqueue *endio_freespace_worker; - struct btrfs_workqueue *caching_workers; - - /* - * fixup workers take dirty pages that didn't properly go through - * the cow mechanism and make them safe to write. 
It happens - * for the sys_munmap function call path - */ - struct btrfs_workqueue *fixup_workers; - struct btrfs_workqueue *delayed_workers; - - struct task_struct *transaction_kthread; - struct task_struct *cleaner_kthread; - u32 thread_pool_size; - - struct kobject *space_info_kobj; - struct kobject *qgroups_kobj; - struct kobject *discard_kobj; - - /* used to keep from writing metadata until there is a nice batch */ - struct percpu_counter dirty_metadata_bytes; - struct percpu_counter delalloc_bytes; - struct percpu_counter ordered_bytes; - s32 dirty_metadata_batch; - s32 delalloc_batch; - - struct list_head dirty_cowonly_roots; - - struct btrfs_fs_devices *fs_devices; - - /* - * The space_info list is effectively read only after initial - * setup. It is populated at mount time and cleaned up after - * all block groups are removed. RCU is used to protect it. - */ - struct list_head space_info; - - struct btrfs_space_info *data_sinfo; - - struct reloc_control *reloc_ctl; - - /* data_alloc_cluster is only used in ssd_spread mode */ - struct btrfs_free_cluster data_alloc_cluster; - - /* all metadata allocations go through this cluster */ - struct btrfs_free_cluster meta_alloc_cluster; - - /* auto defrag inodes go here */ - spinlock_t defrag_inodes_lock; - struct rb_root defrag_inodes; - atomic_t defrag_running; - - /* Used to protect avail_{data, metadata, system}_alloc_bits */ - seqlock_t profiles_lock; - /* - * these three are in extended format (availability of single - * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other - * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) - */ - u64 avail_data_alloc_bits; - u64 avail_metadata_alloc_bits; - u64 avail_system_alloc_bits; - - /* restriper state */ - spinlock_t balance_lock; - struct mutex balance_mutex; - atomic_t balance_pause_req; - atomic_t balance_cancel_req; - struct btrfs_balance_control *balance_ctl; - wait_queue_head_t balance_wait_q; - - /* Cancellation requests for chunk relocation */ - atomic_t reloc_cancel_req; - - u32 data_chunk_allocations; - u32 metadata_ratio; - - void *bdev_holder; - - /* private scrub information */ - struct mutex scrub_lock; - atomic_t scrubs_running; - atomic_t scrub_pause_req; - atomic_t scrubs_paused; - atomic_t scrub_cancel_req; - wait_queue_head_t scrub_pause_wait; - /* - * The worker pointers are NULL iff the refcount is 0, ie. scrub is not - * running. - */ - refcount_t scrub_workers_refcnt; - struct workqueue_struct *scrub_workers; - struct workqueue_struct *scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity_workers; - struct btrfs_subpage_info *subpage_info; - - struct btrfs_discard_ctl discard_ctl; - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif - /* is qgroup tracking in a consistent state? */ - u64 qgroup_flags; - - /* holds configuration and tracking. Protected by qgroup_lock */ - struct rb_root qgroup_tree; - spinlock_t qgroup_lock; - - /* - * used to avoid frequently calling ulist_alloc()/ulist_free() - * when doing qgroup accounting, it must be protected by qgroup_lock. - */ - struct ulist *qgroup_ulist; - - /* - * Protect user change for quota operations. If a transaction is needed, - * it must be started before locking this lock. 
- */ - struct mutex qgroup_ioctl_lock; - - /* list of dirty qgroups to be written at next commit */ - struct list_head dirty_qgroups; - - /* used by qgroup for an efficient tree traversal */ - u64 qgroup_seq; - - /* qgroup rescan items */ - struct mutex qgroup_rescan_lock; /* protects the progress item */ - struct btrfs_key qgroup_rescan_progress; - struct btrfs_workqueue *qgroup_rescan_workers; - struct completion qgroup_rescan_completion; - struct btrfs_work qgroup_rescan_work; - bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ - u8 qgroup_drop_subtree_thres; - - /* filesystem state */ - unsigned long fs_state; - - struct btrfs_delayed_root *delayed_root; - - /* Extent buffer radix tree */ - spinlock_t buffer_lock; - /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; - - /* next backup root to be overwritten */ - int backup_root_index; - - /* device replace state */ - struct btrfs_dev_replace dev_replace; - - struct semaphore uuid_tree_rescan_sem; - - /* Used to reclaim the metadata space in the background. */ - struct work_struct async_reclaim_work; - struct work_struct async_data_reclaim_work; - struct work_struct preempt_reclaim_work; - - /* Reclaim partially filled block groups in the background */ - struct work_struct reclaim_bgs_work; - struct list_head reclaim_bgs; - int bg_reclaim_threshold; - - spinlock_t unused_bgs_lock; - struct list_head unused_bgs; - struct mutex unused_bg_unpin_mutex; - /* Protect block groups that are going to be deleted */ - struct mutex reclaim_bgs_lock; - - /* Cached block sizes */ - u32 nodesize; - u32 sectorsize; - /* ilog2 of sectorsize, use to avoid 64bit division */ - u32 sectorsize_bits; - u32 csum_size; - u32 csums_per_leaf; - u32 stripesize; - - /* - * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular - * filesystem, on zoned it depends on the device constraints. - */ - u64 max_extent_size; - - /* Block groups and devices containing active swapfiles. */ - spinlock_t swapfile_pins_lock; - struct rb_root swapfile_pins; - - struct crypto_shash *csum_shash; - - /* Type of exclusive operation running, protected by super_lock */ - enum btrfs_exclusive_operation exclusive_operation; - - /* - * Zone size > 0 when in ZONED mode, otherwise it's used for a check - * if the mode is enabled - */ - u64 zone_size; - - /* Max size to emit ZONE_APPEND write command */ - u64 max_zone_append_size; - struct mutex zoned_meta_io_lock; - spinlock_t treelog_bg_lock; - u64 treelog_bg; - - /* - * Start of the dedicated data relocation block group, protected by - * relocation_bg_lock. - */ - spinlock_t relocation_bg_lock; - u64 data_reloc_bg; - struct mutex zoned_data_reloc_io_lock; - - u64 nr_global_roots; - - spinlock_t zone_active_bgs_lock; - struct list_head zone_active_bgs; - - /* Updates are not protected by any lock */ - struct btrfs_commit_stats commit_stats; - - /* - * Last generation where we dropped a non-relocation root. - * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() - * to change it and to read it, respectively. - */ - u64 last_root_drop_gen; - - /* - * Annotations for transaction events (structures are empty when - * compiled without lockdep). 
- */ - struct lockdep_map btrfs_trans_num_writers_map; - struct lockdep_map btrfs_trans_num_extwriters_map; - struct lockdep_map btrfs_state_change_map[4]; - struct lockdep_map btrfs_trans_pending_ordered_map; - struct lockdep_map btrfs_ordered_extent_map; - -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - spinlock_t ref_verify_lock; - struct rb_root block_tree; -#endif - -#ifdef CONFIG_BTRFS_DEBUG - struct kobject *debug_kobj; - struct list_head allocated_roots; - - spinlock_t eb_leak_lock; - struct list_head allocated_ebs; -#endif -}; - -static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, - u64 gen) -{ - WRITE_ONCE(fs_info->last_root_drop_gen, gen); -} - -static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) -{ - return READ_ONCE(fs_info->last_root_drop_gen); -} - -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* - * Take the number of bytes to be checksummed and figure out how many leaves - * it would require to store the csums for that many bytes. - */ -static inline u64 btrfs_csum_bytes_to_leaves( - const struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; - - return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); -} - -/* - * Use this if we would be adding new items, as we could split nodes as we cow - * down the tree. - */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; -} - -/* - * Doing a truncate or a modification won't result in new nodes or leaves, just - * what we need for COW. - */ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; -} - -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ - sizeof(struct btrfs_item)) - -static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) -{ - return fs_info->zone_size > 0; -} - -/* - * Count how many fs_info->max_extent_size cover the @size - */ -static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) -{ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (!fs_info) - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -#endif - - return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); -} - -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op); - /* * The state of btrfs root */ diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index af356feec0c7..d8c931014c2a 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,24 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#define BTRFS_MAX_EXTENT_SIZE SZ_128M + +#define BTRFS_OLDEST_GENERATION 0ULL + +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M + +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); + +/* + * The reserved space at the beginning of each device. 
It covers the primary + * super block and leaves space for potential use by other tools like + * bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) /* * Runtime (in-memory) states of filesystem */ @@ -200,6 +218,648 @@ enum { #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) +struct btrfs_dev_replace { + /* See #define above */ + u64 replace_state; + /* Seconds since 1-Jan-1970 */ + time64_t time_started; + /* Seconds since 1-Jan-1970 */ + time64_t time_stopped; + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + /* See #define above */ + u64 cont_reading_from_srcdev_mode; + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + struct mutex lock_finishing_cancel_unmount; + struct rw_semaphore rwsem; + + struct btrfs_scrub_progress scrub_progress; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; +}; + +/* + * Free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations. In ssd_spread mode they are also used for data allocations. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* Largest extent in this cluster */ + u64 max_size; + + /* First extent starting offset */ + u64 window_start; + + /* We did a full search and couldn't create a cluster */ + bool fragmented; + + struct btrfs_block_group *block_group; + /* + * When a cluster is allocated from a block group, we put the cluster + * onto a list in the block group so that it can be freed before the + * block group is freed. + */ + struct list_head block_group_list; +}; + +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. Index 0 is for completely free block groups where we need to + * ensure the entire block group is trimmed without being lossy. Indices + * afterwards represent monotonically decreasing discard filter sizes to + * prioritize what should be discarded next. + */ +#define BTRFS_NR_DISCARD_LISTS 3 +#define BTRFS_DISCARD_INDEX_UNUSED 0 +#define BTRFS_DISCARD_INDEX_START 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; + u64 prev_discard; + u64 prev_discard_time; + atomic_t discardable_extents; + atomic64_t discardable_bytes; + u64 max_discard_size; + u64 delay_ms; + u32 iops_limit; + u32 kbps_limit; + u64 discard_extent_bytes; + u64 discard_bitmap_bytes; + atomic64_t discard_bytes_saved; +}; + +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + +/* Store data about transaction commits, exported via sysfs. 
*/ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + +struct btrfs_fs_info { + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + unsigned long flags; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; + struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; + + /* The log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* Block group cache stuff */ + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; + + /* Keep track of unallocated space */ + atomic64_t free_chunk_space; + + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; + + /* logical->physical extent mapping */ + struct extent_map_tree mapping_tree; + + /* + * Block reservation for extent, checksum, root tree and delayed dir + * index item. + */ + struct btrfs_block_rsv global_block_rsv; + /* Block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* Block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; + /* Block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + u64 generation; + u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; + u64 avg_delayed_ref_runtime; + + /* + * This is updated to the current trans every time a full commit is + * required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt; + + unsigned long compress_type:4; + unsigned int compress_level; + u32 commit_interval; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ + u64 max_inline; + + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. 
+ */ + spinlock_t super_lock; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; + struct super_block *sb; + struct inode *btree_inode; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + + /* + * This is taken to make sure we don't set block groups ro after the + * free space cache has been allocated on them. + */ + struct mutex ro_block_group_mutex; + + /* + * This is used during read/modify/write to make sure no two ios are + * trying to mod the same stripe at the same time. + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + + /* + * This protects the ordered operations list only while we are + * processing all of the entries on it. This way we make sure the + * commit code doesn't find the list temporarily empty because another + * function happens to be doing non-waiting preflush before jumping + * into the main commit. + */ + struct mutex ordered_operations_mutex; + + struct rw_semaphore commit_root_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + + spinlock_t trans_lock; + /* + * The reloc mutex goes with the trans lock, it is taken during commit + * to protect us from the relocation code. + */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; + + atomic64_t tree_mod_seq; + + /* This protects tree_mod_log and tree_mod_seq_list */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; + + atomic_t async_delalloc_pages; + + /* This is used to protect the following list -- ordered_roots. */ + spinlock_t ordered_root_lock; + + /* + * All fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * + * These can span multiple transactions and basically include every + * dirty data page that isn't from nodatacow. + */ + struct list_head ordered_roots; + + struct mutex delalloc_root_mutex; + spinlock_t delalloc_root_lock; + /* All fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; + + /* + * There is a pool of worker threads for checksumming during writes and + * a pool for checksumming after reads. This is because readers can + * run with FS locks held, and the writers may be waiting for those + * locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other two. + */ + struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *endio_raid56_workers; + struct workqueue_struct *rmw_workers; + struct workqueue_struct *compressed_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *caching_workers; + + /* + * Fixup workers take dirty pages that didn't properly go through the + * cow mechanism and make them safe to write. It happens for the + * sys_munmap function call path. 
+ */ + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + u32 thread_pool_size; + + struct kobject *space_info_kobj; + struct kobject *qgroups_kobj; + struct kobject *discard_kobj; + + /* Used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + struct percpu_counter ordered_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * The space_info list is effectively read only after initial setup. + * It is populated at mount time and cleaned up after all block groups + * are removed. RCU is used to protect it. + */ + struct list_head space_info; + + struct btrfs_space_info *data_sinfo; + + struct reloc_control *reloc_ctl; + + /* data_alloc_cluster is only used in ssd_spread mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* All metadata allocations go through this cluster. */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* Auto defrag inodes go here. */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; + /* + * These three are in extended format (availability of single chunks is + * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted + * by corresponding BTRFS_BLOCK_GROUP_* bits) + */ + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + + /* Balance state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; + + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + + u32 data_chunk_allocations; + u32 metadata_ratio; + + void *bdev_holder; + + /* Private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + /* + * The worker pointers are NULL iff the refcount is 0, ie. scrub is not + * running. + */ + refcount_t scrub_workers_refcnt; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; + + struct btrfs_discard_ctl discard_ctl; + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* Is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* Holds configuration and tracking. Protected by qgroup_lock. */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* + * Used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* + * Protect user change for quota operations. If a transaction is needed, + * it must be started before locking this lock. + */ + struct mutex qgroup_ioctl_lock; + + /* List of dirty qgroups to be written at next commit. */ + struct list_head dirty_qgroups; + + /* Used by qgroup for an efficient tree traversal. */ + u64 qgroup_seq; + + /* Qgroup rescan items. 
*/ + /* Protects the progress item */ + struct mutex qgroup_rescan_lock; + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workqueue *qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + /* Protected by qgroup_rescan_lock */ + bool qgroup_rescan_running; + u8 qgroup_drop_subtree_thres; + + /* Filesystem state */ + unsigned long fs_state; + + struct btrfs_delayed_root *delayed_root; + + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + /* Entries are eb->start / sectorsize */ + struct radix_tree_root buffer_radix; + + /* Next backup root to be overwritten */ + int backup_root_index; + + /* Device replace state */ + struct btrfs_dev_replace dev_replace; + + struct semaphore uuid_tree_rescan_sem; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; + + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; + + /* Cached block sizes */ + u32 nodesize; + u32 sectorsize; + /* ilog2 of sectorsize, use to avoid 64bit division */ + u32 sectorsize_bits; + u32 csum_size; + u32 csums_per_leaf; + u32 stripesize; + + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + + struct crypto_shash *csum_shash; + + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; + + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + u64 zone_size; + + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; + + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + u64 nr_global_roots; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; + + /* + * Last generation where we dropped a non-relocation root. + * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() + * to change it and to read it, respectively. + */ + u64 last_root_drop_gen; + + /* + * Annotations for transaction events (structures are empty when + * compiled without lockdep). 
+ */ + struct lockdep_map btrfs_trans_num_writers_map; + struct lockdep_map btrfs_trans_num_extwriters_map; + struct lockdep_map btrfs_state_change_map[4]; + struct lockdep_map btrfs_trans_pending_ordered_map; + struct lockdep_map btrfs_ordered_extent_map; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif + +#ifdef CONFIG_BTRFS_DEBUG + struct kobject *debug_kobj; + struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; +#endif +}; + +static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, + u64 gen) +{ + WRITE_ONCE(fs_info->last_root_drop_gen, gen); +} + +static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_root_drop_gen); +} + +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * Take the number of bytes to be checksummed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} + +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +} + +/* + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. + */ +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ + sizeof(struct btrfs_item)) + +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); -- cgit From eb33a4d65b2adae52530d6c3c7a9f58bf734a750 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:53 -0400 Subject: btrfs: move the lockdep helpers into locking.h These more naturally fit in with the locking related code, and they're all defines so they can easily go anywhere, move them out of ctree.h into locking.h Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 76 ------------------------------------------------------ 
fs/btrfs/locking.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ace6f41612e5..970a22a04c94 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -181,82 +181,6 @@ enum { BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; -enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, - BTRFS_LOCKDEP_TRANS_UNBLOCKED, - BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, - BTRFS_LOCKDEP_TRANS_COMPLETED, -}; - -/* - * Lockdep annotation for wait events. - * - * @owner: The struct where the lockdep map is defined - * @lock: The lockdep map corresponding to a wait event - * - * This macro is used to annotate a wait event. In this case a thread acquires - * the lockdep map as writer (exclusive lock) because it has to block until all - * the threads that hold the lock as readers signal the condition for the wait - * event and release their locks. - */ -#define btrfs_might_wait_for_event(owner, lock) \ - do { \ - rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ - rwsem_release(&owner->lock##_map, _THIS_IP_); \ - } while (0) - -/* - * Protection for the resource/condition of a wait event. - * - * @owner: The struct where the lockdep map is defined - * @lock: The lockdep map corresponding to a wait event - * - * Many threads can modify the condition for the wait event at the same time - * and signal the threads that block on the wait event. The threads that modify - * the condition and do the signaling acquire the lock as readers (shared - * lock). - */ -#define btrfs_lockdep_acquire(owner, lock) \ - rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) - -/* - * Used after signaling the condition for a wait event to release the lockdep - * map held by a reader thread. - */ -#define btrfs_lockdep_release(owner, lock) \ - rwsem_release(&owner->lock##_map, _THIS_IP_) - -/* - * Macros for the transaction states wait events, similar to the generic wait - * event macros. - */ -#define btrfs_might_wait_for_state(owner, i) \ - do { \ - rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ - rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ - } while (0) - -#define btrfs_trans_state_lockdep_acquire(owner, i) \ - rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) - -#define btrfs_trans_state_lockdep_release(owner, i) \ - rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) - -/* Initialization of the lockdep map */ -#define btrfs_lockdep_init_map(owner, lock) \ - do { \ - static struct lock_class_key lock##_key; \ - lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ - } while (0) - -/* Initialization of the transaction states lockdep maps. */ -#define btrfs_state_lockdep_init_map(owner, lock, state) \ - do { \ - static struct lock_class_key lock##_key; \ - lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ - &lock##_key, 0); \ - } while (0) - /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 490c7a79e995..11c2269b4b6f 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -78,6 +78,82 @@ enum btrfs_lock_nesting { BTRFS_NESTING_MAX, }; +enum btrfs_lockdep_trans_states { + BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_UNBLOCKED, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, + BTRFS_LOCKDEP_TRANS_COMPLETED, +}; + +/* + * Lockdep annotation for wait events. 
+ * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * This macro is used to annotate a wait event. In this case a thread acquires + * the lockdep map as writer (exclusive lock) because it has to block until all + * the threads that hold the lock as readers signal the condition for the wait + * event and release their locks. + */ +#define btrfs_might_wait_for_event(owner, lock) \ + do { \ + rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ + rwsem_release(&owner->lock##_map, _THIS_IP_); \ + } while (0) + +/* + * Protection for the resource/condition of a wait event. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * Many threads can modify the condition for the wait event at the same time + * and signal the threads that block on the wait event. The threads that modify + * the condition and do the signaling acquire the lock as readers (shared + * lock). + */ +#define btrfs_lockdep_acquire(owner, lock) \ + rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) + +/* + * Used after signaling the condition for a wait event to release the lockdep + * map held by a reader thread. + */ +#define btrfs_lockdep_release(owner, lock) \ + rwsem_release(&owner->lock##_map, _THIS_IP_) + +/* + * Macros for the transaction states wait events, similar to the generic wait + * event macros. + */ +#define btrfs_might_wait_for_state(owner, i) \ + do { \ + rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ + } while (0) + +#define btrfs_trans_state_lockdep_acquire(owner, i) \ + rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) + +#define btrfs_trans_state_lockdep_release(owner, i) \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) + +/* Initialization of the lockdep map */ +#define btrfs_lockdep_init_map(owner, lock) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ + } while (0) + +/* Initialization of the transaction states lockdep maps. */ +#define btrfs_state_lockdep_init_map(owner, lock, state) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ + &lock##_key, 0); \ + } while (0) + static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -- cgit From 13d925c1c26916c017b5339625c66bd20df9fd96 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:54 -0400 Subject: btrfs: minor whitespace in ctree.h We've accumulated some whitespace problems in ctree.h, clean these up. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 970a22a04c94..bf94531f4486 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -435,10 +435,8 @@ struct btrfs_file_private { void *filldir_buf; }; - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return info->nodesize - sizeof(struct btrfs_header); } -- cgit From 8483d40242c56b0b225238efed1f73ef6985c5af Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:55 -0400 Subject: btrfs: remove extra space info prototypes in ctree.h These are defined already in space-info.h, remove them from ctree.h. 
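[ Note: the wait event annotation macros above are easiest to read next to a concrete caller. The sketch below uses invented names (struct foo_info, foo_event, foo_writers); only the btrfs_lockdep_* macros come from the patch above, and the token-pasted member must be named <lock>_map for them to resolve. ]

struct foo_info {
	struct lockdep_map foo_event_map;	/* consumed as "foo_event" */
	wait_queue_head_t foo_wait;
	atomic_t foo_writers;
};

static void foo_init(struct foo_info *fi)
{
	/* One-time setup, binding a static lock class to the map. */
	btrfs_lockdep_init_map(fi, foo_event);
}

static void foo_writer_start(struct foo_info *fi)
{
	/* Threads that will modify the wait condition take it shared. */
	btrfs_lockdep_acquire(fi, foo_event);
	atomic_inc(&fi->foo_writers);
}

static void foo_writer_end(struct foo_info *fi)
{
	if (atomic_dec_and_test(&fi->foo_writers))
		wake_up(&fi->foo_wait);
	/* Released only after the condition has been signaled. */
	btrfs_lockdep_release(fi, foo_event);
}

static void foo_wait_for_writers(struct foo_info *fi)
{
	/*
	 * The waiter records an exclusive acquire/release pair, so lockdep
	 * can flag deadlocks against other locks held while waiting.
	 */
	btrfs_might_wait_for_event(fi, foo_event);
	wait_event(fi->foo_wait, atomic_read(&fi->foo_writers) == 0);
}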
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bf94531f4486..a430577cf8b7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -567,8 +567,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -585,7 +583,6 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); -int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_start_write_no_snapshotting(struct btrfs_root *root); -- cgit From e2f13b343c147bc34691301c30d8a8b35d0a2e5c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:56 -0400 Subject: btrfs: move btrfs_account_ro_block_groups_free_space into space-info.c This was prototyped in ctree.h and the code existed in extent-tree.c, but it's space-info related so move it into space-info.c. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/extent-tree.c | 34 ---------------------------------- fs/btrfs/space-info.c | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 1 + 4 files changed, 35 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a430577cf8b7..07f876961da1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -576,7 +576,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, bool noflush); -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8d28ba360e9a..b97c99a8dd4c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5975,40 +5975,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. 
- */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - struct btrfs_block_group *block_group; - u64 free_bytes = 0; - int factor; - - /* It's df, we don't care if it's racy */ - if (list_empty(&sinfo->ro_bgs)) - return 0; - - spin_lock(&sinfo->lock); - list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { - spin_lock(&block_group->lock); - - if (!block_group->ro) { - spin_unlock(&block_group->lock); - continue; - } - - factor = btrfs_bg_type_to_factor(block_group->flags); - free_bytes += (block_group->length - - block_group->used) * factor; - - spin_unlock(&block_group->lock); - } - spin_unlock(&sinfo->lock); - - return free_bytes; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 94d73485454a..45404798ee8c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1814,3 +1814,37 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) } dump_global_block_rsv(fs_info); } + +/* + * Account the unused space of all the readonly block group in the space_info. + * takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + struct btrfs_block_group *block_group; + u64 free_bytes = 0; + int factor; + + /* It's df, we don't care if it's racy */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + factor = btrfs_bg_type_to_factor(block_group->flags); + free_bytes += (block_group->length - + block_group->used) * factor; + + spin_unlock(&block_group->lock); + } + spin_unlock(&sinfo->lock); + + return free_bytes; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index f28bd2c051d4..fc99ea2b0c34 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -236,5 +236,6 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); #endif /* BTRFS_SPACE_INFO_H */ -- cgit From a0231804affe78d27264811559ee31bd341c2bff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:57 -0400 Subject: btrfs: move extent-tree helpers into their own header file Move all the extent tree related prototypes to extent-tree.h out of ctree.h, and then go include it everywhere needed so everything compiles. 
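[ Note: the reservation helpers quoted from ctree.h at the top of this section are terse; a worked example with illustrative numbers (16 KiB nodesize, 4 KiB sectors, BTRFS_MAX_LEVEL of 8) may help. csums_per_leaf is taken as precomputed, since its derivation is outside this excerpt. ]

/*
 * btrfs_calc_insert_metadata_size(fs_info, 1):
 *   nodesize * BTRFS_MAX_LEVEL * 2 * num_items
 *   = 16384 * 8 * 2 * 1 = 256 KiB
 *   enough to COW one block per level of a maximal-height path, doubled
 *   because an insert can also split each node on the way down.
 *
 * btrfs_calc_metadata_size(fs_info, 1):
 *   = 16384 * 8 * 1 = 128 KiB, no split factor since a truncate or
 *   modification only COWs the existing path.
 *
 * btrfs_csum_bytes_to_leaves(fs_info, 1 MiB):
 *   num_csums = 1048576 >> sectorsize_bits = 1048576 >> 12 = 256
 *   leaves    = DIV_ROUND_UP_ULL(256, csums_per_leaf)
 *   so any csums_per_leaf >= 256 keeps this to a single leaf.
 *
 * count_max_extents(fs_info, 300 MiB) with max_extent_size = 128 MiB:
 *   div_u64(300 MiB + 128 MiB - 1, 128 MiB) = 3, i.e. a ceiling division.
 */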
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 72 ---------------------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent-tree.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + 17 files changed, 87 insertions(+), 72 deletions(-) create mode 100644 fs/btrfs/extent-tree.h (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f556566b9c79..173df40da984 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -17,6 +17,7 @@ #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index d1f8d792b184..5a743c4d4e97 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -19,6 +19,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1283ca46c5bc..f0ddb44704ee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -20,6 +20,7 @@ #include "tree-checker.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 07f876961da1..8ca67227dec0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -495,78 +495,6 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -/* extent-tree.c */ - -enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID, - BTRFS_REF_TYPE_BLOCK, - BTRFS_REF_TYPE_DATA, - BTRFS_REF_TYPE_ANY, -}; - -int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, - enum btrfs_inline_ref_type is_data); -u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); - - -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group *cache); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); -int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, - int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); -int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, - struct btrfs_path *path); -struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 
parent, u64 root_objectid, - const struct btrfs_disk_key *key, - int level, u64 hint, - u64 empty_size, - enum btrfs_lock_nesting nest); -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref); -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 owner, - u64 offset, u64 ram_bytes, - struct btrfs_key *ins); -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, - u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc); -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); - -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len); -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_ref *generic_ref); - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 75475a213972..005da95c9c17 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,7 @@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b97c99a8dd4c..d2d5de946242 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -38,6 +38,7 @@ #include "dev-replace.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..b3674b008d58 --- /dev/null +++ b/fs/btrfs/extent-tree.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_TREE_H +#define BTRFS_EXTENT_TREE_H + +enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, +}; + +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); + +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes); +void btrfs_free_excluded_extents(struct btrfs_block_group *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 
offset, int metadata, u64 *refs, u64 *flags); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct extent_buffer *eb); +int btrfs_cross_ref_exist(struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size, + enum btrfs_lock_nesting nest); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 owner, + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct extent_buffer *eb, u64 flags, int level); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); + +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); + +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 62360a554420..bad676d2e23f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -32,6 +32,7 @@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index fc79d21e7b4f..a652e5dc465b 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -14,6 +14,7 @@ #include "block-group.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index d66bdbae5585..724507ce7a7d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -12,6 +12,7 @@ #include "print-tree.h" #include "space-info.h" #include "accessors.h" +#include "extent-tree.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78867a084428..cb2a35b19c75 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -57,6 +57,7 @@ #include "inode-item.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2132e3e15986..1d4bdb22881c 100644 --- a/fs/btrfs/ioctl.c 
+++ b/fs/btrfs/ioctl.c @@ -52,6 +52,7 @@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4786b59035a8..88b248b22810 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -26,6 +26,7 @@ #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 77d03dce2521..946190f13138 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -30,6 +30,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * Relocation overview diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 45404798ee8c..e5f9f43ffa4c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -12,6 +12,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0d44d50dc3be..a04c2a9708e2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -26,6 +26,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a5e56a678af2..bd9a5eda9def 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -23,6 +23,7 @@ #include "inode-item.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #define MAX_CONFLICT_INODES 10 -- cgit From 2839c2c142dd4baae146b7d3e42ead8fe5d36014 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:58 -0400 Subject: btrfs: move delalloc space related prototypes to delalloc-space.h These exist in delalloc-space.c, move them from ctree.h into delalloc-space.h. 
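[ Note: btrfs_account_ro_block_groups_free_space(), moved a few patches above, multiplies unused logical space by a mirror factor, which is easy to misread. Assuming btrfs_bg_type_to_factor() returns the copy count of the profile (2 for RAID1/DUP, 1 for SINGLE; its exact table is outside this excerpt), the accounting works out as below. ]

/*
 * Hypothetical read-only block groups:
 *
 *   RAID1:  length = 1 GiB, used = 256 MiB, factor = 2
 *           contributes (1024 MiB - 256 MiB) * 2 = 1536 MiB
 *   SINGLE: length = 1 GiB, used = 256 MiB, factor = 1
 *           contributes 768 MiB
 *
 * The factor converts logical free space back into raw device bytes,
 * which is what df-style statfs reporting wants -- hence the
 * "It's df, we don't care if it's racy" comment.
 */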
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/delalloc-space.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8ca67227dec0..afb2bb3acc76 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -500,10 +500,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, - u64 disk_num_bytes, bool noflush); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index e07d46043455..c5d573f2366e 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -20,5 +20,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); #endif /* BTRFS_DELALLOC_SPACE_H */ -- cgit From 6d2049a2f36f873b8f793fde7ea1874d0d17a3dc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:59 -0400 Subject: btrfs: delete unused function prototypes in ctree.h This batch of prototypes no longer have code associated with them, so remove them. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index afb2bb3acc76..acae38e378a6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -507,12 +507,6 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_start_write_no_snapshotting(struct btrfs_root *root); -void btrfs_end_write_no_snapshotting(struct btrfs_root *root); -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); - /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -- cgit From 45c40c8f9541f336c7857f09ea843fc8fa980b65 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:47:00 -0400 Subject: btrfs: move root tree prototypes to their own header Move all the root-tree.c prototypes to root-tree.h, and then update all the necessary files to include the new header. 
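[ Note: the delalloc prototypes relocated above are used as a bracketed pair by the write paths. A minimal sketch of that calling pattern follows; foo_do_write() is invented, the error handling is simplified, and the real callers in fs/btrfs/file.c carry much more state. ]

static int foo_prepare_write(struct btrfs_inode *inode, u64 start, u64 len)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	/* Reserves data space plus metadata (outstanding extents, csums). */
	ret = btrfs_delalloc_reserve_space(inode, &reserved, start, len);
	if (ret < 0)
		return ret;		/* -ENOSPC or -EDQUOT */

	ret = foo_do_write(inode, start, len);	/* hypothetical */

	/*
	 * Drop the outstanding_extents bump taken at reserve time; once the
	 * range is marked delalloc the normal machinery owns the accounting.
	 */
	btrfs_delalloc_release_extents(inode, len);

	/* On failure nothing was consumed, so give the reservation back. */
	if (ret < 0)
		btrfs_delalloc_release_space(inode, reserved, start, len, true);

	extent_changeset_free(reserved);
	return ret;
}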
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 32 -------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/root-tree.h | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + 12 files changed, 44 insertions(+), 32 deletions(-) create mode 100644 fs/btrfs/root-tree.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index acae38e378a6..77f1f345d040 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -495,12 +495,6 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -695,32 +689,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *node, struct extent_buffer *parent); -/* root-item.c */ -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, - const struct fscrypt_str *name); -int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, - const struct fscrypt_str *name); -int btrfs_del_root(struct btrfs_trans_handle *trans, - const struct btrfs_key *key); -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct btrfs_key *key, - struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); -int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, - struct btrfs_path *path, struct btrfs_root_item *root_item, - struct btrfs_key *root_key); -int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); -void btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); -void btrfs_check_and_init_root_item(struct btrfs_root_item *item); -void btrfs_update_root_times(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - /* uuid-tree.c */ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 005da95c9c17..ebc9baab0e47 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d2d5de946242..510be00da406 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -39,6 +39,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a652e5dc465b..869d062d6765 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -15,6 +15,7 @@ #include "fs.h" #include "accessors.h" #include 
"extent-tree.h" +#include "root-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb2a35b19c75..ec9f0f4afd12 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,6 +58,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1d4bdb22881c..52a268e05d4a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -53,6 +53,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 88b248b22810..75dd964ca46c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -27,6 +27,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 946190f13138..e345cc71da15 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -31,6 +31,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" /* * Relocation overview diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 6aab98114253..42f046e5e25f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -14,6 +14,7 @@ #include "qgroup.h" #include "space-info.h" #include "accessors.h" +#include "root-tree.h" /* * Read a root item from the tree. In case we detect a root item smaller then diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h new file mode 100644 index 000000000000..cbbaca32126e --- /dev/null +++ b/fs/btrfs/root-tree.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ROOT_TREE_H +#define BTRFS_ROOT_TREE_H + +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, bool use_global_rsv); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 *sequence, + const struct fscrypt_str *name); +int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_root_item *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a04c2a9708e2..82b2e2ec90cf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -27,6 +27,7 @@ #include "fs.h" #include "accessors.h" #include 
"extent-tree.h" +#include "root-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bd9a5eda9def..f185cd286243 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -24,6 +24,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #define MAX_CONFLICT_INODES 10 -- cgit From 911bd75aca7390ca9e39191450829b09772c5a77 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 16:16:14 -0400 Subject: btrfs: remove unused function prototypes I wrote the following coccinelle script to find function declarations that didn't have the corresponding code for them @funcproto@ identifier func; type T; position p0; @@ T func@p0(...); @funccode@ identifier funcproto.func; position p1; @@ func@p1(...) { ... } @script:python depends on !funccode@ p0 << funcproto.p0; @@ print("Proto with no function at %s:%s" % (p0[0].file, p0[0].line)) and ran it against btrfs, which identified the 4 function prototypes I've removed in this patch. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/disk-io.h | 4 ---- 2 files changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 77f1f345d040..09005bd9bfef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -739,7 +739,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* file-item.c */ int btrfs_del_csums(struct btrfs_trans_handle *trans, @@ -924,8 +923,6 @@ int __pure btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); -void btrfs_get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index da040ec327ae..6edc66b4b4d3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -116,8 +116,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start); -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -130,8 +128,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); -int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); -- cgit From 1751850fbd770c89347e14ae9590219bcd2f0649 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Oct 2022 11:13:06 -0400 Subject: btrfs: remove unused btrfs_cond_migrate_bytes The last user of this was removed in 7f9fe6144076 ("btrfs: improve global reserve 
stealing logic"), drop this code as it's no longer called by anybody. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 25 ------------------------- fs/btrfs/block-rsv.h | 3 --- 2 files changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 069ea9d6a8d4..29705256727f 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -325,31 +325,6 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = false; - spin_unlock(&global_rsv->lock); - - btrfs_block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 578c3497a455..7e9016a9e193 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -70,9 +70,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, bool update_size); int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes, bool update_size); u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, -- cgit From 43dd529abed2bcf1467f13cce83da1c8456587ea Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 14:21:42 +0200 Subject: btrfs: update function comments Update, reformat or reword function comments. This also removes the kdoc marker so we don't get reports when the function name is missing. 
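[ Note: the "kdoc marker" being removed is the two-asterisk opener that makes scripts/kernel-doc parse and validate a comment; downgrading it to a plain opener silences the missing-function-name reports without losing any text. Illustrative before/after with a hypothetical function: ]

/**
 * btrfs_frob_thing - frob the thing
 * @thing: the thing to frob
 *
 * Parsed by scripts/kernel-doc, which warns if the "name - description"
 * line is missing or does not match the function below.
 */

/*
 * Frob the thing.
 *
 * @thing: the thing to frob
 *
 * An ordinary comment, invisible to kernel-doc.
 */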
Changes made: - remove kdoc markers - reformat the brief description to be a proper sentence - reword to imperative voice - align parameter list - fix typos Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 20 ++++---- fs/btrfs/ctree.c | 11 +++-- fs/btrfs/delalloc-space.c | 59 +++++++++++------------ fs/btrfs/delayed-ref.c | 19 ++++---- fs/btrfs/discard.c | 111 +++++++++++++++++++++++++------------------- fs/btrfs/extent_io.c | 59 ++++++++++++----------- fs/btrfs/extent_map.c | 65 ++++++++++++-------------- fs/btrfs/file-item.c | 50 ++++++++++---------- fs/btrfs/free-space-cache.c | 9 ++-- fs/btrfs/inode.c | 14 +++--- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 13 +++--- fs/btrfs/raid56.c | 4 +- fs/btrfs/reflink.c | 18 +++---- fs/btrfs/space-info.c | 16 +++---- fs/btrfs/tree-log.c | 2 +- fs/btrfs/ulist.c | 35 ++++++++------ fs/btrfs/volumes.c | 43 +++++++++-------- fs/btrfs/zoned.c | 6 +-- fs/btrfs/zstd.c | 2 +- 20 files changed, 289 insertions(+), 269 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5a743c4d4e97..cfe495d39ebd 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -299,7 +299,7 @@ struct btrfs_block_group *btrfs_next_block_group( return cache; } -/** +/* * Check if we can do a NOCOW write for a given extent. * * @fs_info: The filesystem information object. @@ -340,11 +340,9 @@ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, return bg; } -/** +/* * Decrement the number of NOCOW writers in a block group. * - * @bg: The block group. - * * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), * and on the block group returned by that call. Typically this is called after * creating an ordered extent for a NOCOW write, to prevent races with scrub and @@ -1813,8 +1811,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } -/** - * Map a physical disk address to a list of logical addresses +/* + * Map a physical disk address to a list of logical addresses. * * @fs_info: the filesystem * @chunk_start: logical address of block group @@ -3421,8 +3419,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, return ret; } -/** - * btrfs_add_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @ram_bytes: The number of bytes of file content, and will be same to * @num_bytes except for the compress path. @@ -3465,8 +3464,9 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, return ret; } -/** - * btrfs_free_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @delalloc: The blocks are allocated for the delalloc write diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f0ddb44704ee..4b47d380da1e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2363,7 +2363,7 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } -/** +/* * Search for a valid slot for the given path. * * @root: The root node of the tree. @@ -3985,14 +3985,15 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/** - * setup_items_for_insert - Helper called before inserting one or more items - * to a leaf. 
Main purpose is to save stack depth by doing the bulk of the work - * in a function that doesn't call btrfs_search_slot +/* + * Make space in the node before inserting one or more items. * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert + * + * Main purpose is to save stack depth by doing the bulk of the work in a + * function that doesn't call btrfs_search_slot */ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 605d8874a446..7ddb1d104e8e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -202,8 +202,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, reserved, start, len); } -/** - * Release any excessive reservation +/* + * Release any excessive reservations for an inode. * * @inode: the inode we need to release from * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup @@ -377,12 +377,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, return 0; } -/** - * Release a metadata reservation for an inode +/* + * Release a metadata reservation for an inode. * - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata @@ -405,10 +405,11 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, btrfs_inode_rsv_release(inode, qgroup_free); } -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. - * @num_bytes: the number of bytes we originally reserved with +/* + * Release our outstanding_extents for an inode. + * + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -433,30 +434,30 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. +/* + * Reserve data and metadata space for delalloc + * + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. 
* * This will do the following things * - * - reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space + * - reserve space in data space info for num bytes and reserve precious + * corresponding qgroup space * (Done in check_data_free_space) * * - reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. + * extents and how much csums will be needed also reserve metadata space in a + * per root over-reserve method. * - add to the inodes->delalloc_bytes * - add it to the fs_info's delalloc inodes list. * (Above 3 all done in delalloc_reserve_metadata) * * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) + * Return <0 for error(-ENOSPC or -EDQUOT) */ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) @@ -475,7 +476,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, return ret; } -/** +/* * Release data and metadata space for delalloc * * @inode: inode we're releasing space for @@ -484,10 +485,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, * @len: length of the space already reserved * @qgroup_free: should qgroup reserved-space also be freed * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. + * Release the metadata space that was not used and will decrement + * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if + * there are no delalloc bytes left. Also it will handle the qgroup reserved + * space. */ void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 010cc16297d8..573ebab886e2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -71,14 +71,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) return btrfs_check_space_for_delayed_refs(trans->fs_info); } -/** - * Release a ref head's reservation +/* + * Release a ref head's reservation. * * @fs_info: the filesystem * @nr: number of items to drop * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. + * Drops the delayed ref head's count from the delayed refs rsv and free any + * excess reservation we had. */ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { @@ -104,8 +104,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) } /* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs + * Adjust the size of the delayed refs rsv. * * This is to be called anytime we may have adjusted trans->delayed_ref_updates, * it'll calculate the additional size and add it to the delayed_refs_rsv. @@ -139,8 +138,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) trans->delayed_ref_updates = 0; } -/** - * Transfer bytes to our delayed refs rsv +/* + * Transfer bytes to our delayed refs rsv. 
* * @fs_info: the filesystem * @src: source block rsv to transfer from @@ -188,8 +187,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, delayed_refs_rsv->space_info, to_free); } -/** - * Refill based on our delayed refs usage +/* + * Refill based on our delayed refs usage. * * @fs_info: the filesystem * @flush: control how we can flush for this reservation. diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 51f0ef386046..ff2e524d9937 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -62,7 +62,7 @@ #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) #define BTRFS_DISCARD_MAX_IOPS (10U) -/* Montonically decreasing minimum length filters after index 0 */ +/* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { 0, BTRFS_ASYNC_DISCARD_MAX_FILTER, @@ -147,10 +147,11 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, return running; } -/** - * find_next_block_group - find block_group that's up next for discarding - * @discard_ctl: discard control - * @now: current time +/* + * Find block_group that's up next for discarding. + * + * @discard_ctl: discard control + * @now: current time * * Iterate over the discard lists to find the next block_group up for * discarding checking the discard_eligible_time of block_group. @@ -185,17 +186,17 @@ static struct btrfs_block_group *find_next_block_group( return ret_block_group; } -/** - * Wrap find_next_block_group() +/* + * Look up next block group and set it for use. * * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management * @now: time when discard was invoked, in ns * - * This wraps find_next_block_group() and sets the block_group to be in use. - * discard_state's control flow is managed here. Variables related to - * discard_state are reset here as needed (eg discard_cursor). @discard_state + * Wrap find_next_block_group() and set the block_group to be in use. + * @discard_state's control flow is managed here. Variables related to + * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state * and @discard_index are remembered as it may change while we're discarding, * but we want the discard to execute in the context determined here. */ @@ -234,10 +235,11 @@ again: return block_group; } -/** - * btrfs_discard_check_filter - updates a block groups filters - * @block_group: block group of interest - * @bytes: recently freed region size after coalescing +/* + * Update a block group's filters. + * + * @block_group: block group of interest + * @bytes: recently freed region size after coalescing * * Async discard maintains multiple lists with progressively smaller filters * to prioritize discarding based on size. Should a free space that matches @@ -272,8 +274,9 @@ void btrfs_discard_check_filter(struct btrfs_block_group *block_group, } } -/** - * btrfs_update_discard_index - moves a block group along the discard lists +/* + * Move a block group along the discard lists. + * * @discard_ctl: discard control * @block_group: block_group of interest * @@ -292,13 +295,14 @@ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, add_to_discard_list(discard_ctl, block_group); } -/** - * btrfs_discard_cancel_work - remove a block_group from the discard lists +/* + * Remove a block_group from the discard lists. 
+ * * @discard_ctl: discard control * @block_group: block_group of interest * - * This removes @block_group from the discard lists. If necessary, it waits on - * the current work and then reschedules the delayed work. + * Remove @block_group from the discard lists. If necessary, wait on the + * current work and then reschedule the delayed work. */ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -309,12 +313,13 @@ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_queue_work - handles queuing the block_groups +/* + * Handles queuing the block_groups. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This maintains the LRU order of the discard lists. + * Maintain the LRU order of the discard lists. */ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -384,7 +389,8 @@ static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, } /* - * btrfs_discard_schedule_work - responsible for scheduling the discard work + * Responsible for scheduling the discard work. + * * @discard_ctl: discard control * @override: override the current timer * @@ -402,15 +408,16 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, spin_unlock(&discard_ctl->lock); } -/** - * btrfs_finish_discard_pass - determine next step of a block_group +/* + * Determine next step of a block_group. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This determines the next step for a block group after it's finished going - * through a pass on a discard list. If it is unused and fully trimmed, we can - * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto - * the appropriate filter list or let it fall off. + * Determine the next step for a block group after it's finished going through + * a pass on a discard list. If it is unused and fully trimmed, we can mark it + * unused and send it to the unused_bgs path. Otherwise, pass it onto the + * appropriate filter list or let it fall off. */ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -427,12 +434,13 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_workfn - discard work function +/* + * Discard work queue callback + * * @work: work * - * This finds the next block_group to start discarding and then discards a - * single region. It does this in a two-pass fashion: first extents and second + * Find the next block_group to start discarding and then discard a single + * region. It does this in a two-pass fashion: first extents and second * bitmaps. Completely discarded block groups are sent to the unused_bgs path. */ static void btrfs_discard_workfn(struct work_struct *work) @@ -508,11 +516,12 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_run_discard_work - determines if async discard should be running +/* + * Determine if async discard should be running. + * * @discard_ctl: discard control * - * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. 
*/ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) { @@ -524,8 +533,9 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); } -/** - * btrfs_discard_calc_delay - recalculate the base delay +/* + * Recalculate the base delay. + * * @discard_ctl: discard control * * Recalculate the base delay which is based off the total number of @@ -546,7 +556,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_lock(&discard_ctl->lock); /* - * The following is to fix a potential -1 discrepenancy that we're not + * The following is to fix a potential -1 discrepancy that we're not * sure how to reproduce. But given that this is the only place that * utilizes these numbers and this is only called by from * btrfs_finish_extent_commit() which is synchronized, we can correct @@ -579,13 +589,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_discard_update_discardable - propagate discard counters +/* + * Propagate discard counters. + * * @block_group: block_group of interest * - * This propagates deltas of counters up to the discard_ctl. It maintains a - * current counter and a previous counter passing the delta up to the global - * stat. Then the current counter value becomes the previous counter value. + * Propagate deltas of counters up to the discard_ctl. It maintains a current + * counter and a previous counter passing the delta up to the global stat. + * Then the current counter value becomes the previous counter value. */ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { @@ -620,8 +631,9 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) } } -/** - * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists +/* + * Punt unused_bgs list to discard lists. + * * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the @@ -645,8 +657,9 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->unused_bgs_lock); } -/** - * btrfs_discard_purge_list - purge discard lists +/* + * Purge discard lists. + * * @discard_ctl: discard control * * If we are disabling async discard, we may have intercepted block groups that diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4b47bb8c590f..545d9e1f0f83 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1298,7 +1298,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) bio_put(bio); } -/** +/* * Populate every free slot in a provided array with pages. * * @nr_pages: number of pages to allocate @@ -1334,16 +1334,16 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } -/** - * Attempt to add a page to bio +/* + * Attempt to add a page to bio. 
* - * @bio_ctrl: record both the bio, and its bio_flags - * @page: page to add to the bio - * @disk_bytenr: offset of the new bio or to check whether we are adding - * a contiguous page to the previous one - * @size: portion of page that we want to write - * @pg_offset: starting offset in the page - * @compress_type: compression type of the current bio to see if we can merge them + * @bio_ctrl: record both the bio, and its bio_flags + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @size: portion of page that we want to write + * @pg_offset: starting offset in the page + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -3066,7 +3066,7 @@ retry: return ret; } -/** +/* * Walk the list of dirty pages of the given address space and write all of them. * * @mapping: address space structure to write @@ -5460,11 +5460,12 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, *page_offset = offset_in_page(offset); } -/** - * extent_buffer_test_bit - determine whether a bit in a bitmap item is set - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number to test +/* + * Determine whether a bit in a bitmap item is set. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test */ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) @@ -5481,12 +5482,13 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } -/** - * extent_buffer_bitmap_set - set an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to set +/* + * Set an area of a bitmap to 1. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set */ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) @@ -5523,12 +5525,13 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star } -/** - * extent_buffer_bitmap_clear - clear an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to clear +/* + * Clear an area of a bitmap. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear */ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f97508afb659..cd45303f344d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -28,12 +28,9 @@ void __cold extent_map_exit(void) kmem_cache_destroy(extent_map_cache); } -/** - * extent_map_tree_init - initialize extent map tree - * @tree: tree to initialize - * - * Initialize the extent tree @tree. Should be called for each new inode - * or other user of the extent_map interface. +/* + * Initialize the extent tree @tree. Should be called for each new inode or + * other user of the extent_map interface. 
*/ void extent_map_tree_init(struct extent_map_tree *tree) { @@ -42,12 +39,9 @@ void extent_map_tree_init(struct extent_map_tree *tree) rwlock_init(&tree->lock); } -/** - * alloc_extent_map - allocate new extent map structure - * - * Allocate a new extent_map structure. The new structure is - * returned with a reference count of one and needs to be - * freed using free_extent_map() +/* + * Allocate a new extent_map structure. The new structure is returned with a + * reference count of one and needs to be freed using free_extent_map() */ struct extent_map *alloc_extent_map(void) { @@ -62,12 +56,9 @@ struct extent_map *alloc_extent_map(void) return em; } -/** - * free_extent_map - drop reference count of an extent_map - * @em: extent map being released - * - * Drops the reference out on @em by one and free the structure - * if the reference count hits zero. +/* + * Drop the reference out on @em by one and free the structure if the reference + * count hits zero. */ void free_extent_map(struct extent_map *em) { @@ -82,7 +73,7 @@ void free_extent_map(struct extent_map *em) } } -/* simple helper to do math around the end of an extent, handling wrap */ +/* Do the math around the end of an extent, handling wrapping. */ static u64 range_end(u64 start, u64 len) { if (start + len < start) @@ -138,8 +129,8 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) } /* - * search through the tree for an extent_map with a given offset. If - * it can't be found, try to find some neighboring extents + * Search through the tree for an extent_map with a given offset. If it can't + * be found, try to find some neighboring extents */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node **prev_or_next_ret) @@ -191,7 +182,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* check to see if two extent_map structs are adjacent and safe to merge */ +/* Check to see if two extent_map structs are adjacent and safe to merge. */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -289,8 +280,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) } } -/** - * unpin_extent_cache - unpin an extent from the cache +/* + * Unpin an extent from the cache. + * * @tree: tree to unpin the extent in * @start: logical offset in the file * @len: length of the extent @@ -393,7 +385,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) } } -/** +/* * Add new extent map to the extent tree * * @tree: tree to insert new map in @@ -452,8 +444,9 @@ __lookup_extent_mapping(struct extent_map_tree *tree, return em; } -/** - * lookup_extent_mapping - lookup extent_map +/* + * Lookup extent_map that intersects @start + @len range. + * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -469,8 +462,9 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 1); } -/** - * search_extent_mapping - find a nearby extent map +/* + * Find a nearby extent map intersecting @start + @len (not an exact search). 
+ * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -486,13 +480,14 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 0); } -/** - * remove_extent_mapping - removes an extent_map from the extent tree +/* + * Remove an extent_map from the extent tree. + * * @tree: extent tree to remove from * @em: extent map being removed * - * Removes @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use + * Remove @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use. */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { @@ -615,8 +610,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, return add_extent_mapping(em_tree, em, 0); } -/** - * Add extent mapping into em_tree +/* + * Add extent mapping into em_tree. * * @fs_info: the filesystem * @em_tree: extent tree into which we want to insert the extent mapping diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bce6c8d31bc0..403a857d230e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -27,8 +27,8 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -/** - * Set inode's size according to filesystem options +/* + * Set inode's size according to filesystem options. * * @inode: inode we want to update the disk_i_size for * @new_i_size: i_size we want to set to, 0 if we use i_size @@ -67,8 +67,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz spin_unlock(&inode->lock); } -/** - * Mark range within a file as having a new extent inserted +/* + * Mark range within a file as having a new extent inserted. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -95,8 +95,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, EXTENT_DIRTY); } -/** - * Marks an inode range as not having a backing extent +/* + * Mark an inode range as not having a backing extent. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -257,7 +257,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, /* * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and - * estore the result to @dst. + * store the result to @dst. * * Return >0 for the number of sectors we found. * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum @@ -363,15 +363,15 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, return ret; } -/** +/* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. + * @inode: inode that the bio is for. + * @bio: bio to look up. + * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return + * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If + * NULL, the checksum buffer is allocated and returned in + * btrfs_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. 
*/ @@ -633,8 +633,8 @@ fail: return ret; } -/** - * Calculate checksums of the data contained inside a bio +/* + * Calculate checksums of the data contained inside a bio. * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed @@ -749,15 +749,16 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } /* - * helper function for csum removal, this expects the - * key to describe the csum pointed to by the path, and it expects - * the csum to overlap the range [bytenr, len] + * Remove one checksum overlapping a range. + * + * This expects the key to describe the csum pointed to by the path, and it + * expects the csum to overlap the range [bytenr, len] * - * The csum should not be entirely contained in the range and the - * range should not be entirely contained in the csum. + * The csum should not be entirely contained in the range and the range should + * not be entirely contained in the csum. * - * This calls btrfs_truncate_item with the correct args based on the - * overlap, and fixes up the key as required. + * This calls btrfs_truncate_item with the correct args based on the overlap, + * and fixes up the key as required. */ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, struct btrfs_path *path, @@ -806,8 +807,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, } /* - * deletes the csum items from the csum tree for a given - * range of bytes. + * Delete the csum items from the csum tree for a given range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 26ff1a5100b9..599b41523479 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1369,8 +1369,8 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, path, block_group->start); } -/** - * Write out cached info to an inode +/* + * Write out cached info to an inode. * * @root: root the inode belongs to * @inode: freespace inode we are writing out @@ -3034,10 +3034,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) } -/** - * btrfs_is_free_space_trimmed - see if everything is trimmed - * @block_group: block_group of interest - * +/* * Walk @block_group's free space rb_tree to determine if everything is trimmed. */ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec9f0f4afd12..5b9857e30a08 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6895,18 +6895,18 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -/** - * btrfs_get_extent - Lookup the first extent overlapping a range in a file. +/* + * Lookup the first extent overlapping a range in a file. + * * @inode: file to search in * @page: page to read extent data into if the extent is inline * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * - * This returns the first &struct extent_map which overlaps with the given - * range, reading it from the B-tree and caching it if necessary. Note that - * there may be more extents which overlap the given range after the returned - * extent_map. + * Return the first &struct extent_map which overlaps the given range, reading + * it from the B-tree and caching it if necessary. Note that there may be more + * extents which overlap the given range after the returned extent_map. 
* * If @page is not NULL and the extent is inline, this also reads the extent * data directly into the page and marks the extent up to date in the io_tree. @@ -11310,7 +11310,7 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, spin_unlock(&inode->lock); } -/** +/* * Verify that there are no ordered extents for a given file range. * * @inode: The target inode.
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 52a268e05d4a..96858240588d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4346,7 +4346,7 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } -/** +/* * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as * required. *
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1cbaacdc50da..1c36407803ca 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -144,7 +144,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/** +/* * Add an ordered extent to the per-inode tree. * * @inode: Inode that this extent is for. @@ -1020,17 +1020,18 @@ out: } /* - * btrfs_flush_ordered_range - Lock the passed range and ensures all pending - * ordered extents in it are run to completion. + * Lock the passed range and ensure all pending ordered extents in it are run + * to completion. * * @inode: Inode whose ordered tree is to be searched * @start: Beginning of range to flush * @end: Last byte of range to lock * @cached_state: If passed, will return the extent state responsible for the - * locked range. It's the caller's responsibility to free the cached state. + * locked range. It's the caller's responsibility to free the + * cached state. * - * This function always returns with the given range locked, ensuring after it's - * called no order extent can be pending. + * Always return with the given range locked, ensuring after it's called no + * ordered extent can be pending. */ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 349270537c2e..ef272e7d932c 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -908,8 +908,8 @@ static void raid_write_end_io(struct bio *bio) rbio_orig_end_io(rbio, err); } -/** - * Get a sector pointer specified by its @stripe_nr and @sector_nr +/* + * Get a sector pointer specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe)
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f0243eb33df7..9ba7914c7169 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -321,16 +321,16 @@ copy_to_page: goto out; } -/** - * btrfs_clone() - clone a range from inode file to another +/* + * Clone a range from inode file to another.
* - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e5f9f43ffa4c..11e5b5a1eb5a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1493,8 +1493,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -/** - * Do the appropriate flushing and waiting for a ticket +/* + * Do the appropriate flushing and waiting for a ticket. * * @fs_info: the filesystem * @space_info: space info for the reservation @@ -1596,8 +1596,8 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) flush != BTRFS_RESERVE_FLUSH_EMERGENCY); } -/** - * Try to reserve bytes from the block_rsv's space +/* + * Try to reserve bytes from the block_rsv's space. * * @fs_info: the filesystem * @space_info: space info we want to allocate from @@ -1736,8 +1736,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, orig_bytes, flush); } -/** - * Trye to reserve metadata bytes from the block_rsv's space +/* + * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for @@ -1771,8 +1771,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, return ret; } -/** - * Try to reserve data bytes for an allocation +/* + * Try to reserve data bytes for an allocation. * * @fs_info: the filesystem * @bytes: number of bytes we need diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f185cd286243..e358df3668e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7381,7 +7381,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/** +/* * Update the log after adding a new name for an inode. * * @trans: Transaction handle. diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index f2f20c8d84aa..13af1e41f9d7 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -38,8 +38,9 @@ * loop would be similar to the above. */ -/** - * ulist_init - freshly initialize a ulist +/* + * Freshly initialize a ulist. + * * @ulist: the ulist to initialize * * Note: don't use this function to init an already used ulist, use @@ -52,8 +53,9 @@ void ulist_init(struct ulist *ulist) ulist->nnodes = 0; } -/** - * ulist_release - free up additionally allocated memory for the ulist +/* + * Free up additionally allocated memory for the ulist. + * * @ulist: the ulist from which to free the additional memory * * This is useful in cases where the base 'struct ulist' has been statically @@ -71,8 +73,9 @@ void ulist_release(struct ulist *ulist) INIT_LIST_HEAD(&ulist->nodes); } -/** - * ulist_reinit - prepare a ulist for reuse +/* + * Prepare a ulist for reuse. 
+ * * @ulist: ulist to be reused * * Free up all additional memory allocated for the list elements and reinit * @@ -84,8 +87,9 @@ void ulist_reinit(struct ulist *ulist) ulist_init(ulist); } -/** - * ulist_alloc - dynamically allocate a ulist +/* + * Dynamically allocate a ulist. + * * @gfp_mask: allocation flags to for base allocation * * The allocated ulist will be returned in an initialized state. @@ -102,8 +106,9 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -/** - * ulist_free - free dynamically allocated ulist +/* + * Free dynamically allocated ulist. + * * @ulist: ulist to free * * It is not necessary to call ulist_release before. @@ -164,8 +169,9 @@ static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) return 0; } -/** - * ulist_add - add an element to the ulist +/* + * Add an element to the ulist. + * * @ulist: ulist to add the element to * @val: value to add to ulist * @aux: auxiliary value to store along with val @@ -243,8 +249,9 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) return 0; } -/** - * ulist_next - iterate ulist +/* + * Iterate ulist. + * * @ulist: ulist to iterate * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) *
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 767c56be6b96..f61bb79d4a7e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -531,14 +531,14 @@ error: return ret; } -/** - * Search and remove all stale devices (which are not mounted). - * When both inputs are NULL, it will search and release all stale devices. +/* + * Search and remove all stale devices (which are not mounted). When both + * inputs are NULL, it will search and release all stale devices. * - * @devt: Optional. When provided will it release all unmounted devices - * matching this devt only. + * @devt: Optional. When provided, it will release all unmounted devices + * matching this devt only. * @skip_device: Optional. Will skip this device when searching for the stale - * devices. + * devices. * * Return: 0 for success or if @devt is 0. * -EBUSY if @devt is a mounted device. @@ -1478,8 +1478,9 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, return changed; } -/** - * dev_extent_hole_check - check if specified hole is suitable for allocation +/* + * Check if specified hole is suitable for allocation. + * * @device: the device which we have the hole * @hole_start: starting position of the hole * @hole_size: the size of the hole @@ -1533,7 +1534,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } /* - * find_free_dev_extent_start - find free space in the specified device + * Find free space in the specified device. + * * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @search_start: the position from which to begin the search @@ -1541,9 +1543,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * @len: the size of the free space. that we find, or the size * of the max free space if we don't find suitable free space * - * this uses a pretty simple search, the expectation is that it is - * called very infrequently and that a given device has a small number - * of extents + * This does a pretty simple search; the expectation is that it is called very + * infrequently and that a given device has a small number of extents. * * @start is used to store the start of the free space if we find.
But if we * don't find suitable free space, it will be used to store the start position @@ -2322,8 +2323,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_free_device(tgtdev); } -/** - * Populate args from device at path +/* + * Populate args from device at path. * * @fs_info: the filesystem * @args: the args to populate @@ -4031,10 +4032,11 @@ error: return ret; } -/** - * alloc_profile_is_valid - see if a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile +/* + * See if a given profile is valid and reduced. + * + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile */ static int alloc_profile_is_valid(u64 flags, int extended) { @@ -7009,8 +7011,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, return device; } -/** - * btrfs_alloc_device - allocate struct btrfs_device +/* + * Allocate new device struct, set up devid and UUID. + * * @fs_info: used only for generating a new devid, can be NULL if * devid is provided (i.e. @devid != NULL). * @devid: a pointer to devid for this device. If NULL a new devid diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index fe66cb929af9..13034c3bbb65 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1020,8 +1020,8 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } -/** - * btrfs_find_allocatable_zones - find allocatable zones within a given region +/* + * Find allocatable zones within a given region. * * @device: the device to allocate a region on * @hole_start: the position of the hole to allocate the region @@ -1864,7 +1864,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, return device; } -/** +/* * Activate block group and underlying device zones * * @block_group: the block group to activate diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 35a0224d4eb7..4575b3703e74 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -94,7 +94,7 @@ static inline struct workspace *list_to_workspace(struct list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/** +/* * Timer callback to free unused workspaces. * * @t: timer -- cgit From cb9a10a6504bb35095e97f4839fcf353bf1458ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:16 -0400 Subject: btrfs: convert discard stat defs to enum Do away with the defines and use an enum as it's cleaner. Suggested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index cab954a9d97b..a855e0483e03 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -48,9 +48,11 @@ static inline bool btrfs_free_space_trimming_bitmap( * to make it clear what we're doing. An example is discard_extents in * btrfs_free_space_ctl. 
*/ -#define BTRFS_STAT_NR_ENTRIES 2 -#define BTRFS_STAT_CURR 0 -#define BTRFS_STAT_PREV 1 +enum { + BTRFS_STAT_CURR, + BTRFS_STAT_PREV, + BTRFS_STAT_NR_ENTRIES, +}; struct btrfs_free_space_ctl { spinlock_t tree_lock;
-- cgit From b31bed170d5241d4c9e2fc572f31fc15b274a6c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:17 -0400 Subject: btrfs: move btrfs_chunk_item_size out of ctree.h

This is used by the volumes code and the tree checker code. We want to keep it inline, however, so simply move it to volumes.h.

Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ------- fs/btrfs/volumes.h | 8 ++++++++ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs')
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 09005bd9bfef..1e0944c37547 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -54,13 +54,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -static inline unsigned long btrfs_chunk_item_size(int num_stripes) -{ - BUG_ON(num_stripes == 0); - return sizeof(struct btrfs_chunk) + - sizeof(struct btrfs_stripe) * (num_stripes - 1); -} - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a20ee7d57831..b05124e62412 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -10,6 +10,7 @@ #include #include #include "async-thread.h" +#include "messages.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -605,6 +606,13 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) } } +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + ASSERT(num_stripes); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
-- cgit From 3683fbbc2314d6721a353f8bf4285a819ea9b512 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:18 -0400 Subject: btrfs: add dependencies to fs.h and block-rsv.h

There are several structures embedded inside of struct btrfs_fs_info, so if we don't have all the proper includes when we include fs.h we'll get a variety of compile errors. I fixed this by adding a temporary C file that just had #include "fs.h" and then added include files until the compiler stopped complaining.
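For illustration, the probe looked roughly like this (the file name is arbitrary and the file was never committed):

	/* temporary compile probe, e.g. fs/btrfs/header-probe.c */
	#include "fs.h"

Temporarily wiring it into the build, for example with a throwaway "btrfs-y += header-probe.o" in fs/btrfs/Makefile followed by "make fs/btrfs/", makes each missing type or forward declaration show up as a compile error pointing at the next #include that fs.h needs, until the header builds standalone.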
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.h | 1 + fs/btrfs/fs.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'fs')
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 7e9016a9e193..e2cf0dd43203 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -4,6 +4,7 @@ #define BTRFS_BLOCK_RSV_H struct btrfs_trans_handle; +struct btrfs_root; enum btrfs_reserve_flush_enum; /*
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index d8c931014c2a..a49d11127bf7 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,14 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include +#include +#include +#include "extent-io-tree.h" +#include "extent_map.h" +#include "async-thread.h" +#include "block-rsv.h" + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL
-- cgit From 5034388342564877879e2a94faf72eff468a7437 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:19 -0400 Subject: btrfs: add blk_types.h include to compression.h

When moving the printk messages into their own file I got a compiler error because the includes grabbed compression.h, but nothing pulled in the blk_types.h dependency that compression.h has because it uses blk_status_t. Add blk_types.h to compression.h so that this sort of thing doesn't happen in the future.

Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs')
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9da2343eeff5..b961462399ae 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -6,6 +6,7 @@ #ifndef BTRFS_COMPRESSION_H #define BTRFS_COMPRESSION_H +#include <linux/blk_types.h> #include struct btrfs_inode;
-- cgit From 083bd7e54e8e2c1d169f21236873272cdf6f409a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:20 -0400 Subject: btrfs: move the printk and assert helpers to messages.c

These helpers are core to btrfs, and in order to more easily sync various parts of the btrfs kernel code into btrfs-progs we need to be able to carry these helpers with us. However, we want to have our own implementation for the helpers themselves; currently they're implemented in different files that we want to sync inside of btrfs-progs itself. Move these into their own C file; this will allow us to contain our overrides in btrfs-progs in its own file without messing with the rest of the codebase. In copying things over I fixed up a few whitespace errors that already existed.
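For illustration, a userspace counterpart of one of the moved helpers could be as small as the following sketch (the btrfs-progs file layout here is an assumption; the prototype matches the kernel helper added below):

	/* hypothetical btrfs-progs messages.c: same prototype, userspace behavior */
	#include <stdio.h>
	#include <stdlib.h>

	void btrfs_assertfail(const char *expr, const char *file, int line)
	{
		fprintf(stderr, "assertion failed: %s, in %s:%d\n", expr, file, line);
		abort();	/* userspace stand-in for the kernel's BUG() */
	}

As long as the prototypes stay identical, the shared code compiles unchanged against either implementation.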
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/messages.c | 352 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/super.c | 346 --------------------------------------------------- 3 files changed, 353 insertions(+), 347 deletions(-) create mode 100644 fs/btrfs/messages.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 76f90dcfb14d..bb170c9d030d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c new file mode 100644 index 000000000000..196757ee16f1 --- /dev/null +++ b/fs/btrfs/messages.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "fs.h" +#include "messages.h" +#include "discard.h" +#include "transaction.h" +#include "space-info.h" + +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem state. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + +/* + * Generally the error codes correspond to their respective errors, but there + * are a few special cases. + * + * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for + * instance will return EUCLEAN if any of the blocks are corrupted in + * a way that is problematic. We want to reserve EUCLEAN for these + * sort of corruptions. + * + * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we + * need to use EROFS for this case. We will have no idea of the + * original failure, that will have been reported at the time we tripped + * over the error. Each subsequent error that doesn't have any context + * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. 
+ */ +const char * __attribute_const__ btrfs_decode_error(int errno) +{ + char *errstr = "unknown"; + + switch (errno) { + case -ENOENT: /* -2 */ + errstr = "No such entry"; + break; + case -EIO: /* -5 */ + errstr = "IO failure"; + break; + case -ENOMEM: /* -12*/ + errstr = "Out of memory"; + break; + case -EEXIST: /* -17 */ + errstr = "Object already exists"; + break; + case -ENOSPC: /* -28 */ + errstr = "No space left"; + break; + case -EROFS: /* -30 */ + errstr = "Readonly filesystem"; + break; + case -EOPNOTSUPP: /* -95 */ + errstr = "Operation not supported"; + break; + case -EUCLEAN: /* -117 */ + errstr = "Filesystem corrupted"; + break; + case -EDQUOT: /* -122 */ + errstr = "Quota exceeded"; + break; + } + + return errstr; +} + +/* + * __btrfs_handle_fs_error decodes expected errors from the caller and + * invokes the appropriate error response. + */ +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; + const char *errstr; +#endif + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt); +#endif + + /* + * Special case: if the error is EROFS, and we're already under + * SB_RDONLY, then it is safe here. + */ + if (errno == -EROFS && sb_rdonly(sb)) + return; + +#ifdef CONFIG_PRINTK + errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); + va_end(args); + } else { + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); + } +#endif + + /* + * Today we only save the error info to memory. Long term we'll also + * send it down to the disk. + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + + /* Don't go through full error handling during mount. */ + if (!(sb->s_flags & SB_BORN)) + return; + + if (sb_rdonly(sb)) + return; + + btrfs_discard_stop(fs_info); + + /* Handle error by forcing the filesystem readonly. */ + btrfs_set_sb_rdonly(sb); + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not canceled here + * although there is no way to update the progress. It would add the + * risk of a deadlock, therefore the canceling is omitted. The only + * penalty is that some I/O remains active until the procedure + * completes. The next time when the filesystem is mounted writable + * again, the device replace operation continues. + */ +} + +#ifdef CONFIG_PRINTK +static const char * const logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. 
+ */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; + struct va_format vaf; + va_list args; + int kern_level; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + + va_start(args, fmt); + + while ((kern_level = printk_get_level(fmt)) != 0) { + size_t size = printk_skip_level(fmt) - fmt; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } + fmt += size; + } + + vaf.fmt = fmt; + vaf.va = &args; + + if (__ratelimit(ratelimit)) { + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } + } + + va_end(args); +} +#endif + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line) +{ + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +} +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. 
+ * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an + * alert, and either panics or BUGs, depending on mount options. + */ +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char *s_id = ""; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(errno); + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); + + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); + va_end(args); + /* Caller calls BUG() */ +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 33bf211699f8..a4030dfeb2f2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -70,352 +70,6 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -#ifdef CONFIG_PRINTK - -#define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) - -/* - * Characters to print to indicate error conditions or uncommon filesystem state. - * RO is not an error. - */ -static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', - [BTRFS_FS_STATE_REMOUNTING] = 'M', - [BTRFS_FS_STATE_RO] = 0, - [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', - [BTRFS_FS_STATE_DEV_REPLACING] = 'R', - [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, - [BTRFS_FS_STATE_NO_CSUMS] = 'C', - [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', -}; - -static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) -{ - unsigned int bit; - bool states_printed = false; - unsigned long fs_state = READ_ONCE(info->fs_state); - char *curr = buf; - - memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); - curr += sizeof(STATE_STRING_PREFACE) - 1; - - for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { - WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); - if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { - *curr++ = fs_state_chars[bit]; - states_printed = true; - } - } - - /* If no states were printed, reset the buffer */ - if (!states_printed) - curr = buf; - - *curr++ = 0; -} -#endif - -/* - * Generally the error codes correspond to their respective errors, but there - * are a few special cases. - * - * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for - * instance will return EUCLEAN if any of the blocks are corrupted in - * a way that is problematic. We want to reserve EUCLEAN for these - * sort of corruptions. 
- * - * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we - * need to use EROFS for this case. We will have no idea of the - * original failure, that will have been reported at the time we tripped - * over the error. Each subsequent error that doesn't have any context - * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. - */ -const char * __attribute_const__ btrfs_decode_error(int errno) -{ - char *errstr = "unknown"; - - switch (errno) { - case -ENOENT: /* -2 */ - errstr = "No such entry"; - break; - case -EIO: /* -5 */ - errstr = "IO failure"; - break; - case -ENOMEM: /* -12*/ - errstr = "Out of memory"; - break; - case -EEXIST: /* -17 */ - errstr = "Object already exists"; - break; - case -ENOSPC: /* -28 */ - errstr = "No space left"; - break; - case -EROFS: /* -30 */ - errstr = "Readonly filesystem"; - break; - case -EOPNOTSUPP: /* -95 */ - errstr = "Operation not supported"; - break; - case -EUCLEAN: /* -117 */ - errstr = "Filesystem corrupted"; - break; - case -EDQUOT: /* -122 */ - errstr = "Quota exceeded"; - break; - } - - return errstr; -} - -/* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. - */ -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; -#ifdef CONFIG_PRINTK - char statestr[STATE_STRING_BUF_LEN]; - const char *errstr; -#endif - -#ifdef CONFIG_PRINTK_INDEX - printk_index_subsys_emit( - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", - KERN_CRIT, fmt); -#endif - - /* - * Special case: if the error is EROFS, and we're already - * under SB_RDONLY, then it is safe here. - */ - if (errno == -EROFS && sb_rdonly(sb)) - return; - -#ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); - btrfs_state_to_string(fs_info, statestr); - if (fmt) { - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); - va_end(args); - } else { - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); - } -#endif - - /* - * Today we only save the error info to memory. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); - - /* Don't go through full error handling during mount */ - if (!(sb->s_flags & SB_BORN)) - return; - - if (sb_rdonly(sb)) - return; - - btrfs_discard_stop(fs_info); - - /* btrfs handle error by forcing the filesystem readonly */ - btrfs_set_sb_rdonly(sb); - btrfs_info(fs_info, "forced readonly"); - /* - * Note that a running device replace operation is not canceled here - * although there is no way to update the progress. It would add the - * risk of a deadlock, therefore the canceling is omitted. The only - * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writable - * again, the device replace operation continues. - */ -} - -#ifdef CONFIG_PRINTK -static const char * const logtypes[] = { - "emergency", - "alert", - "critical", - "error", - "warning", - "notice", - "info", - "debug", -}; - - -/* - * Use one ratelimit state per log level so that a flood of less important - * messages doesn't cause more important ones to be dropped. 
- */ -static struct ratelimit_state printk_limits[] = { - RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), -}; - -void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ - char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; - struct va_format vaf; - va_list args; - int kern_level; - const char *type = logtypes[4]; - struct ratelimit_state *ratelimit = &printk_limits[4]; - -#ifdef CONFIG_PRINTK_INDEX - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); -#endif - - va_start(args, fmt); - - while ((kern_level = printk_get_level(fmt)) != 0) { - size_t size = printk_skip_level(fmt) - fmt; - - if (kern_level >= '0' && kern_level <= '7') { - memcpy(lvl, fmt, size); - lvl[size] = '\0'; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } - fmt += size; - } - - vaf.fmt = fmt; - vaf.va = &args; - - if (__ratelimit(ratelimit)) { - if (fs_info) { - char statestr[STATE_STRING_BUF_LEN]; - - btrfs_state_to_string(fs_info, statestr); - _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, - fs_info->sb->s_id, statestr, &vaf); - } else { - _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); - } - } - - va_end(args); -} -#endif - -#ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} -#endif - -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} - -#if BITS_PER_LONG == 32 -void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { - btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); - btrfs_warn(fs_info, -"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_warn(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} - -void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { - btrfs_err(fs_info, "reached 32bit limit for logical addresses"); - btrfs_err(fs_info, -"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_err(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} -#endif - -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. 
- * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} -/* - * __btrfs_panic decodes unexpected, fatal errors from the caller, - * issues an alert, and either panics or BUGs, depending on mount options. - */ -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - char *s_id = ""; - const char *errstr; - struct va_format vaf = { .fmt = fmt }; - va_list args; - - if (fs_info) - s_id = fs_info->sb->s_id; - - va_start(args, fmt); - vaf.va = &args; - - errstr = btrfs_decode_error(errno); - if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); - - btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); - va_end(args); - /* Caller calls BUG() */ -} - static void btrfs_put_super(struct super_block *sb) { close_ctree(btrfs_sb(sb)); -- cgit From 2885fd632050ef47317a97a3a68bc98634a6e11e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:21 -0400 Subject: btrfs: move inode prototypes to btrfs_inode.h I initially wanted to make a new header file for this, but these prototypes do naturally fit into btrfs_inode.h. If we want to extract vfs from pure btrfs code in the future we may need to split this up, but btrfs_inode embeds the vfs_inode, so it makes sense to put the prototypes in this header for now. 
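For context, the embedding looks roughly like this (heavily abridged from btrfs_inode.h; only the members relevant to the argument are shown):

	struct btrfs_inode {
		/* which root the inode belongs to */
		struct btrfs_root *root;

		/* ... many btrfs-specific fields ... */

		/* the embedded vfs inode */
		struct inode vfs_inode;
	};

Because the VFS inode is a member of btrfs_inode, prototypes that take a struct inode * and ones that take a struct btrfs_inode * sit naturally in the same header.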
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 136 +++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 140 ------------------------------------------------- 2 files changed, 136 insertions(+), 140 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 530a0ebfab3f..d21c30bf7053 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -410,4 +410,140 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes + +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict); + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + const struct fscrypt_str *name); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, + const struct fscrypt_str *name, int add_backref, u64 index); +int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front); + +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, + bool in_reclaim_context); +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state); + +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). 
*/ + struct posix_acl *default_acl; + struct posix_acl *acl; + struct fscrypt_name fname; +}; + +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); + void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + u32 bits); +void btrfs_clear_delalloc_extent(struct inode *inode, + struct extent_state *state, u32 bits); +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); +void btrfs_evict_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +void btrfs_free_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int __init btrfs_init_cachep(void); +void __cold btrfs_destroy_cachep(void); +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, + struct btrfs_root *root, struct btrfs_path *path); +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, + u64 start, u64 end); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page); +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, + u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); 
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + +extern const struct dentry_operations btrfs_dentry_operations; + +/* Inode locking type flags, by default the exclusive lock is taken. */ +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; + +int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, + const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); + #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e0944c37547..a9267b117860 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -764,146 +764,6 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); -/* inode.c */ -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict); - -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode); -struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); -int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct fscrypt_str *name); -int btrfs_add_link(struct btrfs_trans_handle *trans, - struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index); -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); - -int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, - bool in_reclaim_context); -int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - unsigned int extra_bits, - struct extent_state **cached_state); -struct btrfs_new_inode_args { - /* Input */ - struct inode *dir; - struct dentry *dentry; - struct inode *inode; - bool orphan; - bool subvol; - - /* - * Output from btrfs_new_inode_prepare(), input to - * btrfs_create_new_inode(). 
- */ - struct posix_acl *default_acl; - struct posix_acl *acl; - struct fscrypt_name fname; -}; -int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, - unsigned int *trans_num_items); -int btrfs_create_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_new_inode_args *args); -void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, - struct inode *dir); - void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - u32 bits); -void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, u32 bits); -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, - struct extent_state *other); -void btrfs_split_delalloc_extent(struct inode *inode, - struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -void btrfs_evict_inode(struct inode *inode); -struct inode *btrfs_alloc_inode(struct super_block *sb); -void btrfs_destroy_inode(struct inode *inode); -void btrfs_free_inode(struct inode *inode); -int btrfs_drop_inode(struct inode *inode); -int __init btrfs_init_cachep(void); -void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path); -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); -struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end); -int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_orphan_add(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); -int btrfs_orphan_cleanup(struct btrfs_root *root); -int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); -void btrfs_add_delayed_iput(struct inode *inode); -void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_prealloc_file_range_trans(struct inode *inode, - struct btrfs_trans_handle *trans, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); -int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, - int compress_type); -int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); -ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); - -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); 
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); - -extern const struct dentry_operations btrfs_dentry_operations; - -/* Inode locking type flags, by default the exclusive lock is taken */ -enum btrfs_ilock_type { - ENUM_BIT(BTRFS_ILOCK_SHARED), - ENUM_BIT(BTRFS_ILOCK_TRY), - ENUM_BIT(BTRFS_ILOCK_MMAP), -}; - -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); -void btrfs_update_inode_bytes(struct btrfs_inode *inode, - const u64 add_bytes, - const u64 del_bytes); -void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); - /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -- cgit From 778dd695dd4d5a21eff07bb1570b570da69dfbd9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:22 -0400 Subject: btrfs: rename tree-defrag.c to defrag.c This currently has only one helper in it, and it's for tree based defrag. We have the various defrag code in 3 different places, so rename this to defrag.c. Followup patches will move the code into this new file. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/defrag.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-defrag.c | 133 ------------------------------------------------- 3 files changed, 134 insertions(+), 134 deletions(-) create mode 100644 fs/btrfs/defrag.c delete mode 100644 fs/btrfs/tree-defrag.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index bb170c9d030d..84fb3b4c35b0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -23,7 +23,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o disk-io.o \ - transaction.o inode.o file.o tree-defrag.o \ + transaction.o inode.o file.o defrag.o \ extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c new file mode 100644 index 000000000000..5df604846de6 --- /dev/null +++ b/fs/btrfs/defrag.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" +#include "accessors.h" + +/* + * Defrag all the leaves in a given btree. 
+ * Read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int next_key_ret = 0; + u64 last_ret = 0; + + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(path); + /* + * We don't need a lock on a leaf. btrfs_realloc_node() will lock all + * leafs from path->nodes[1], so set lowest_level to 1 to avoid later + * a deadlock (attempting to write lock an already write locked leaf). + */ + path->lowest_level = 1; + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + /* + * The node at level 1 must always be locked when our path has + * keep_locks set and lowest_level is 1, regardless of the value of + * path->slots[1]. + */ + BUG_ON(path->locks[1] == 0); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + &last_ret, + &root->defrag_progress); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } + /* + * Now that we reallocated the node we can find the next key. Note that + * btrfs_find_next_key() can release our path and do another search + * without COWing, this is because even with path->keep_locks = 1, + * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a + * node when path->slots[node_level - 1] does not point to the last + * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore + * we search for the next key after reallocating our node. + */ + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + BTRFS_OLDEST_GENERATION); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } +out: + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + + return ret; +} diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c deleted file mode 100644 index 5df604846de6..000000000000 --- a/fs/btrfs/tree-defrag.c +++ /dev/null @@ -1,133 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Oracle. All rights reserved. 
- */ - -#include -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "transaction.h" -#include "locking.h" -#include "accessors.h" - -/* - * Defrag all the leaves in a given btree. - * Read all the leaves and try to get key order to - * better reflect disk order - */ - -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_path *path = NULL; - struct btrfs_key key; - int ret = 0; - int wret; - int level; - int next_key_ret = 0; - u64 last_ret = 0; - - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - goto out; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - level = btrfs_header_level(root->node); - - if (level == 0) - goto out; - - if (root->defrag_progress.objectid == 0) { - struct extent_buffer *root_node; - u32 nritems; - - root_node = btrfs_lock_root_node(root); - nritems = btrfs_header_nritems(root_node); - root->defrag_max.objectid = 0; - /* from above we know this is not a leaf */ - btrfs_node_key_to_cpu(root_node, &root->defrag_max, - nritems - 1); - btrfs_tree_unlock(root_node); - free_extent_buffer(root_node); - memset(&key, 0, sizeof(key)); - } else { - memcpy(&key, &root->defrag_progress, sizeof(key)); - } - - path->keep_locks = 1; - - ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); - if (ret < 0) - goto out; - if (ret > 0) { - ret = 0; - goto out; - } - btrfs_release_path(path); - /* - * We don't need a lock on a leaf. btrfs_realloc_node() will lock all - * leafs from path->nodes[1], so set lowest_level to 1 to avoid later - * a deadlock (attempting to write lock an already write locked leaf). - */ - path->lowest_level = 1; - wret = btrfs_search_slot(trans, root, &key, path, 0, 1); - - if (wret < 0) { - ret = wret; - goto out; - } - if (!path->nodes[1]) { - ret = 0; - goto out; - } - /* - * The node at level 1 must always be locked when our path has - * keep_locks set and lowest_level is 1, regardless of the value of - * path->slots[1]. - */ - BUG_ON(path->locks[1] == 0); - ret = btrfs_realloc_node(trans, root, - path->nodes[1], 0, - &last_ret, - &root->defrag_progress); - if (ret) { - WARN_ON(ret == -EAGAIN); - goto out; - } - /* - * Now that we reallocated the node we can find the next key. Note that - * btrfs_find_next_key() can release our path and do another search - * without COWing, this is because even with path->keep_locks = 1, - * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a - * node when path->slots[node_level - 1] does not point to the last - * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore - * we search for the next key after reallocating our node. 
- */ - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, - BTRFS_OLDEST_GENERATION); - if (next_key_ret == 0) { - memcpy(&root->defrag_progress, &key, sizeof(key)); - ret = -EAGAIN; - } -out: - btrfs_free_path(path); - if (ret == -EAGAIN) { - if (root->defrag_max.objectid > root->defrag_progress.objectid) - goto done; - if (root->defrag_max.type > root->defrag_progress.type) - goto done; - if (root->defrag_max.offset > root->defrag_progress.offset) - goto done; - ret = 0; - } -done: - if (ret != -EAGAIN) - memset(&root->defrag_progress, 0, - sizeof(root->defrag_progress)); - - return ret; -} -- cgit From 6e3df18ba7e8e68015dd66bcab326a4b7aaed085 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:23 -0400 Subject: btrfs: move the auto defrag code to defrag.c This currently exists in file.c, move it to the more natural location in defrag.c. Signed-off-by: Josef Bacik [ reformat comments ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 337 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 340 ------------------------------------------------------ 2 files changed, 337 insertions(+), 340 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5df604846de6..4c8c03185ce7 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -11,6 +11,326 @@ #include "locking.h" #include "accessors.h" +static struct kmem_cache *btrfs_inode_defrag_cachep; + +/* + * When auto defrag is enabled we queue up these defrag structs to remember + * which inodes need defragging passes. + */ +struct inode_defrag { + struct rb_node rb_node; + /* Inode number */ + u64 ino; + /* + * Transid where the defrag was added, we search for extents newer than + * this. + */ + u64 transid; + + /* Root objectid */ + u64 root; + + /* + * The extent size threshold for autodefrag. + * + * This value is different for compressed/non-compressed extents, thus + * needs to be passed from higher layer. + * (aka, inode_should_defrag()) + */ + u32 extent_thresh; +}; + +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + +/* + * Pop a record for an inode into the defrag tree. The lock must be held + * already. + * + * If you're inserting a record for an older transid than an existing record, + * the transid already in the tree is lowered. + * + * If an existing record is found the defrag item you pass in is freed. + */ +static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct inode_defrag *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + int ret; + + p = &fs_info->defrag_inodes.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) + p = &parent->rb_left; + else if (ret > 0) + p = &parent->rb_right; + else { + /* + * If we're reinserting an entry for an old defrag run, + * make sure to lower the transid of our existing + * record. 
+ */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + entry->extent_thresh = min(defrag->extent_thresh, + entry->extent_thresh); + return -EEXIST; + } + } + set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); + rb_link_node(&defrag->rb_node, parent, p); + rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); + return 0; +} + +static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(fs_info)) + return 0; + + return 1; +} + +/* + * Insert a defrag record for this inode if auto defrag is enabled. + */ +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode_defrag *defrag; + u64 transid; + int ret; + + if (!__need_auto_defrag(fs_info)) + return 0; + + if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) + return 0; + + if (trans) + transid = trans->transid; + else + transid = inode->root->last_trans; + + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); + if (!defrag) + return -ENOMEM; + + defrag->ino = btrfs_ino(inode); + defrag->transid = transid; + defrag->root = root->root_key.objectid; + defrag->extent_thresh = extent_thresh; + + spin_lock(&fs_info->defrag_inodes_lock); + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { + /* + * If we set the IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, the new in-memory inode doesn't + * have the IN_DEFRAG flag set. In that case, we may find an + * existing defrag record. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + spin_unlock(&fs_info->defrag_inodes_lock); + return 0; +} + +/* + * Pick the defraggable inode that we want; if it doesn't exist, we will get the + * next one.
+ */ +static struct inode_defrag *btrfs_pick_defrag_inode( + struct btrfs_fs_info *fs_info, u64 root, u64 ino) +{ + struct inode_defrag *entry = NULL; + struct inode_defrag tmp; + struct rb_node *p; + struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; + + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) + p = parent->rb_left; + else if (ret > 0) + p = parent->rb_right; + else + goto out; + } + + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) + entry = rb_entry(parent, struct inode_defrag, rb_node); + else + entry = NULL; + } +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; +} + +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + cond_resched_lock(&fs_info->defrag_inodes_lock); + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ + struct btrfs_root *inode_root; + struct inode *inode; + struct btrfs_ioctl_defrag_range_args range; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; + + /* Get the inode */ + inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); + if (IS_ERR(inode_root)) { + ret = PTR_ERR(inode_root); + goto cleanup; + } + + inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); + btrfs_put_root(inode_root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto cleanup; + } + + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + + /* Do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + range.start = cur; + range.extent_thresh = defrag->extent_thresh; + + sb_start_write(fs_info->sb); + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); + iput(inode); + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + +cleanup: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; +} + +/* + * Run through the list of inodes in the FS that need defragging. + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; + + atomic_inc(&fs_info->defrag_running); + while (1) { + /* Pause the auto defragger. 
*/ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + break; + + if (!__need_auto_defrag(fs_info)) + break; + + /* find an inode to defrag */ + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); + if (!defrag) { + if (root_objectid || first_ino) { + root_objectid = 0; + first_ino = 0; + continue; + } else { + break; + } + } + + first_ino = defrag->ino + 1; + root_objectid = defrag->root; + + __btrfs_run_defrag_inode(fs_info, defrag); + } + atomic_dec(&fs_info->defrag_running); + + /* + * During unmount, we use the transaction_wait queue to wait for the + * defragger to stop. + */ + wake_up(&fs_info->transaction_wait); + return 0; +} + /* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to @@ -131,3 +451,20 @@ done: return ret; } + +void __cold btrfs_auto_defrag_exit(void) +{ + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int __init btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bad676d2e23f..8500472aa6ef 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -34,329 +34,6 @@ #include "accessors.h" #include "extent-tree.h" -static struct kmem_cache *btrfs_inode_defrag_cachep; -/* - * when auto defrag is enabled we - * queue up these defrag structs to remember which - * inodes need defragging passes - */ -struct inode_defrag { - struct rb_node rb_node; - /* objectid */ - u64 ino; - /* - * transid where the defrag was added, we search for - * extents newer than this - */ - u64 transid; - - /* root objectid */ - u64 root; - - /* - * The extent size threshold for autodefrag. - * - * This value is different for compressed/non-compressed extents, - * thus needs to be passed from higher layer. - * (aka, inode_should_defrag()) - */ - u32 extent_thresh; -}; - -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) -{ - if (defrag1->root > defrag2->root) - return 1; - else if (defrag1->root < defrag2->root) - return -1; - else if (defrag1->ino > defrag2->ino) - return 1; - else if (defrag1->ino < defrag2->ino) - return -1; - else - return 0; -} - -/* pop a record for an inode into the defrag tree. 
The lock - * must be held already - * - * If you're inserting a record for an older transid than an - * existing record, the transid already in the tree is lowered - * - * If an existing record is found the defrag item you - * pass in is freed - */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - int ret; - - p = &fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(defrag, entry); - if (ret < 0) - p = &parent->rb_left; - else if (ret > 0) - p = &parent->rb_right; - else { - /* if we're reinserting an entry for - * an old defrag run, make sure to - * lower the transid of our existing record - */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - entry->extent_thresh = min(defrag->extent_thresh, - entry->extent_thresh); - return -EEXIST; - } - } - set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); - return 0; -} - -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) -{ - if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(fs_info)) - return 0; - - return 1; -} - -/* - * insert a defrag record for this inode if auto defrag is - * enabled - */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode_defrag *defrag; - u64 transid; - int ret; - - if (!__need_auto_defrag(fs_info)) - return 0; - - if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = inode->root->last_trans; - - defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); - if (!defrag) - return -ENOMEM; - - defrag->ino = btrfs_ino(inode); - defrag->transid = transid; - defrag->root = root->root_key.objectid; - defrag->extent_thresh = extent_thresh; - - spin_lock(&fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { - /* - * If we set IN_DEFRAG flag and evict the inode from memory, - * and then re-read this inode, this new inode doesn't have - * IN_DEFRAG flag. At the case, we may find the existed defrag. - */ - ret = __btrfs_add_inode_defrag(inode, defrag); - if (ret) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - spin_unlock(&fs_info->defrag_inodes_lock); - return 0; -} - -/* - * pick the defragable inode that we want, if it doesn't exist, we will get - * the next one. 
- */ -static struct inode_defrag * -btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) -{ - struct inode_defrag *entry = NULL; - struct inode_defrag tmp; - struct rb_node *p; - struct rb_node *parent = NULL; - int ret; - - tmp.ino = ino; - tmp.root = root; - - spin_lock(&fs_info->defrag_inodes_lock); - p = fs_info->defrag_inodes.rb_node; - while (p) { - parent = p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(&tmp, entry); - if (ret < 0) - p = parent->rb_left; - else if (ret > 0) - p = parent->rb_right; - else - goto out; - } - - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; - } -out: - if (entry) - rb_erase(parent, &fs_info->defrag_inodes); - spin_unlock(&fs_info->defrag_inodes_lock); - return entry; -} - -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - struct rb_node *node; - - spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - - cond_resched_lock(&fs_info->defrag_inodes_lock); - - node = rb_first(&fs_info->defrag_inodes); - } - spin_unlock(&fs_info->defrag_inodes_lock); -} - -#define BTRFS_DEFRAG_BATCH 1024 - -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) -{ - struct btrfs_root *inode_root; - struct inode *inode; - struct btrfs_ioctl_defrag_range_args range; - int ret = 0; - u64 cur = 0; - -again: - if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) - goto cleanup; - if (!__need_auto_defrag(fs_info)) - goto cleanup; - - /* get the inode */ - inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); - if (IS_ERR(inode_root)) { - ret = PTR_ERR(inode_root); - goto cleanup; - } - - inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); - btrfs_put_root(inode_root); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto cleanup; - } - - if (cur >= i_size_read(inode)) { - iput(inode); - goto cleanup; - } - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - memset(&range, 0, sizeof(range)); - range.len = (u64)-1; - range.start = cur; - range.extent_thresh = defrag->extent_thresh; - - sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); - sb_end_write(fs_info->sb); - iput(inode); - - if (ret < 0) - goto cleanup; - - cur = max(cur + fs_info->sectorsize, range.start); - goto again; - -cleanup: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return ret; -} - -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - u64 first_ino = 0; - u64 root_objectid = 0; - - atomic_inc(&fs_info->defrag_running); - while (1) { - /* Pause the auto defragger. 
*/ - if (test_bit(BTRFS_FS_STATE_REMOUNTING, - &fs_info->fs_state)) - break; - - if (!__need_auto_defrag(fs_info)) - break; - - /* find an inode to defrag */ - defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, - first_ino); - if (!defrag) { - if (root_objectid || first_ino) { - root_objectid = 0; - first_ino = 0; - continue; - } else { - break; - } - } - - first_ino = defrag->ino + 1; - root_objectid = defrag->root; - - __btrfs_run_defrag_inode(fs_info, defrag); - } - atomic_dec(&fs_info->defrag_running); - - /* - * during unmount, we use the transaction_wait queue to - * wait for the defragger to stop - */ - wake_up(&fs_info->transaction_wait); - return 0; -} - /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ @@ -4153,23 +3830,6 @@ const struct file_operations btrfs_file_operations = { .remap_file_range = btrfs_remap_file_range, }; -void __cold btrfs_auto_defrag_exit(void) -{ - kmem_cache_destroy(btrfs_inode_defrag_cachep); -} - -int __init btrfs_auto_defrag_init(void) -{ - btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_inode_defrag_cachep) - return -ENOMEM; - - return 0; -} - int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) { int ret; -- cgit From a6a01ca61f4949037fa7b94278ab260eab02a289 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:24 -0400 Subject: btrfs: move the file defrag code into defrag.c This is the other big portion of defrag code that has existed in ioctl.c. Move it to its new home in defrag.c. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 903 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.c | 902 ----------------------------------------------------- 2 files changed, 903 insertions(+), 902 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 4c8c03185ce7..2c260eef40d4 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -10,6 +10,9 @@ #include "transaction.h" #include "locking.h" #include "accessors.h" +#include "messages.h" +#include "delalloc-space.h" +#include "subpage.h" static struct kmem_cache *btrfs_inode_defrag_cachep; @@ -452,6 +455,906 @@ done: return ret; } +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. 
+ */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. + */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. 
+ */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + u64 newer_than, bool locked) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; + + /* + * Hopefully we have this extent in the tree already, try without the + * full extent lock. + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, sectorsize); + read_unlock(&em_tree->lock); + + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + + if (!em) { + struct extent_state *cached = NULL; + u64 end = start + sectorsize - 1; + + /* Get the big lock and read metadata off disk. */ + if (!locked) + lock_extent(io_tree, start, end, &cached); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); + if (!locked) + unlock_extent(io_tree, start, end, &cached); + + if (IS_ERR(em)) + return NULL; + } + + return em; +} + +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return fs_info->max_extent_size; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + u32 extent_thresh, u64 newer_than, bool locked) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *next; + bool ret = false; + + /* This is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + /* + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. + */ + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + /* No more em or hole */ + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. + */ + if (next->len >= get_extent_max_capacity(fs_info, em)) + goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + + ret = true; +out: + free_extent_map(next); + return ret; +} + +/* + * Prepare one page to be defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. 
+ * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); + + /* + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. + */ + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } + + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; + + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } + + /* + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. + */ + if (!PageUptodate(page)) { + btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } + } + return page; +} + +struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Collect all valid target extents. + * + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. 
+ * @locked: if the range has already held extent lock + * @target_list: list of targets file extents + */ +static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + bool last_is_target = false; + u64 cur = start; + int ret = 0; + + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; + + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked); + if (!em) + break; + + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; + + /* Skip older extent */ + if (em->generation < newer_than) + goto next; + + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check. + */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (range_len >= extent_thresh) + goto next; + + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(fs_info, em)) + goto next; + + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. 
+ */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + extent_thresh, newer_than, locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ + } + +add: + last_is_target = true; + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. + */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; + } + /* Fall through to allocate a new entry */ + } + + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; + } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); + +next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } + return ret; +} + +#define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + +/* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. 
+ * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; + + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); + + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); + } + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); + + return ret; +} + +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) +{ + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = last_index - start_index + 1; + int ret = 0; + int i; + + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + + /* Lock the pages range */ + lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); + /* + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. 
+ */
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list, last_scanned_ret);
+ if (ret < 0)
+ goto unlock_extent;
+
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+ kfree(pages);
+ return ret;
+}
+
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
+
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list, NULL);
+ if (ret < 0)
+ goto out;
+
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
+
+ /* Reached or beyond the limit */
+ if (max_sectors && *sectors_defragged >= max_sectors) {
+ ret = 1;
+ break;
+ }
+
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
+
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len >>
+ inode->root->fs_info->sectorsize_bits;
+ }
+out:
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
+ return ret;
+}
+
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NULL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where the next defrag should start.
+ * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early without
+ * defragging all of the range).
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ unsigned long sectors_defragged = 0;
+ u64 isize = i_size_read(inode);
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool ra_allocated = false;
+ int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
+ u32 extent_thresh = range->extent_thresh;
+ pgoff_t start_index;
+
+ if (isize == 0)
+ return 0;
+
+ if (range->start >= isize)
+ return -EINVAL;
+
+ if (do_compress) {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
+
+ if (extent_thresh == 0)
+ extent_thresh = SZ_256K;
+
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len);
+ } else {
+ /* Defrag until file end */
+ last_byte = isize;
+ }
+
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
+ /*
+ * If we were not given a ra, allocate a readahead context. As
+ * readahead is just an optimization, defrag will work without it so
+ * we don't error out.
+ */
+ if (!ra) {
+ ra_allocated = true;
+ ra = kzalloc(sizeof(*ra), GFP_KERNEL);
+ if (ra)
+ file_ra_state_init(ra, inode->i_mapping);
+ }
+
+ /*
+ * Make writeback start from the beginning of the range, so that the
+ * defrag range can be written sequentially.
+ */
+ start_index = cur >> PAGE_SHIFT;
+ if (start_index < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = start_index;
+
+ while (cur < last_byte) {
+ const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
+ u64 cluster_end;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ /* We want the cluster end at a page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
+
+ btrfs_inode_lock(inode, 0);
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ btrfs_inode_unlock(inode, 0);
+ break;
+ }
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(inode, 0);
+ break;
+ }
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
+
+ if (sectors_defragged > prev_sectors_defragged)
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+
+ btrfs_inode_unlock(inode, 0);
+ if (ret < 0)
+ break;
+ cur = max(cluster_end + 1, last_scanned);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ cond_resched();
+ }
+
+ if (ra_allocated)
+ kfree(ra);
+ /*
+ * Update range.start for autodefrag, this will indicate where to
+ * start in the next run.
+ */
+ range->start = cur;
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for the compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
+ }
+ if (do_compress) {
+ btrfs_inode_lock(inode, 0);
+ BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(inode, 0);
+ }
+ return ret;
+}
+
 void __cold btrfs_auto_defrag_exit(void)
 {
 kmem_cache_destroy(btrfs_inode_defrag_cachep);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 96858240588d..42453dd2ed31 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1039,908 +1039,6 @@ out:
 return ret;
 }
 
-/*
- * Defrag specific helper to get an extent map.
- *
- * Differences between this and btrfs_get_extent() are:
- *
- * - No extent_map will be added to inode->extent_tree
- * To reduce memory usage in the long run.
- *
- * - Extra optimization to skip file extents older than @newer_than
- * By using btrfs_search_forward() we can skip entire file ranges that
- * have extents created in past transactions, because btrfs_search_forward()
- * will not visit leaves and nodes with a generation smaller than the given
- * minimal generation threshold (@newer_than).
- *
- * Return a valid em if we find a file extent matching the requirement.
- * Return NULL if we cannot find a file extent matching the requirement.
- *
- * Return ERR_PTR() for error.
- */
-static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
- u64 start, u64 newer_than)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_file_extent_item *fi;
- struct btrfs_path path = { 0 };
- struct extent_map *em;
- struct btrfs_key key;
- u64 ino = btrfs_ino(inode);
- int ret;
-
- em = alloc_extent_map();
- if (!em) {
- ret = -ENOMEM;
- goto err;
- }
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
-
- if (newer_than) {
- ret = btrfs_search_forward(root, &key, &path, newer_than);
- if (ret < 0)
- goto err;
- /* Can't find anything newer */
- if (ret > 0)
- goto not_found;
- } else {
- ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
- if (ret < 0)
- goto err;
- }
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
- /*
- * If btrfs_search_slot() makes the path point beyond nritems,
- * we should not have an empty leaf, as this inode must at
- * least have its INODE_ITEM.
- */
- ASSERT(btrfs_header_nritems(path.nodes[0]));
- path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
- }
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- /* Perfect match, no need to go one slot back */
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
- key.offset == start)
- goto iterate;
-
- /* We didn't find a perfect match, need to go one slot back */
- if (path.slots[0] > 0) {
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path.slots[0]--;
- }
-
-iterate:
- /* Iterate through the path to find a file extent covering @start */
- while (true) {
- u64 extent_end;
-
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
- goto next;
-
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
-
- /*
- * We may go one slot back to INODE_REF/XATTR item, then
- * need to go forward until we reach an EXTENT_DATA.
- * But we should still have the correct ino as key.objectid.
- */
- if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
- goto next;
-
- /* It's beyond our target range, definitely no extent found */
- if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
- goto not_found;
-
- /*
- * | |<- File extent ->|
- * \- start
- *
- * This means there is a hole between start and key.offset.
- */
- if (key.offset > start) {
- em->start = start;
- em->orig_start = start;
- em->block_start = EXTENT_MAP_HOLE;
- em->len = key.offset - start;
- break;
- }
-
- fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
- struct btrfs_file_extent_item);
- extent_end = btrfs_file_extent_end(&path);
-
- /*
- * |<- file extent ->| |
- * \- start
- *
- * We haven't reached start, search next slot.
- */
- if (extent_end <= start)
- goto next;
-
- /* Now this extent covers @start, convert it to em */
- btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
- break;
-next:
- ret = btrfs_next_item(root, &path);
- if (ret < 0)
- goto err;
- if (ret > 0)
- goto not_found;
- }
- btrfs_release_path(&path);
- return em;
-
-not_found:
- btrfs_release_path(&path);
- free_extent_map(em);
- return NULL;
-
-err:
- btrfs_release_path(&path);
- free_extent_map(em);
- return ERR_PTR(ret);
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
- u64 newer_than, bool locked)
-{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em;
- const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
-
- /*
- * hopefully we have this extent in the tree already, try without
- * the full extent lock
- */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, sectorsize);
- read_unlock(&em_tree->lock);
-
- /*
- * We can get a merged extent, in that case, we need to re-search
- * the tree to get the original em for defrag.
- *
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
- */
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
- newer_than && em->generation >= newer_than) {
- free_extent_map(em);
- em = NULL;
- }
-
- if (!em) {
- struct extent_state *cached = NULL;
- u64 end = start + sectorsize - 1;
-
- /* get the big lock and read metadata off disk */
- if (!locked)
- lock_extent(io_tree, start, end, &cached);
- em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
- if (!locked)
- unlock_extent(io_tree, start, end, &cached);
-
- if (IS_ERR(em))
- return NULL;
- }
-
- return em;
-}
-
-static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
- const struct extent_map *em)
-{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- return BTRFS_MAX_COMPRESSED;
- return fs_info->max_extent_size;
-}
-
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
- u32 extent_thresh, u64 newer_than, bool locked)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map *next;
- bool ret = false;
-
- /* this is the last extent */
- if (em->start + em->len >= i_size_read(inode))
- return false;
-
- /*
- * Here we need to pass @newer_than when checking the next extent, or
- * we will hit a case where we mark the current extent for defrag, but
- * the next one will not be a target.
- * This will just cause extra IO without really reducing the fragments.
- */
- next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
- /* No more em or hole */
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
- goto out;
- /*
- * If the next extent is at its max capacity, defragging current extent
- * makes no sense, as the total number of extents won't change.
- */
- if (next->len >= get_extent_max_capacity(fs_info, em))
- goto out;
- /* Skip older extent */
- if (next->generation < newer_than)
- goto out;
- /* Also check extent size */
- if (next->len >= extent_thresh)
- goto out;
-
- ret = true;
-out:
- free_extent_map(next);
- return ret;
-}
-
-/*
- * Prepare one page to be defragged.
- *
- * This will ensure:
- *
- * - Returned page is locked and has been set up properly.
- * - No ordered extent exists in the page.
- * - The page is uptodate.
- *
- * NOTE: Caller should also wait for page writeback after the cluster is
- * prepared, here we don't do writeback wait for each page.
- */
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
- pgoff_t index)
-{
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
- struct extent_state *cached_state = NULL;
- struct page *page;
- int ret;
-
-again:
- page = find_or_create_page(mapping, index, mask);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
- * Filesystem transparent huge pages are typically only used for
- * executables that explicitly enable them, so this isn't very
- * restrictive.
- */
- if (PageCompound(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-ETXTBSY);
- }
-
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(ret);
- }
-
- /* Wait for any existing ordered extent in the range */
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * We unlocked the page above, so we need to check if it was
- * released or not.
- */
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- }
-
- /*
- * Now the page range has no ordered extent any more. Read the page to
- * make it uptodate.
- */
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- }
- return page;
-}
-
-struct defrag_target_range {
- struct list_head list;
- u64 start;
- u64 len;
-};
-
-/*
- * Collect all valid target extents.
- *
- * @start: file offset to lookup
- * @len: length to lookup
- * @extent_thresh: file extent size threshold, any extent size >= this value
- * will be ignored
- * @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
- * if true, @extent_thresh will be ignored and all regular
- * file extents meeting @newer_than will be targets.
- * @locked: if the extent lock is already held on the range
- * @target_list: list of target file extents
- */
-static int defrag_collect_targets(struct btrfs_inode *inode,
- u64 start, u64 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- bool locked, struct list_head *target_list,
- u64 *last_scanned_ret)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- bool last_is_target = false;
- u64 cur = start;
- int ret = 0;
-
- while (cur < start + len) {
- struct extent_map *em;
- struct defrag_target_range *new;
- bool next_mergeable = true;
- u64 range_len;
-
- last_is_target = false;
- em = defrag_lookup_extent(&inode->vfs_inode, cur,
- newer_than, locked);
- if (!em)
- break;
-
- /*
- * If the file extent is an inlined one, we may still want to
- * defrag it (fallthrough) if it will cause a regular extent.
- * This is for users who want to convert inline extents to
- * regular ones through the max_inline= mount option.
- */
- if (em->block_start == EXTENT_MAP_INLINE &&
- em->len <= inode->root->fs_info->max_inline)
- goto next;
-
- /* Skip hole/delalloc/preallocated extents */
- if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- goto next;
-
- /* Skip older extent */
- if (em->generation < newer_than)
- goto next;
-
- /* This em is under writeback, no need to defrag */
- if (em->generation == (u64)-1)
- goto next;
-
- /*
- * Our start offset might be in the middle of an existing extent
- * map, so take that into account.
- */
- range_len = em->len - (cur - em->start);
- /*
- * If this range of the extent map is already flagged for delalloc,
- * skip it, because:
- *
- * 1) We could deadlock later, when trying to reserve space for
- * delalloc, because in case we can't immediately reserve space
- * the flusher can start delalloc and wait for the respective
- * ordered extents to complete. The deadlock would happen
- * because we do the space reservation while holding the range
- * locked, and starting writeback, or finishing an ordered
- * extent, requires locking the range;
- *
- * 2) If there's delalloc there, it means there are dirty pages for
- * which writeback has not started yet (we clean the delalloc
- * flag when starting writeback and after creating an ordered
- * extent). If we mark pages in an adjacent range for defrag,
- * then we will have a larger contiguous range for delalloc,
- * very likely resulting in a larger extent after writeback is
- * triggered (except in a case of free space fragmentation).
- */
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
- goto next;
-
- /*
- * For the do_compress case, we want to compress all valid file
- * extents, thus no @extent_thresh or mergeable check.
- */
- if (do_compress)
- goto add;
-
- /* Skip too large extent */
- if (range_len >= extent_thresh)
- goto next;
-
- /*
- * Skip extents already at their max capacity, this is mostly for
- * compressed extents, whose max cap is only 128K.
- */
- if (em->len >= get_extent_max_capacity(fs_info, em))
- goto next;
-
- /*
- * Normally there are no more extents after an inline one, thus
- * @next_mergeable will normally be false and the extent will not
- * be defragged.
- * So if an inline extent passed all the above checks, just add it
- * for defrag, so it can be converted to a regular extent.
- */
- if (em->block_start == EXTENT_MAP_INLINE)
- goto add;
-
- next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
- extent_thresh, newer_than, locked);
- if (!next_mergeable) {
- struct defrag_target_range *last;
-
- /* Empty target list, no way to merge with last entry */
- if (list_empty(target_list))
- goto next;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- /* Not mergeable with last entry */
- if (last->start + last->len != cur)
- goto next;
-
- /* Mergeable, fall through to add it to @target_list. */
- }
-
-add:
- last_is_target = true;
- range_len = min(extent_map_end(em), start + len) - cur;
- /*
- * This one is a good target, check if it can be merged into
- * the last range of the target list.
- */
- if (!list_empty(target_list)) {
- struct defrag_target_range *last;
-
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- ASSERT(last->start + last->len <= cur);
- if (last->start + last->len == cur) {
- /* Mergeable, enlarge the last entry */
- last->len += range_len;
- goto next;
- }
- /* Fall through to allocate a new entry */
- }
-
- /* Allocate new defrag_target_range */
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new) {
- free_extent_map(em);
- ret = -ENOMEM;
- break;
- }
- new->start = cur;
- new->len = range_len;
- list_add_tail(&new->list, target_list);
-
-next:
- cur = extent_map_end(em);
- free_extent_map(em);
- }
- if (ret < 0) {
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
-
- list_for_each_entry_safe(entry, tmp, target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- }
- if (!ret && last_scanned_ret) {
- /*
- * If the last extent is not a target, the caller can skip to
- * the end of that extent.
- * Otherwise, we can only go to the end of the specified range.
- */
- if (!last_is_target)
- *last_scanned_ret = max(cur, *last_scanned_ret);
- else
- *last_scanned_ret = max(start + len, *last_scanned_ret);
- }
- return ret;
-}
-
-#define CLUSTER_SIZE (SZ_256K)
-static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
-
-/*
- * Defrag one contiguous target range.
- *
- * @inode: target inode
- * @target: target range to defrag
- * @pages: locked pages covering the defrag range
- * @nr_pages: number of locked pages
- *
- * Caller should ensure:
- *
- * - Pages are prepared
- * Pages should be locked, no ordered extent in the pages range,
- * no writeback.
- * - * - Extent bits are locked - */ -static int defrag_one_locked_target(struct btrfs_inode *inode, - struct defrag_target_range *target, - struct page **pages, int nr_pages, - struct extent_state **cached_state) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_changeset *data_reserved = NULL; - const u64 start = target->start; - const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); - int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); - - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); - if (ret < 0) - return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, cached_state); - set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); - } - btrfs_delalloc_release_extents(inode, len); - extent_changeset_free(data_reserved); - - return ret; -} - -static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress, - u64 *last_scanned_ret) -{ - struct extent_state *cached_state = NULL; - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - LIST_HEAD(target_list); - struct page **pages; - const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; - int ret = 0; - int i; - - ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); - ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) - return -ENOMEM; - - /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; - } - } - for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); - - /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); - /* - * Now we have a consistent view about the extent map, re-check - * which range really needs to be defragged. - * - * And this time we have extent locked already, pass @locked = true - * so that we won't relock the extent range and cause deadlock. 
- */
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, true,
- &target_list, last_scanned_ret);
- if (ret < 0)
- goto unlock_extent;
-
- list_for_each_entry(entry, &target_list, list) {
- ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
- &cached_state);
- if (ret < 0)
- break;
- }
-
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
-unlock_extent:
- unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
-free_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
- }
- kfree(pages);
- return ret;
-}
-
-static int defrag_one_cluster(struct btrfs_inode *inode,
- struct file_ra_state *ra,
- u64 start, u32 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- unsigned long *sectors_defragged,
- unsigned long max_sectors,
- u64 *last_scanned_ret)
-{
- const u32 sectorsize = inode->root->fs_info->sectorsize;
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
- LIST_HEAD(target_list);
- int ret;
-
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, false,
- &target_list, NULL);
- if (ret < 0)
- goto out;
-
- list_for_each_entry(entry, &target_list, list) {
- u32 range_len = entry->len;
-
- /* Reached or beyond the limit */
- if (max_sectors && *sectors_defragged >= max_sectors) {
- ret = 1;
- break;
- }
-
- if (max_sectors)
- range_len = min_t(u32, range_len,
- (max_sectors - *sectors_defragged) * sectorsize);
-
- /*
- * If defrag_one_range() has updated last_scanned_ret,
- * our range may already be invalid (e.g. hole punched).
- * Skip if our range is before last_scanned_ret, as there is
- * no need to defrag the range anymore.
- */
- if (entry->start + range_len <= *last_scanned_ret)
- continue;
-
- if (ra)
- page_cache_sync_readahead(inode->vfs_inode.i_mapping,
- ra, NULL, entry->start >> PAGE_SHIFT,
- ((entry->start + range_len - 1) >> PAGE_SHIFT) -
- (entry->start >> PAGE_SHIFT) + 1);
- /*
- * Here we may not defrag any range if holes are punched before
- * we locked the pages.
- * But that's fine, it only affects the @sectors_defragged
- * accounting.
- */
- ret = defrag_one_range(inode, entry->start, range_len,
- extent_thresh, newer_than, do_compress,
- last_scanned_ret);
- if (ret < 0)
- break;
- *sectors_defragged += range_len >>
- inode->root->fs_info->sectorsize_bits;
- }
-out:
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- if (ret >= 0)
- *last_scanned_ret = max(*last_scanned_ret, start + len);
- return ret;
-}
-
-/*
- * Entry point to file defragmentation.
- *
- * @inode: inode to be defragged
- * @ra: readahead state (can be NULL)
- * @range: defrag options including range and flags
- * @newer_than: minimum transid to defrag
- * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
- * will be defragged.
- *
- * Return <0 for error.
- * Return >=0 for the number of sectors defragged, and range->start will be updated
- * to indicate the file offset where the next defrag should start.
- * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early without
- * defragging all of the range).
- */
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
- struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_to_defrag)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
- u64 cur;
- u64 last_byte;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
- bool ra_allocated = false;
- int compress_type = BTRFS_COMPRESS_ZLIB;
- int ret = 0;
- u32 extent_thresh = range->extent_thresh;
- pgoff_t start_index;
-
- if (isize == 0)
- return 0;
-
- if (range->start >= isize)
- return -EINVAL;
-
- if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
- }
-
- if (extent_thresh == 0)
- extent_thresh = SZ_256K;
-
- if (range->start + range->len > range->start) {
- /* Got a specific range */
- last_byte = min(isize, range->start + range->len);
- } else {
- /* Defrag until file end */
- last_byte = isize;
- }
-
- /* Align the range */
- cur = round_down(range->start, fs_info->sectorsize);
- last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
-
- /*
- * If we were not given a ra, allocate a readahead context. As
- * readahead is just an optimization, defrag will work without it so
- * we don't error out.
- */
- if (!ra) {
- ra_allocated = true;
- ra = kzalloc(sizeof(*ra), GFP_KERNEL);
- if (ra)
- file_ra_state_init(ra, inode->i_mapping);
- }
-
- /*
- * Make writeback start from the beginning of the range, so that the
- * defrag range can be written sequentially.
- */
- start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
-
- while (cur < last_byte) {
- const unsigned long prev_sectors_defragged = sectors_defragged;
- u64 last_scanned = cur;
- u64 cluster_end;
-
- if (btrfs_defrag_cancelled(fs_info)) {
- ret = -EAGAIN;
- break;
- }
-
- /* We want the cluster end at a page boundary when possible */
- cluster_end = (((cur >> PAGE_SHIFT) +
- (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
- cluster_end = min(cluster_end, last_byte);
-
- btrfs_inode_lock(inode, 0);
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
- cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
- max_to_defrag, &last_scanned);
-
- if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
-
- btrfs_inode_unlock(inode, 0);
- if (ret < 0)
- break;
- cur = max(cluster_end + 1, last_scanned);
- if (ret > 0) {
- ret = 0;
- break;
- }
- cond_resched();
- }
-
- if (ra_allocated)
- kfree(ra);
- /*
- * Update range.start for autodefrag, this will indicate where to
- * start in the next run.
- */
- range->start = cur;
- if (sectors_defragged) {
- /*
- * We have defragged some sectors, for the compression case they
- * need to be written back immediately.
- */ - if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - else if (range->compress_type == BTRFS_COMPRESS_ZSTD) - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - ret = sectors_defragged; - } - if (do_compress) { - btrfs_inode_lock(inode, 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(inode, 0); - } - return ret; -} - /* * Try to start exclusive operation @type or cancel it if it's running. * -- cgit From 59b818e064ab9051cd344b82420307e772d6bca7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:25 -0400 Subject: btrfs: move defrag related prototypes to their own header Now that the defrag code is all in one file, create a defrag.h and move all the defrag related prototypes and helper out of ctree.h and into defrag.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 18 ------------------ fs/btrfs/defrag.c | 1 + fs/btrfs/defrag.h | 22 ++++++++++++++++++++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + 8 files changed, 28 insertions(+), 18 deletions(-) create mode 100644 fs/btrfs/defrag.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9267b117860..689220637d00 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -773,19 +773,10 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); /* file.c */ -int __init btrfs_auto_defrag_init(void); -void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, @@ -811,10 +802,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, u64 *delalloc_start_ret, u64 *delalloc_end_ret); -/* tree-defrag.c */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); @@ -945,11 +932,6 @@ static inline int is_fstree(u64 rootid) return 0; } -static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) -{ - return signal_pending(current); -} - /* verity.c */ #ifdef CONFIG_FS_VERITY diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 2c260eef40d4..291263f93e47 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -13,6 +13,7 @@ #include "messages.h" #include "delalloc-space.h" #include 
"subpage.h" +#include "defrag.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h new file mode 100644 index 000000000000..5305f2283b5e --- /dev/null +++ b/fs/btrfs/defrag.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DEFRAG_H +#define BTRFS_DEFRAG_H + +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag); +int __init btrfs_auto_defrag_init(void); +void __cold btrfs_auto_defrag_exit(void); +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + +#endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ebc9baab0e47..2f7dd79a5a35 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -47,6 +47,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b9857e30a08..97f28e37ec7c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -59,6 +59,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 42453dd2ed31..f91dbd1dc6d1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -54,6 +54,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4030dfeb2f2..f105d360d6c9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -52,6 +52,7 @@ #include "raid56.h" #include "fs.h" #include "accessors.h" +#include "defrag.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 82b2e2ec90cf..99d3f2a66227 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -28,6 +28,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" static struct kmem_cache *btrfs_trans_handle_cachep; -- cgit From f2b39277b87dbb26f15e2e3b667df25dad3dd2ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:26 -0400 Subject: btrfs: move dir-item prototypes into dir-item.h Move these prototypes out of ctree.h and into their own header file. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 38 -------------------------------------- fs/btrfs/dir-item.c | 1 + fs/btrfs/dir-item.h | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/xattr.c | 1 + 10 files changed, 50 insertions(+), 38 deletions(-) create mode 100644 fs/btrfs/dir-item.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 689220637d00..a9666dc95105 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -689,44 +689,6 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); -/* dir-item.c */ -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct fscrypt_str *name); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct fscrypt_str *name, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index); -struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const struct fscrypt_str *name, int mod); -struct btrfs_dir_item * -btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - u64 index, const struct fscrypt_str *name, int mod); -struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const struct fscrypt_str *name); -int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_dir_item *di); -int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - const char *name, u16 name_len, - const void *data, u16 data_len); -struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, u16 name_len, - int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - const char *name, - int name_len); - /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index ca69fb35a2cc..082eb0e19598 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "transaction.h" #include "accessors.h" +#include "dir-item.h" /* * insert a name into a directory, doing overflow properly if there is a hash diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h new file mode 100644 index 000000000000..aab4b7cc7fa0 --- /dev/null +++ b/fs/btrfs/dir-item.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DIR_ITEM_H +#define BTRFS_DIR_ITEM_H + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const struct fscrypt_str *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const struct fscrypt_str *name, int mod); 
+struct btrfs_dir_item *btrfs_lookup_dir_index_item( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 index, const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const struct fscrypt_str *name); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + const char *name, + int name_len); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97f28e37ec7c..a3130e7c803d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,6 +60,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f91dbd1dc6d1..41599b068026 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -55,6 +55,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 2ece3a030a66..0d3f25d4f147 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -28,6 +28,7 @@ #include "xattr.h" #include "print-tree.h" #include "accessors.h" +#include "dir-item.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f105d360d6c9..9756b0cda626 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -53,6 +53,7 @@ #include "fs.h" #include "accessors.h" #include "defrag.h" +#include "dir-item.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 99d3f2a66227..a0b7702d980f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -29,6 +29,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e358df3668e7..c168db7519e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -25,6 +25,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "dir-item.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fcf2d5f7e198..0ed4b119a7ca 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -22,6 +22,7 @@ #include "props.h" #include "locking.h" #include "accessors.h" +#include "dir-item.h" int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) -- cgit From 7c8ede16280586c72c36af6604985d714b84a32c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:27 -0400 Subject: btrfs: move file-item prototypes into their own header Move these prototypes out of ctree.h and into file-item.h. 
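
For illustration, a hypothetical consumer of the new header: example_file_extent_range() is invented, and only the btrfs_file_extent_end() prototype comes from the moved declarations.

#include "ctree.h"
#include "file-item.h"

/*
 * Hypothetical helper: with @path positioned at a BTRFS_EXTENT_DATA_KEY
 * item, return the file range [@start, @end) that the item covers.
 */
static void example_file_extent_range(const struct btrfs_path *path,
				      const struct btrfs_key *key,
				      u64 *start, u64 *end)
{
	ASSERT(key->type == BTRFS_EXTENT_DATA_KEY);
	*start = key->offset;
	/* btrfs_file_extent_end() also handles inline extents */
	*end = btrfs_file_extent_end(path);
}
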
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/ctree.h | 31 ------------------------------- fs/btrfs/defrag.c | 1 + fs/btrfs/delayed-inode.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file-item.h | 35 +++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/tree-log.c | 1 + 17 files changed, 50 insertions(+), 31 deletions(-) create mode 100644 fs/btrfs/file-item.h (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c0615af0434f..61828f8375e6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -34,6 +34,7 @@ #include "extent_map.h" #include "subpage.h" #include "zoned.h" +#include "file-item.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9666dc95105..6bfea55c82a0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -695,37 +695,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* file-item.c */ -int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 pos, - u64 num_bytes); -int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - u64 bytenr, int mod); -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); -void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, - const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, - const bool new_inline, - struct extent_map *em); -int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); -u64 btrfs_file_extent_end(const struct btrfs_path *path); - /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 291263f93e47..ed82085acea3 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -14,6 +14,7 @@ #include "delalloc-space.h" #include "subpage.h" #include "defrag.h" +#include "file-item.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f93d2695e423..c024f97de9e0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -18,6 +18,7 @@ #include "inode-item.h" #include "space-info.h" #include "accessors.h" +#include "file-item.h" #define 
BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 510be00da406..940d4fe23cfb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -40,6 +40,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "file-item.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 545d9e1f0f83..ea31a326ae93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -32,6 +32,7 @@ #include "compression.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 403a857d230e..20d88cd0b602 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -19,6 +19,7 @@ #include "compression.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h new file mode 100644 index 000000000000..51cd3ab5948c --- /dev/null +++ b/fs/btrfs/file-item.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_ITEM_H +#define BTRFS_FILE_ITEM_H + +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + u64 offset, bool one_ordered); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); + +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8500472aa6ef..9c100198dd26 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -33,6 +33,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "file-item.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 599b41523479..bc1b9aa164ec 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -27,6 +27,7 @@ #include "subpage.h" #include "inode-item.h" #include "accessors.h" +#include "file-item.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 724507ce7a7d..b65c45b5d681 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -13,6 +13,7 @@ #include "space-info.h" #include "accessors.h" #include "extent-tree.h" +#include "file-item.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3130e7c803d..7231f6b69096 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -61,6 +61,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "file-item.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9ba7914c7169..204bb884ab91 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -12,6 +12,7 @@ #include "transaction.h" #include "subpage.h" #include "accessors.h" +#include "file-item.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e345cc71da15..e86364bdac8e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -32,6 +32,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "file-item.h" /* * Relocation overview diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3aed760e1a80..e46e6c4d4bf9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -23,6 +23,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" /* * This is only the first step towards a full-features scrub. It reads all diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0d3f25d4f147..4cc9e855a769 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -29,6 +29,7 @@ #include "print-tree.h" #include "accessors.h" #include "dir-item.h" +#include "file-item.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c168db7519e7..7be540fb5c4a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -26,6 +26,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "dir-item.h" +#include "file-item.h" #define MAX_CONFLICT_INODES 10 -- cgit From c7a03b524d30382ed45883cd27906678e6a0dada Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:28 -0400 Subject: btrfs: move uuid tree prototypes to uuid-tree.h Move these out of ctree.h into uuid-tree.h to cut down on the code in ctree.h. 
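
A hypothetical sketch of a caller of the moved API: example_track_received_uuid() is invented; the btrfs_uuid_tree_add() prototype matches the new header below, and BTRFS_UUID_KEY_RECEIVED_SUBVOL is the UAPI key type used for received subvolume UUIDs.

#include "ctree.h"
#include "uuid-tree.h"

/* Hypothetical caller: record a received subvolume UUID in the uuid tree. */
static int example_track_received_uuid(struct btrfs_trans_handle *trans,
				       u8 *uuid, u64 subvol_id)
{
	return btrfs_uuid_tree_add(trans, uuid,
				   BTRFS_UUID_KEY_RECEIVED_SUBVOL, subvol_id);
}
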
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ------- fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/uuid-tree.h | 12 ++++++++++++ fs/btrfs/volumes.c | 1 + 8 files changed, 18 insertions(+), 7 deletions(-) create mode 100644 fs/btrfs/uuid-tree.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6bfea55c82a0..910a23e7cd8f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -682,13 +682,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *node, struct extent_buffer *parent); -/* uuid-tree.c */ -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); - /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f7dd79a5a35..e8082e3c5bdc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -48,6 +48,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "uuid-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7231f6b69096..061fa0923772 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,6 +62,7 @@ #include "defrag.h" #include "dir-item.h" #include "file-item.h" +#include "uuid-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41599b068026..c2439aebebd4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -56,6 +56,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "uuid-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a0b7702d980f..c6fe2ab8716b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,6 +30,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "uuid-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 70304b89f31f..7c7001f42b14 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -12,6 +12,7 @@ #include "print-tree.h" #include "fs.h" #include "accessors.h" +#include "uuid-tree.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) { diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h new file mode 100644 index 000000000000..5350c87fe2ca --- /dev/null +++ b/fs/btrfs/uuid-tree.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_UUID_TREE_H +#define BTRFS_UUID_TREE_H + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f61bb79d4a7e..55164b2dba2a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "uuid-tree.h" static struct bio_set btrfs_bioset; -- cgit From 7572dec8f5223186ed0fa7f6da47dba98651cbf4 
Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:29 -0400 Subject: btrfs: move ioctl prototypes into ioctl.h Move these out of ctree.h into ioctl.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 12 ------------ fs/btrfs/file.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/ioctl.h | 17 +++++++++++++++++ fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + 10 files changed, 25 insertions(+), 12 deletions(-) create mode 100644 fs/btrfs/ioctl.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 910a23e7cd8f..87930337b301 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -688,18 +688,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* ioctl.c */ -long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int btrfs_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa); -int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(u8 *uuid); -void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_balance_args *bargs); - /* file.c */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); extern const struct file_operations btrfs_file_operations; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c100198dd26..22063340a4c2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -34,6 +34,7 @@ #include "accessors.h" #include "extent-tree.h" #include "file-item.h" +#include "ioctl.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 061fa0923772..7111eec812dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -63,6 +63,7 @@ #include "dir-item.h" #include "file-item.h" #include "uuid-tree.h" +#include "ioctl.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c2439aebebd4..4c5c2f890adc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -57,6 +57,7 @@ #include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" +#include "ioctl.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 000000000000..8a855d5ac2fa --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_IOCTL_H +#define BTRFS_IOCTL_H + +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); +int btrfs_ioctl_get_supported_features(void __user *arg); +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +int __pure btrfs_is_empty_uuid(u8 *uuid); +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs); + +#endif diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4cc9e855a769..e9b7deccc5fe 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -30,6 +30,7 @@ #include "accessors.h" #include "dir-item.h" #include "file-item.h" +#include "ioctl.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9756b0cda626..2cd05246bcd3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,7 @@ #include "accessors.h" #include "defrag.h" #include "dir-item.h" +#include "ioctl.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c6fe2ab8716b..bbc3ad3e297c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,6 +31,7 @@ #include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" +#include "ioctl.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index d02fa354fc2b..00ba5143a17d 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -18,6 +18,7 @@ #include "locking.h" #include "fs.h" #include "accessors.h" +#include "ioctl.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 55164b2dba2a..823de2f29289 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -36,6 +36,7 @@ #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" static struct bio_set btrfs_bioset; -- cgit From af142b6f44d36c4d0e1e53acbedbc30a588c58de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:30 -0400 Subject: btrfs: move file prototypes to file.h Move these out of ctree.h into file.h to cut down on code in ctree.h. 
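A consumer-side sketch tying the last two moves together: the entry points declared in the new ioctl.h get wired into an operations table of the kind that file.h declares extern. The struct below is illustrative only; the real table is btrfs_file_operations in file.c:

static const struct file_operations example_fops = {
	/* Prototypes now come from ioctl.h rather than ctree.h. */
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
};

(btrfs_fileattr_get/btrfs_fileattr_set hook into struct inode_operations rather than file_operations, which is why they are not shown here.)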
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 26 -------------------------- fs/btrfs/extent_io.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/file.h | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/ordered-data.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/tree-log.c | 1 + 10 files changed, 40 insertions(+), 26 deletions(-) create mode 100644 fs/btrfs/file.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 87930337b301..3a46b5b688e3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -688,32 +688,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* file.c */ -int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_drop_extents_args *args); -int btrfs_replace_file_extents(struct btrfs_inode *inode, - struct btrfs_path *path, const u64 start, - const u64 end, - struct btrfs_replace_extent_info *extent_info, - struct btrfs_trans_handle **trans_out); -int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u64 start, u64 end); -ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); -int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait); -void btrfs_check_nocow_unlock(struct btrfs_inode *inode); -bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, - u64 *delalloc_start_ret, u64 *delalloc_end_ret); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ea31a326ae93..7fe637408e98 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -33,6 +33,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "file.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 22063340a4c2..b94dc4b2c486 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -35,6 +35,7 @@ #include "extent-tree.h" #include "file-item.h" #include "ioctl.h" +#include "file.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h new file mode 100644 index 000000000000..f3d794e33d46 --- /dev/null +++ b/fs/btrfs/file.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_H +#define BTRFS_FILE_H + +extern const struct file_operations btrfs_file_operations; + +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); +int btrfs_release_file(struct inode *inode, struct file *file); +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, + struct extent_state **cached, bool noreserve); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait); +void btrfs_check_nocow_unlock(struct btrfs_inode *inode); +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret); + +#endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bc1b9aa164ec..aef075b63188 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -28,6 +28,7 @@ #include "inode-item.h" #include "accessors.h" #include "file-item.h" +#include "file.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7111eec812dc..3fe3301b88b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,6 +64,7 @@ #include "file-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "file.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4c5c2f890adc..dba1f07b4194 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -58,6 +58,7 @@ #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "file.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1c36407803ca..2aea2a17ed95 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -18,6 +18,7 @@ #include "delalloc-space.h" #include "qgroup.h" #include "subpage.h" +#include "file.h" static struct kmem_cache *btrfs_ordered_extent_cache; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 204bb884ab91..3c962b5d8dbd 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -13,6 +13,7 @@ #include "subpage.h" #include "accessors.h" #include "file-item.h" +#include "file.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7be540fb5c4a..1c505713511c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "root-tree.h" #include "dir-item.h" #include "file-item.h" +#include "file.h" #define MAX_CONFLICT_INODES 10 -- cgit From b538a271ae9bd07e926b1e3bbfcd1aebf63e5860 Mon Sep 17 00:00:00 2001 From: Josef Bacik 
Date: Wed, 26 Oct 2022 15:08:31 -0400 Subject: btrfs: move the 32bit warn defines into messages.h The code for these functions are in messages.c, move the defines and prototypes to messages.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 13 ------------- fs/btrfs/messages.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3a46b5b688e3..ae2af0aeebc4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -695,19 +695,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); -#if BITS_PER_LONG == 32 -#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) -/* - * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical - * addresses of extents. - * - * For 4K page size it's about 10T, for 64K it's 160T. - */ -#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) -void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); -void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); -#endif - /* * Get the correct offset inside the page of extent buffer. * diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 3772358f8d30..5ac410e313e4 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -229,4 +229,17 @@ do { \ BUG(); \ } while (0) +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. + * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + #endif -- cgit From cc68414c6123fcc436c96d8920f48120b0ca2613 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:32 -0400 Subject: btrfs: move the snapshot drop related prototypes to extent-tree.h These belong in extent-tree.h, they were missed because they were not grouped with the other extent-tree.c prototypes. 
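As a side note on the 32-bit limit macros relocated above, the "about 10T, for 64K it's 160T" figures follow directly from the definitions; a worked check, assuming the usual PAGE_SHIFT values:

/*
 * BTRFS_32BIT_MAX_FILE_SIZE = ((u64)ULONG_MAX + 1) << PAGE_SHIFT
 *   4K pages (PAGE_SHIFT = 12): 2^32 * 2^12 = 2^44 bytes =  16 TiB
 *  64K pages (PAGE_SHIFT = 16): 2^32 * 2^16 = 2^48 bytes = 256 TiB
 *
 * BTRFS_32BIT_EARLY_WARN_THRESHOLD = BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8
 *   4K pages:  16 TiB * 5/8 =  10 TiB   ("about 10T")
 *  64K pages: 256 TiB * 5/8 = 160 TiB   ("160T")
 */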
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/extent-tree.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ae2af0aeebc4..dcbfb1b9d269 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,12 +675,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); -int btrfs_drop_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *node, - struct extent_buffer *parent); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index b3674b008d58..ae5425253603 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -68,5 +68,11 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); #endif -- cgit From 33cf97a7b6585efc57c1277b9e346afa7c542999 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:33 -0400 Subject: btrfs: move acl prototypes into acl.h Move these out of ctree.h into acl.h to cut down on code in ctree.h. 
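The notable part of this move is the disabled-config fallback in the new header (visible in the diff below): btrfs_get_acl and btrfs_set_acl become NULL defines when CONFIG_BTRFS_FS_POSIX_ACL is off, which works because the VFS skips ACL handling for NULL ->get_acl/->set_acl pointers. A sketch of the consumer side, with an illustrative struct name:

static const struct inode_operations example_dir_iops = {
	.get_acl = btrfs_get_acl,	/* NULL when ACLs are compiled out */
	.set_acl = btrfs_set_acl,
};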
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/acl.c | 2 +- fs/btrfs/acl.h | 27 +++++++++++++++++++++++++++ fs/btrfs/ctree.h | 18 ------------------ fs/btrfs/inode.c | 1 + 4 files changed, 29 insertions(+), 19 deletions(-) create mode 100644 fs/btrfs/acl.h (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 548d6a5477b4..100bae33c677 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -11,10 +11,10 @@ #include #include #include - #include "ctree.h" #include "btrfs_inode.h" #include "xattr.h" +#include "acl.h" struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h new file mode 100644 index 000000000000..45197b4f73bf --- /dev/null +++ b/fs/btrfs/acl.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACL_H +#define BTRFS_ACL_H + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); +int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); + +#else + +#define btrfs_get_acl NULL +#define btrfs_set_acl NULL +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) +{ + return -EOPNOTSUPP; +} + +#endif + +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dcbfb1b9d269..040b640b0222 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,24 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* acl.c */ -#ifdef CONFIG_BTRFS_FS_POSIX_ACL -struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, - struct posix_acl *acl, int type); -int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, - struct posix_acl *acl, int type); -#else -#define btrfs_get_acl NULL -#define btrfs_set_acl NULL -static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct posix_acl *acl, - int type) -{ - return -EOPNOTSUPP; -} -#endif - /* relocation.c */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3fe3301b88b3..cb038dbbca7d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -65,6 +65,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "file.h" +#include "acl.h" struct btrfs_iget_args { u64 ino; -- cgit From 677074792a1d533232ec5517f23f78d64e6dffac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:34 -0400 Subject: btrfs: move relocation prototypes into relocation.h Move these out of ctree.h into relocation.h to cut down on code in ctree.h Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 20 -------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/relocation.h | 23 +++++++++++++++++++++++ fs/btrfs/transaction.c | 1 + fs/btrfs/volumes.c | 1 + 9 files changed, 30 insertions(+), 20 deletions(-) create mode 100644 fs/btrfs/relocation.h (limited to 'fs') diff --git a/fs/btrfs/backref.c 
b/fs/btrfs/backref.c index 173df40da984..04cb608e7cfb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -18,6 +18,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "relocation.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4b47d380da1e..0acd85111cdf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -21,6 +21,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "relocation.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 040b640b0222..b1b6de508e20 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,26 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* relocation.c */ -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); -int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); -struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, - u64 bytenr); -int btrfs_should_ignore_reloc_root(struct btrfs_root *root); - /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e8082e3c5bdc..a613a9a7caae 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -49,6 +49,7 @@ #include "root-tree.h" #include "defrag.h" #include "uuid-tree.h" +#include "relocation.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb038dbbca7d..d8856e621e01 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,7 @@ #include "ioctl.h" #include "file.h" #include "acl.h" +#include "relocation.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e86364bdac8e..f31a97d4f9ad 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -33,6 +33,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "file-item.h" +#include "relocation.h" /* * Relocation overview diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h new file mode 100644 index 000000000000..2041a86186de --- /dev/null +++ b/fs/btrfs/relocation.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_RELOCATION_H +#define BTRFS_RELOCATION_H + +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); 
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); +int btrfs_should_ignore_reloc_root(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bbc3ad3e297c..a262fd188abe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -32,6 +32,7 @@ #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "relocation.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 823de2f29289..abfa6a9e20e6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -37,6 +37,7 @@ #include "accessors.h" #include "uuid-tree.h" #include "ioctl.h" +#include "relocation.h" static struct bio_set btrfs_bioset; -- cgit From 2fc6822c99d7e902b7cef146efa37420d41c0c59 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:35 -0400 Subject: btrfs: move scrub prototypes into scrub.h Move these out of ctree.h into scrub.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 11 ----------- fs/btrfs/dev-replace.c | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/scrub.h | 16 ++++++++++++++++ fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/volumes.c | 1 + 9 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 fs/btrfs/scrub.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1b6de508e20..f3facc10646c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,17 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* scrub.c */ -int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, - u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); -void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); -void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); -int btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel_dev(struct btrfs_device *dev); -int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, - struct btrfs_scrub_progress *progress); - /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 94f8975034ce..84af2010fae2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -25,6 +25,7 @@ #include "block-group.h" #include "fs.h" #include "accessors.h" +#include "scrub.h" /* * Device replace overview diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a613a9a7caae..880bd39680bc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,7 @@ #include "defrag.h" #include "uuid-tree.h" #include "relocation.h" +#include "scrub.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dba1f07b4194..98cf16f118b5 100644 --- 
a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -59,6 +59,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "file.h" +#include "scrub.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e46e6c4d4bf9..8c89e63ef2e8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -24,6 +24,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "scrub.h" /* * This is only the first step towards a full-features scrub. It reads all diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h new file mode 100644 index 000000000000..7639103ebf9d --- /dev/null +++ b/fs/btrfs/scrub.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SCRUB_H +#define BTRFS_SCRUB_H + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_device *dev); +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + struct btrfs_scrub_progress *progress); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2cd05246bcd3..4bfda9be4556 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -55,6 +55,7 @@ #include "defrag.h" #include "dir-item.h" #include "ioctl.h" +#include "scrub.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a262fd188abe..593b2414e2be 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -33,6 +33,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" +#include "scrub.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index abfa6a9e20e6..5606060770eb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -38,6 +38,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" +#include "scrub.h" static struct bio_set btrfs_bioset; -- cgit From 77407dc032e29f205a1e3922564c0abfd2e23d15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:36 -0400 Subject: btrfs: move dev-replace prototypes into dev-replace.h We already have a dev-replace.h, simply move these prototypes and helpers into dev-replace.h where they belong. 
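As a usage note for the helpers being relocated here: the blocked increment is paired with a decrement around an I/O section so that a device replace cannot complete while the I/O is in flight. A minimal sketch with a hypothetical function name and no error paths:

static void example_io_section(struct btrfs_fs_info *fs_info)
{
	btrfs_bio_counter_inc_blocked(fs_info);	/* may wait on dev-replace */
	/* ... map and submit the bio ... */
	btrfs_bio_counter_dec(fs_info);		/* wraps _sub(fs_info, 1) */
}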
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 --------- fs/btrfs/dev-replace.h | 8 ++++++++ fs/btrfs/extent_io.c | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f3facc10646c..84bc33ff003f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,15 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* dev-replace.c */ -void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); - -static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) -{ - btrfs_bio_counter_sub(fs_info, 1); -} - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 6084b313056a..675082ccec89 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,5 +25,13 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} + #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7fe637408e98..6373b1565250 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -34,6 +34,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "dev-replace.h" static struct kmem_cache *extent_buffer_cache; -- cgit From 5c11adcc383a94cacd652461c9435bf7a5c53c9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:37 -0400 Subject: btrfs: move verity prototypes into verity.h Move these out of ctree.h into verity.h to cut down on code in ctree.h. 
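This header uses the same stub technique as acl.h: with CONFIG_FS_VERITY disabled, the inline fallbacks keep call sites free of #ifdefs. A sketch of what that buys a caller, using a hypothetical helper name:

/* Compiles identically with or without CONFIG_FS_VERITY; the disabled
 * stub in verity.h simply returns 0. */
static int example_drop_verity(struct btrfs_inode *inode)
{
	return btrfs_drop_verity_items(inode);
}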
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 22 ---------------------- fs/btrfs/inode.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/verity.h | 28 ++++++++++++++++++++++++++++ 6 files changed, 32 insertions(+), 22 deletions(-) create mode 100644 fs/btrfs/verity.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84bc33ff003f..15bb90536460 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -741,28 +741,6 @@ static inline int is_fstree(u64 rootid) return 0; } -/* verity.c */ -#ifdef CONFIG_FS_VERITY - -extern const struct fsverity_operations btrfs_verityops; -int btrfs_drop_verity_items(struct btrfs_inode *inode); -int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); - -#else - -static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) -{ - return 0; -} - -static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, - size_t buf_size) -{ - return -EPERM; -} - -#endif - /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d8856e621e01..0fda1c5ba28e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -67,6 +67,7 @@ #include "file.h" #include "acl.h" #include "relocation.h" +#include "verity.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e9b7deccc5fe..3befc0d2d866 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -31,6 +31,7 @@ #include "dir-item.h" #include "file-item.h" #include "ioctl.h" +#include "verity.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4bfda9be4556..ae49bdf71d32 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -56,6 +56,7 @@ #include "dir-item.h" #include "ioctl.h" #include "scrub.h" +#include "verity.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 00ba5143a17d..b31d6c7627ff 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -19,6 +19,7 @@ #include "fs.h" #include "accessors.h" #include "ioctl.h" +#include "verity.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h new file mode 100644 index 000000000000..91c10f7d0a46 --- /dev/null +++ b/fs/btrfs/verity.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_VERITY_H +#define BTRFS_VERITY_H + +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; + +int btrfs_drop_verity_items(struct btrfs_inode *inode); +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + return 0; +} + +static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + return -EPERM; +} + +#endif + +#endif -- cgit From 6a6b4daf92fd8393eeac9631318d48d5b25ee4df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:38 -0400 Subject: btrfs: move CONFIG_BTRFS_FS_RUN_SANITY_TESTS checks to fs.h We already have a few of these in fs.h, move the remaining checks out of ctree.h into fs.h. 
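For readers unfamiliar with EXPORT_FOR_TESTS, which is part of what moves here: it expands to nothing when the sanity tests are configured in (giving the symbol external linkage for the test code) and to static otherwise. A sketch of the intended use, with a hypothetical function name:

/* static in normal builds; visible to the selftests when
 * CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled. */
EXPORT_FOR_TESTS int example_visible_to_tests(struct btrfs_fs_info *fs_info)
{
	return btrfs_is_testing(fs_info) ? 1 : 0;
}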
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 15 --------------- fs/btrfs/fs.h | 9 +++++++++ 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 15bb90536460..27bfedf3a9fb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -722,16 +722,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) return offset >> PAGE_SHIFT; } -/* - * Use that for functions that are conditionally exported for sanity tests but - * otherwise static - */ -#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define EXPORT_FOR_TESTS static -#else -#define EXPORT_FOR_TESTS -#endif - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -741,11 +731,6 @@ static inline int is_fstree(u64 rootid) return 0; } -/* Sanity test specific functions */ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_destroy_inode(struct inode *inode); -#endif - static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a49d11127bf7..7d0da8509567 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -971,11 +971,20 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) &(fs_info)->fs_state))) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define EXPORT_FOR_TESTS + static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); } + +void btrfs_test_destroy_inode(struct inode *inode); + #else + +#define EXPORT_FOR_TESTS static + static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return 0; -- cgit From c03b22076bd2c9fe88c055a35637a836d986db76 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:39 -0400 Subject: btrfs: move super prototypes into super.h Move these out of ctree.h into super.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ------- fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/super.h | 12 ++++++++++++ 5 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 fs/btrfs/super.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 27bfedf3a9fb..c32f6b6ae972 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -682,13 +682,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* super.c */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); -int btrfs_sync_fs(struct super_block *sb, int wait); -char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid); - /* * Get the correct offset inside the page of extent buffer. 
* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 880bd39680bc..f5f793af12a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -51,6 +51,7 @@ #include "uuid-tree.h" #include "relocation.h" #include "scrub.h" +#include "super.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 98cf16f118b5..6f9f8e8dfea6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -60,6 +60,7 @@ #include "ioctl.h" #include "file.h" #include "scrub.h" +#include "super.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae49bdf71d32..d54bfec8e506 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -57,6 +57,7 @@ #include "ioctl.h" #include "scrub.h" #include "verity.h" +#include "super.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h new file mode 100644 index 000000000000..c8875653e628 --- /dev/null +++ b/fs/btrfs/super.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUPER_H +#define BTRFS_SUPER_H + +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, + unsigned long new_flags); +int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); + +#endif -- cgit From 7f0add250f829248281e1745d92648f72192a8f4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:40 -0400 Subject: btrfs: move super_block specific helpers into super.h This will make syncing fs.h to user space a little easier if we can pull the super block specific helpers out of fs.h and put them in super.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/defrag.c | 1 + fs/btrfs/export.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/fs.h | 17 ----------------- fs/btrfs/inode.c | 1 + fs/btrfs/lzo.c | 1 + fs/btrfs/messages.c | 1 + fs/btrfs/ordered-data.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/super.h | 17 +++++++++++++++++ fs/btrfs/volumes.c | 1 + 17 files changed, 32 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 61828f8375e6..e66fa18dbcf7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -35,6 +35,7 @@ #include "subpage.h" #include "zoned.h" #include "file-item.h" +#include "super.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index ed82085acea3..48a9460d49c7 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -15,6 +15,7 @@ #include "subpage.h" #include "defrag.h" #include "file-item.h" +#include "super.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index b6bc9684648f..744a02b7fd67 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -8,6 +8,7 @@ #include "print-tree.h" #include "export.h" #include "accessors.h" +#include "super.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6373b1565250..bda420d697f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -35,6 +35,7 @@ 
#include "file-item.h" #include "file.h" #include "dev-replace.h" +#include "super.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 20d88cd0b602..036d50af5666 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -20,6 +20,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b94dc4b2c486..f9bea57abbde 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,6 +36,7 @@ #include "file-item.h" #include "ioctl.h" #include "file.h" +#include "super.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index aef075b63188..797edb41d0aa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,6 +29,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "super.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 7d0da8509567..c7f2a512fba2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -801,11 +801,6 @@ static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_in return READ_ONCE(fs_info->last_root_drop_gen); } -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - /* * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. @@ -947,18 +942,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) btrfs_fs_closing(fs_info); } -static inline void btrfs_set_sb_rdonly(struct super_block *sb) -{ - sb->s_flags |= SB_RDONLY; - set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -static inline void btrfs_clear_sb_rdonly(struct super_block *sb) -{ - sb->s_flags &= ~SB_RDONLY; - clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) { clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0fda1c5ba28e..83e5ae6b74ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -68,6 +68,7 @@ #include "acl.h" #include "relocation.h" #include "verity.h" +#include "super.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 6751874a3e69..e7b1ceffcd33 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -16,6 +16,7 @@ #include "messages.h" #include "compression.h" #include "ctree.h" +#include "super.h" #define LZO_LEN 4 diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 196757ee16f1..625bbbbb2608 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -5,6 +5,7 @@ #include "discard.h" #include "transaction.h" #include "space-info.h" +#include "super.h" #ifdef CONFIG_PRINTK diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2aea2a17ed95..8fda1949b71b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,6 +19,7 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" +#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 9ad15d69718c..0755af0e53e3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -14,6 
+14,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "super.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 3c962b5d8dbd..9d728107536e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -14,6 +14,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "super.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f31a97d4f9ad..d119986d1599 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -34,6 +34,7 @@ #include "root-tree.h" #include "file-item.h" #include "relocation.h" +#include "super.h" /* * Relocation overview diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index c8875653e628..8dbb909b364f 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -9,4 +9,21 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5606060770eb..1c9f7a946657 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -39,6 +39,7 @@ #include "ioctl.h" #include "relocation.h" #include "scrub.h" +#include "super.h" static struct bio_set btrfs_bioset; -- cgit From aa5d3003ddee8d7c5c517db072f888e114ff1529 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:41 -0400 Subject: btrfs: move orphan prototypes into orphan.h Move these out of ctree.h into orphan.h to cut down on code in ctree.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/extent-tree.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/orphan.c | 1 + fs/btrfs/orphan.h | 11 +++++++++++ fs/btrfs/root-tree.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/verity.c | 1 + 8 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 fs/btrfs/orphan.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c32f6b6ae972..5649f8907984 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -676,12 +676,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(struct extent_buffer *leaf); -/* orphan.c */ -int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); - /* * Get the correct offset inside the page of extent buffer. 
* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 940d4fe23cfb..b037107678c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -41,6 +41,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "file-item.h" +#include "orphan.h" #undef SCRAMBLE_DELAYED_REFS
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83e5ae6b74ef..9abed40ade11 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -69,6 +69,7 @@ #include "relocation.h" #include "verity.h" #include "super.h" +#include "orphan.h" struct btrfs_iget_args { u64 ino;
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index aa534108c1e2..7a1b021b5669 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -5,6 +5,7 @@ #include "ctree.h" #include "disk-io.h" +#include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset)
diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h new file mode 100644 index 000000000000..3faab5cbb59a --- /dev/null +++ b/fs/btrfs/orphan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ORPHAN_H +#define BTRFS_ORPHAN_H + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 42f046e5e25f..859874579456 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -15,6 +15,7 @@ #include "space-info.h" #include "accessors.h" #include "root-tree.h" +#include "orphan.h" /* * Read a root item from the tree. In case we detect a root item smaller then
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1c505713511c..b6e99ef99679 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -28,6 +28,7 @@ #include "dir-item.h" #include "file-item.h" #include "file.h" +#include "orphan.h" #define MAX_CONFLICT_INODES 10
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index b31d6c7627ff..bf9eb693a6a7 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -20,6 +20,7 @@ #include "accessors.h" #include "ioctl.h" #include "verity.h" +#include "orphan.h" /* * Implementation of the interface defined in struct fsverity_operations.
-- cgit From d52a1365258b5781c2635937afa91af66322d981 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:35 +0800 Subject: btrfs: selftests: remove impossible inline extent at non-zero file offset
In our inode-tests.c, we create an inline extent at file offset 5, which is no longer possible since the introduction of the tree-checker. Thus we should not spend time maintaining corner cases which are already ruled out by the tree-checker.
So this patch will: - Change the inline extent to start at file offset 0 Also change its length to 6 to cover the original length - Add an extra ASSERT() for btrfs_add_extent_mapping() This is to make sure the tree-checker is working correctly.
- Update the inode selftest Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 7 ++++++ fs/btrfs/tests/inode-tests.c | 57 +++++++++++++++----------------------------- 2 files changed, 26 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index cd45303f344d..4a4362f5cc52 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -638,6 +638,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, int ret; struct extent_map *em = *em_in; + /* + * Tree-checker should have rejected any inline extent with non-zero + * file offset. Here just do a sanity check. + */ + if (em->block_start == EXTENT_MAP_INLINE) + ASSERT(em->start == 0); + ret = add_extent_mapping(em_tree, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 0a34a54ea9fd..05b03f5eab83 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -73,8 +73,8 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] - * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] + * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [inline][hole but no extent][ hole ][ regular ][regular1 split] * * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] @@ -91,19 +91,12 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) u64 disk_bytenr = SZ_1M; u64 offset = 0; - /* First we want a hole */ - insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, - slot); - slot++; - offset += 5; - /* - * Now we want an inline extent, I don't think this is possible but hey - * why not? Also keep in mind if we have an inline extent it counts as - * the whole first page. If we were to expand it we would have to cow - * and we wouldn't have an inline extent anymore. + * Tree-checker has strict limits on inline extents that they can only + * exist at file offset 0, thus we can only have one inline file extent + * at most. 
*/ - insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; offset = sectorsize; @@ -282,37 +275,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); - goto out; - } - if (em->start != 0 || em->len != 5) { - test_err( - "unexpected extent wanted start 0 len 5, got start %llu len %llu", - em->start, em->len); - goto out; - } - if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); - goto out; - } - offset = em->start + em->len; - free_extent_map(em); - - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); - if (IS_ERR(em)) { - test_err("got an error when we shouldn't have"); - goto out; - } if (em->block_start != EXTENT_MAP_INLINE) { test_err("expected an inline, got %llu", em->block_start); goto out; } - if (em->start != offset || em->len != (sectorsize - 5)) { + /* + * For inline extent, we always round up the em to sectorsize, as + * they are either: + * + * a) a hidden hole + * The range will be zeroed at inline extent read time. + * + * b) a file extent with unaligned bytenr + * Tree checker will reject it. + */ + if (em->start != 0 || em->len != sectorsize) { test_err( - "unexpected extent wanted start %llu len 1, got start %llu len %llu", - offset, em->start, em->len); + "unexpected extent wanted start 0 len %u, got start %llu len %llu", + sectorsize, em->start, em->len); goto out; } if (em->flags != 0) {
-- cgit From affc5424338682641829cc84293ec90f36425034 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:36 +0800 Subject: btrfs: make inline extent read calculation much simpler
Currently we calculate inline extent reads in a way that allows an inline extent to start at a non-zero offset. This is consistent with the inode selftests, which used to put an inline extent at file offset 5. Meanwhile the inline extent creation code will only create inline extents at file offset 0.
Furthermore, with the introduction of tree-checker verification of file extents, we actively reject inline extents which start at a non-zero file offset. So far we haven't seen any reports of rejected inline extents at a non-zero file offset.
All this means the extra calculation to support inline extents at a non-zero file offset is mostly dead weight and damages the readability of the code.
Thus this patch will: - Add extra ASSERT()s to make sure the involved file offsets are all 0 - Remove the @extent_offset calculation - Simplify the involved code As several variables are now single-use, there is no need to declare them as variables anymore.
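To see why the simplification is behavior-preserving, plug the new invariants into the old arithmetic; this is a sketch of the reasoning, not code from the patch:

/*
 * With inline extents pinned to file offset 0:
 *   page_offset(page) == 0, pg_offset == 0, extent_start == 0
 * so the old
 *   extent_offset = page_offset(page) + pg_offset - extent_start == 0
 *   copy_size     = min(PAGE_SIZE - pg_offset, size - extent_offset)
 * collapses to the new
 *   copy_size     = min(PAGE_SIZE, btrfs_file_extent_ram_bytes(leaf, item))
 */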
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'fs')
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9abed40ade11..a374dd718315 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7064,41 +7064,43 @@ next: extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - unsigned long ptr; char *map; - size_t size; - size_t extent_offset; size_t copy_size; if (!page) goto out; - size = btrfs_file_extent_ram_bytes(leaf, item); - extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_SIZE - pg_offset, - size - extent_offset); - em->start = extent_start + extent_offset; + /* + * Inline extent can only exist at file offset 0. This is + * ensured by tree-checker and inline extent creation path. + * Thus all members representing file offsets should be zero. + */ + ASSERT(page_offset(page) == 0); + ASSERT(pg_offset == 0); + ASSERT(extent_start == 0); + ASSERT(em->start == 0); + + copy_size = min_t(u64, PAGE_SIZE, + btrfs_file_extent_ram_bytes(leaf, item)); + em->start = extent_start; em->len = ALIGN(copy_size, fs_info->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; - ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, page, pg_offset, - extent_offset, item); + ret = uncompress_inline(path, page, 0, 0, item); if (ret) goto out; } else { map = kmap_local_page(page); - read_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - if (pg_offset + copy_size < PAGE_SIZE) { - memset(map + pg_offset + copy_size, 0, - PAGE_SIZE - pg_offset - - copy_size); - } + read_extent_buffer(leaf, map, + btrfs_file_extent_inline_start(item), + copy_size); + if (copy_size < PAGE_SIZE) + memset(map + copy_size, 0, + PAGE_SIZE - copy_size); kunmap_local(map); } flush_dcache_page(page);
-- cgit From a196a8944f77b7b762795a0862d8aa4a005625a4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:37 +0800 Subject: btrfs: do not reset extent map members for inline extents read
Currently for inline extent reads inside btrfs_get_extent(), we reset several extent map members:
- em->start Reset to extent_start, which is completely unnecessary. Both extent_start and em->start should already be zero, as ensured by the tree-checker.
- em->len Reset to round_up(copy_size, fs_info->sectorsize), which is again unnecessary.
- em->orig_block_len Reset to em->len (sectorsize), while it is originally unset from btrfs_extent_item_to_extent_map(). This makes no difference, as all extent map handling paths ignore orig_block_len when they find an inline extent. One example can be found in btrfs_drop_extent_cache().
- em->orig_start Reset to em->start (0), while it is originally set to EXTENT_MAP_HOLE. This makes no difference either, as all extent map handling paths ignore em->orig_start when they find an inline extent.
Thus all these member resets are unnecessary. Replace them with ASSERT()s checking the only two members (block_start and length) that make sense.
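Summarizing the four members discussed above (a recap of this commit's reasoning, not new analysis):

/*
 * member              old reset value                why it never mattered
 * em->start           extent_start (== 0)            already 0, per tree-checker
 * em->len             ALIGN(copy_size, sectorsize)   already one sector
 * em->orig_block_len  em->len                        ignored for inline extents
 * em->orig_start      em->start                      ignored for inline extents
 */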
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a374dd718315..5b45eb141215 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7082,10 +7082,15 @@ next: copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(leaf, item)); - em->start = extent_start; - em->len = ALIGN(copy_size, fs_info->sectorsize); - em->orig_block_len = em->len; - em->orig_start = em->start; + + /* + * btrfs_extent_item_to_extent_map() should have properly + * initialized em members already. + * + * Other members are not utilized for inline extents. + */ + ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->len == fs_info->sectorsize); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != -- cgit From 280f15cb96a61e4122bb28cdc316343ff4918b7d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:38 +0800 Subject: btrfs: remove new_inline argument from btrfs_extent_item_to_extent_map() The argument @new_inline changes the following members of extent_map: - em->compress_type - EXTENT_FLAG_COMPRESSED of em->flags However neither member makes a difference for inline extents: - Inline extent reads never use the above em members As inside btrfs_get_extent() we directly use the file extent item to do the read. - Inline extents are never to be split Thus code that really needs em->compress_type or that flag will never be executed on inline extents. (btrfs_drop_extent_cache() would be one example) - Fiemap no longer relies on extent maps Recent fiemap optimization makes fiemap search the subvolume tree directly, without using any extent map at all. Thus those members make no difference for inline extents any more. Furthermore, such an exception without much explanation is really a source of confusion. Thus this patch will completely remove the argument, and always set the involved members, unifying the behavior.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 2 +- fs/btrfs/file-item.c | 6 ++---- fs/btrfs/file-item.h | 1 - fs/btrfs/inode.c | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 48a9460d49c7..919dfe0f7e50 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -582,7 +582,7 @@ iterate: goto next; /* Now this extent covers @start, convert it to em */ - btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + btrfs_extent_item_to_extent_map(inode, &path, fi, em); break; next: ret = btrfs_next_item(root, &path); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 036d50af5666..456f71b42a9c 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1214,7 +1214,6 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, - const bool new_inline, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1266,10 +1265,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, */ em->orig_start = EXTENT_MAP_HOLE; em->block_len = (u64)-1; - if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + em->compress_type = compress_type; + if (compress_type != BTRFS_COMPRESS_NONE) set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 51cd3ab5948c..ba12711cf933 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -24,7 +24,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, - const bool new_inline, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b45eb141215..0ab899132dbf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7058,7 +7058,7 @@ next: goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, !page, em); + btrfs_extent_item_to_extent_map(inode, path, item, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { -- cgit From a982fc822001e26930b502b16534f6bbd42b232a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:39 +0800 Subject: btrfs: extract the inline extent read code into its own function Currently we have the inline extent read code behind two levels of indentation; factor it out into a new function, read_inline_extent(), to make it a little easier to read. Since we're here, also remove the @extent_offset and @pg_offset arguments from the uncompress_inline() function, as it's not possible to have inline extents at a non-zero file offset.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 69 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0ab899132dbf..1422f072a7b8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6865,7 +6865,6 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, - size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { int ret; @@ -6876,7 +6875,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, unsigned long ptr; int compress_type; - WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); @@ -6888,8 +6886,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, - extent_offset, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6899,13 +6896,40 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) - memzero_page(page, pg_offset + max_size, - PAGE_SIZE - max_size - pg_offset); + if (max_size < PAGE_SIZE) + memzero_page(page, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } +static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, + struct page *page) +{ + struct btrfs_file_extent_item *fi; + void *kaddr; + size_t copy_size; + + if (!page || PageUptodate(page)) + return 0; + + ASSERT(page_offset(page) == 0); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) + return uncompress_inline(path, page, fi); + + copy_size = min_t(u64, PAGE_SIZE, + btrfs_file_extent_ram_bytes(path->nodes[0], fi)); + kaddr = kmap_local_page(page); + read_extent_buffer(path->nodes[0], kaddr, + btrfs_file_extent_inline_start(fi), copy_size); + kunmap_local(kaddr); + if (copy_size < PAGE_SIZE) + memzero_page(page, copy_size, PAGE_SIZE - copy_size); + return 0; +} + /* * Lookup the first extent overlapping a range in a file. * @@ -7064,25 +7088,15 @@ next: extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - char *map; - size_t copy_size; - - if (!page) - goto out; - /* * Inline extent can only exist at file offset 0. This is * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ - ASSERT(page_offset(page) == 0); ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); - copy_size = min_t(u64, PAGE_SIZE, - btrfs_file_extent_ram_bytes(leaf, item)); - /* * btrfs_extent_item_to_extent_map() should have properly * initialized em members already. 
@@ -7092,24 +7106,9 @@ next: ASSERT(em->block_start == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - if (!PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) != - BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, page, 0, 0, item); - if (ret) - goto out; - } else { - map = kmap_local_page(page); - read_extent_buffer(leaf, map, - btrfs_file_extent_inline_start(item), - copy_size); - if (copy_size < PAGE_SIZE) - memset(map + copy_size, 0, - PAGE_SIZE - copy_size); - kunmap_local(map); - } - flush_dcache_page(page); - } + ret = read_inline_extent(inode, path, page); + if (ret < 0) + goto out; goto insert; } not_found: -- cgit From c30ff698da87aeabb459cca75fe4a77a858609fb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 18:12:04 +0200 Subject: btrfs: fix SPDX comment in tree-mod-log.h The header files should use the /* */ comment style, introduced in commit f3a84ccd28d0 ("btrfs: move the tree mod log code into its own file"). Signed-off-by: David Sterba --- fs/btrfs/tree-mod-log.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 8cffe0bc2a39..94f10afeee97 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H -- cgit From 20af93d97f46a824647326278df7d377ba3269ff Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 31 Oct 2022 11:43:56 +0000 Subject: btrfs: update stale comment for nowait direct IO writes When doing a direct IO write we may need to fall back to buffered IO, and we have a comment at btrfs_direct_write() that says we can't directly fall back to buffered IO if we have a NOWAIT iocb, because we have no support for NOWAIT buffered writes. That is not true anymore, as support for NOWAIT buffered writes was added recently in commit 926078b21db9 ("btrfs: enable nowait async buffered writes"). However we still can't fall back to a buffered write in case we have a NOWAIT iocb, because we'll need to flush delalloc and wait for it to complete after doing the buffered write, and that can block for several reasons, the main reason being waiting for IO to complete. So update the comment to mention all that. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f9bea57abbde..f8be9d629e75 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1575,8 +1575,8 @@ buffered: /* * If we are in a NOWAIT context, then return -EAGAIN to signal the caller * it must retry the operation in a context where blocking is acceptable, - * since we currently don't have NOWAIT semantics support for buffered IO - * and may block there for many reasons (reserving space for example). + * because even if we end up not blocking during the buffered IO attempt + * below, we will block when flushing and waiting for the IO. */ if (iocb->ki_flags & IOCB_NOWAIT) { err = -EAGAIN; -- cgit From 428c8e03109e717d90e0d1329dc3d926b9421ad3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 26 Oct 2022 23:25:14 +0200 Subject: btrfs: simplify percent calculation helpers, rename div_factor The div_factor* helpers calculate a fraction of a value. The name is a bit confusing: we use them only for percentage calculations, and there are two helpers.
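To make the rename concrete, a small standalone model of what the two helpers compute today (illustrative only; it ignores the factor == 10/100 shortcuts and the div_u64 wrapper used in the kernel):

    #include <stdint.h>

    /* div_factor works in tenths: div_factor(n, 8) is 80% of n. */
    static inline uint64_t model_div_factor(uint64_t num, int factor)
    {
            return num * factor / 10;
    }

    /* div_factor_fine works in hundredths: div_factor_fine(n, 90) is 90% of n. */
    static inline uint64_t model_div_factor_fine(uint64_t num, int factor)
    {
            return num * factor / 100;
    }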
There's a helper, mult_frac, for general fractions that tries to be accurate, but we multiply and divide by small numbers, so we can use the div_u64 helper. Rename the div_factor* helpers to use a 1..100 percentage range, and also drop the check for percentage == 100, as it's never hit. The conversions: * div_factor calculates tenths, so the numbers need to be adjusted * div_factor_fine is a direct replacement Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 6 +++--- fs/btrfs/block-rsv.c | 4 ++-- fs/btrfs/block-rsv.h | 2 +- fs/btrfs/free-space-cache.c | 3 +-- fs/btrfs/misc.h | 16 ++-------------- fs/btrfs/space-info.c | 4 ++-- fs/btrfs/sysfs.c | 2 +- fs/btrfs/transaction.c | 2 +- fs/btrfs/volumes.c | 12 +++++------- 9 files changed, 18 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index cfe495d39ebd..ea5fcb665daa 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1551,7 +1551,7 @@ static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_f if (reclaim_thresh == 0) return false; - thresh = div_factor_fine(bg->length, reclaim_thresh); + thresh = mult_perc(bg->length, reclaim_thresh); /* * If we were below the threshold before don't reclaim, we are likely a @@ -3523,13 +3523,13 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(fs_info->super_copy); - thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) return 1; } - if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) + if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) return 0; return 1; } diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 29705256727f..5367a14d44d2 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -227,7 +227,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -236,7 +236,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) return 0; spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); + num_bytes = mult_perc(block_rsv->size, min_percent); if (block_rsv->reserved >= num_bytes) ret = 0; spin_unlock(&block_rsv->lock); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index e2cf0dd43203..4cc41c9aaa82 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -63,7 +63,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 797edb41d0aa..627bd6120368 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2726,8 +2726,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, btrfs_mark_bg_unused(block_group); } else if
(bg_reclaim_threshold && reclaimable_unusable >= - div_factor_fine(block_group->zone_capacity, - bg_reclaim_threshold)) { + mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 69826ccd6644..768583a440e1 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -40,22 +40,10 @@ static inline void cond_wake_up_nomb(struct wait_queue_head *wq) wake_up(wq); } -static inline u64 div_factor(u64 num, int factor) +static inline u64 mult_perc(u64 num, u32 percent) { - if (factor == 10) - return num; - num *= factor; - return div_u64(num, 10); + return div_u64(num * percent, 100); } - -static inline u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - return div_u64(num, 100); -} - /* Copy of is_power_of_two that is 64bit safe */ static inline bool is_power_of_two_u64(u64 n) { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 11e5b5a1eb5a..d28ee4e36f3d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -859,7 +859,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 thresh; u64 used; - thresh = div_factor_fine(total, 90); + thresh = mult_perc(total, 90); lockdep_assert_held(&space_info->lock); @@ -977,7 +977,7 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, return false; spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, 1); + min_bytes = mult_perc(global_rsv->size, 10); if (global_rsv->reserved < min_bytes + ticket->bytes) { spin_unlock(&global_rsv->lock); return false; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a14812f1254a..45615ce36498 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -764,7 +764,7 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); /* Limit stripe size to 10% of available space. */ - val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val); /* Must be multiple of 256M. 
*/ val &= ~((u64)SZ_256M - 1); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 593b2414e2be..2e2dd2ea109b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -949,7 +949,7 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans) if (btrfs_check_space_for_delayed_refs(fs_info)) return true; - return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); + return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50); } bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1c9f7a946657..7751ab620761 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3614,16 +3614,14 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off if (bargs->usage_min == 0) user_thresh_min = 0; else - user_thresh_min = div_factor_fine(cache->length, - bargs->usage_min); + user_thresh_min = mult_perc(cache->length, bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) user_thresh_max = cache->length; else - user_thresh_max = div_factor_fine(cache->length, - bargs->usage_max); + user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; @@ -3647,7 +3645,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, else if (bargs->usage > 100) user_thresh = cache->length; else - user_thresh = div_factor_fine(cache->length, bargs->usage); + user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) ret = 0; @@ -5113,7 +5111,7 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ - ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } @@ -5144,7 +5142,7 @@ static void init_alloc_chunk_ctl_policy_zoned( } /* We don't want a chunk larger than 10% of writable space */ - limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), zone_size), min_chunk_size); ctl->max_chunk_size = min(limit, ctl->max_chunk_size); -- cgit From 8ec8519b47897d01a7e8c5ad95734529a3dd60bf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 13:07:03 +0200 Subject: btrfs: switch extent_page_data bit fields to bools The semantics of the two members are boolean, so change the types accordingly. We have spare space in extent_page_data due to alignment, so there's no change in size.
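The no-size-change claim can be checked with a small standalone program (a simplified model that keeps just a pointer-sized member ahead of the two flags; the real struct has more members, but the padding argument is the same):

    #include <stdbool.h>
    #include <stdio.h>

    struct with_bitfields {
            void *bio;
            unsigned int extent_locked:1;
            unsigned int sync_io:1;
    };

    struct with_bools {
            void *bio;
            bool extent_locked;
            bool sync_io;
    };

    int main(void)
    {
            /* On a typical 64-bit ABI both print 16: padding absorbs the bools. */
            printf("bitfields: %zu bools: %zu\n",
                   sizeof(struct with_bitfields), sizeof(struct with_bools));
            return 0;
    }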
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bda420d697f4..c1294a6de88a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -109,10 +109,10 @@ struct extent_page_data { /* tells writepage not to lock the state bits for this range * it still does the unlocking */ - unsigned int extent_locked:1; + bool extent_locked; /* tells the submit_bio code to use REQ_SYNC */ - unsigned int sync_io:1; + bool sync_io; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) -- cgit From ee5f017dccc79e5a9b442ca473631a53e1e7e376 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 13:07:05 +0200 Subject: btrfs: merge struct extent_page_data to btrfs_bio_ctrl The two structures appear on the same call paths: btrfs_bio_ctrl is embedded in extent_page_data, and we pass bio_ctrl to some functions. After merging there are fewer indirections and we have only one control structure. The packing remains the same. The btrfs_bio_ctrl was selected as the target structure as the operation is closer to bio processing. Structure layout: struct btrfs_bio_ctrl { struct bio * bio; /* 0 8 */ int mirror_num; /* 8 4 */ enum btrfs_compression_type compress_type; /* 12 4 */ u32 len_to_stripe_boundary; /* 16 4 */ u32 len_to_oe_boundary; /* 20 4 */ btrfs_bio_end_io_t end_io_func; /* 24 8 */ bool extent_locked; /* 32 1 */ bool sync_io; /* 33 1 */ /* size: 40, cachelines: 1, members: 8 */ /* padding: 6 */ /* last cacheline: 40 bytes */ }; Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 115 ++++++++++++++++++++++++--------------------------- 1 file changed, 55 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c1294a6de88a..2ec989b83f54 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -102,16 +102,14 @@ struct btrfs_bio_ctrl { u32 len_to_stripe_boundary; u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; -}; -struct extent_page_data { - struct btrfs_bio_ctrl bio_ctrl; - /* tells writepage not to lock the state bits for this range * it still does the unlocking + /* + * Tell writepage not to lock the state bits for this range, it still + * does the unlocking. */ bool extent_locked; - /* tells the submit_bio code to use REQ_SYNC */ + /* Tell the submit_bio code to use REQ_SYNC */ bool sync_io; }; @@ -148,11 +146,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) } /* - * Submit or fail the current bio in an extent_page_data structure. + * Submit or fail the current bio in the bio_ctrl structure.
*/ -static void submit_write_bio(struct extent_page_data *epd, int ret) +static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) { - struct bio *bio = epd->bio_ctrl.bio; + struct bio *bio = bio_ctrl->bio; if (!bio) return; @@ -161,9 +159,9 @@ static void submit_write_bio(struct extent_page_data *epd, int ret) ASSERT(ret < 0); btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); /* The bio is owned by the end_io handler now */ - epd->bio_ctrl.bio = NULL; + bio_ctrl->bio = NULL; } else { - submit_one_bio(&epd->bio_ctrl); + submit_one_bio(bio_ctrl); } } @@ -2071,7 +2069,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, loff_t i_size, int *nr_ret) { @@ -2103,7 +2101,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ wbc->nr_to_write--; - epd->bio_ctrl.end_io_func = end_bio_extent_writepage; + bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -2197,7 +2195,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_page_clear_dirty(fs_info, page, cur, iosize); ret = submit_extent_page(op | write_flags, wbc, - &epd->bio_ctrl, disk_bytenr, + bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page), 0, false); @@ -2237,7 +2235,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return <0 for error. */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; @@ -2274,7 +2272,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - if (!epd->extent_locked) { + if (!bio_ctrl->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc); if (ret == 1) return 0; @@ -2282,7 +2280,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, + ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -2326,9 +2324,9 @@ done: */ if (PageError(page)) end_extent_writepage(page, ret, page_start, page_end); - if (epd->extent_locked) { + if (bio_ctrl->extent_locked) { /* - * If epd->extent_locked, it's from extent_write_locked_range(), + * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), * the page can either be locked by lock_page() or * process_one_page(). * Let btrfs_page_unlock_writer() handle both cases. @@ -2367,7 +2365,7 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) * Return <0 if something went wrong, no page is locked. 
*/ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; int i, num_pages; @@ -2375,17 +2373,17 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; btrfs_tree_lock(eb); } if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (!epd->sync_io) + if (!bio_ctrl->sync_io) return 0; if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } while (1) { @@ -2432,7 +2430,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } lock_page(p); @@ -2672,7 +2670,7 @@ static void prepare_eb_write(struct extent_buffer *eb) */ static int write_one_subpage_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; @@ -2692,10 +2690,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage; + bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, eb->start, page, eb->len, + bio_ctrl, eb->start, page, eb->len, eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); @@ -2718,7 +2716,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { u64 disk_bytenr = eb->start; int i, num_pages; @@ -2727,7 +2725,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, prepare_eb_write(eb); - epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage; + bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -2736,7 +2734,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, disk_bytenr, p, + bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); @@ -2779,7 +2777,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, */ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; @@ -2832,7 +2830,7 @@ static int submit_eb_subpage(struct page *page, if (!eb) continue; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret == 0) { free_extent_buffer(eb); continue; @@ -2841,7 +2839,7 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_subpage_eb(eb, wbc, epd); + ret = write_one_subpage_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) goto cleanup; @@ -2851,7 +2849,7 @@ static int submit_eb_subpage(struct page 
*page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - submit_write_bio(epd, ret); + submit_write_bio(bio_ctrl, ret); return ret; } @@ -2876,7 +2874,7 @@ cleanup: * Return <0 for fatal error. */ static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2888,7 +2886,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc, epd); + return submit_eb_subpage(page, wbc, bio_ctrl); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2931,7 +2929,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, *eb_context = eb; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret <= 0) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) @@ -2946,7 +2944,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - ret = write_one_eb(eb, wbc, epd); + ret = write_one_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) return ret; @@ -2957,10 +2955,9 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_buffer *eb_context = NULL; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; @@ -3003,7 +3000,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - ret = submit_eb_page(page, wbc, &epd, &eb_context); + ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -3064,7 +3061,7 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - submit_write_bio(&epd, ret); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_meta_io_unlock(fs_info); return ret; @@ -3073,9 +3070,9 @@ retry: /* * Walk the list of dirty pages of the given address space and write all of them. * - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @epd: holds context for the write, namely the bio + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @bio_ctrl: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. 
This is desirable behaviour for memory-cleaning writeback, @@ -3087,7 +3084,7 @@ retry: */ static int extent_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = mapping->host; int ret = 0; @@ -3168,7 +3165,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); lock_page(page); } @@ -3179,7 +3176,7 @@ retry: if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); wait_on_page_writeback(page); } @@ -3189,7 +3186,7 @@ retry: continue; } - ret = __extent_writepage(page, wbc, epd); + ret = __extent_writepage(page, wbc, bio_ctrl); if (ret < 0) { done = 1; break; @@ -3219,7 +3216,7 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. */ - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); goto retry; } @@ -3245,8 +3242,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) u64 cur = start; unsigned long nr_pages; const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 1, .sync_io = 1, }; @@ -3277,7 +3273,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &wbc_writepages, &epd); + ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl); ASSERT(ret <= 0); if (ret < 0) { found_error = true; @@ -3287,7 +3283,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - submit_write_bio(&epd, found_error ? ret : 0); + submit_write_bio(&bio_ctrl, found_error ? ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -3300,10 +3296,9 @@ int extent_writepages(struct address_space *mapping, { struct inode *inode = mapping->host; int ret = 0; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; /* @@ -3311,8 +3306,8 @@ int extent_writepages(struct address_space *mapping, * protect the write pointer updates. */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); - ret = extent_write_cache_pages(mapping, wbc, &epd); - submit_write_bio(&epd, ret); + ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } -- cgit From 9c5ff9b42c1cb22823c94983b7d52121c559bf4d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:01 +0800 Subject: btrfs: raid56: extract the vertical stripe recovery code into recover_vertical() This refactor includes the following behavior change first: - Don't error out if only P/Q is corrupted The old code will directly error out if only P/Q is corrupted. Although it is a logical error if we go into the rebuild path with only P/Q corrupted, there is no need to error out. Just skip the rebuild and return the already good data. Then comes the following refactor which shouldn't cause behavior changes: - Introduce a helper to do vertical stripe recovery This not only reduces one indent level, but also paves the road for later data checksum verification in RMW cycles.
- Sort rbio->faila/b before recovery So we don't need to do the same swap every vertical stripe - Replace a BUG_ON() with ASSERT() Or checkpatch won't let me pass. - Mark recovered sectors uptodate after the recover loop - Do the cleanup for pointers unconditionally We only need to initialize @pointers and @unmap_array to NULL, so we can safely free them unconditionally. - Mark the repaired sector uptodate in recover_vertical() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 285 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 149 insertions(+), 136 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index ef272e7d932c..34d51d2ef746 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1886,6 +1886,144 @@ fail: bio_endio(bio); } +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + const u32 sectorsize = fs_info->sectorsize; + const int faila = rbio->faila; + const int failb = rbio->failb; + int stripe_nr; + + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return; + + /* + * Setup our array of pointers with sectors from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order. + */ + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + /* + * If we're rebuilding a read, we have to use + * pages from the bio list + */ + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && + (stripe_nr == faila || stripe_nr == failb)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + } + ASSERT(sector->page); + pointers[stripe_nr] = kmap_local_page(sector->page) + + sector->pgoff; + unmap_array[stripe_nr] = pointers[stripe_nr]; + } + + /* All raid6 handling here */ + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { + /* Single failure, rebuild from parity raid5 style */ + if (failb < 0) { + if (faila == rbio->nr_data) + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * We have nothing to do, just skip the + * recovery for this stripe. + */ + goto cleanup; + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below + */ + goto pstripe; + } + + /* + * If the q stripe is failed, do a pstripe reconstruction from + * the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want. + */ + if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bioc->raid_map[faila] == + RAID5_P_STRIPE) + /* + * Only P and Q are corrupted. + * We only care about data stripes recovery, + * can skip this vertical stripe. + */ + goto cleanup; + /* + * Otherwise we have one bad data stripe and + * a good P stripe. raid5! 
+ */ + goto pstripe; + } + + if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->real_stripes, sectorsize, + faila, pointers); + } else { + raid6_2data_recov(rbio->real_stripes, sectorsize, + faila, failb, pointers); + } + } else { + void *p; + + /* Rebuild from P stripe here (raid5 or raid6). */ + ASSERT(failb == -1); +pstripe: + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + + /* Rearrange the pointer array */ + p = pointers[faila]; + for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; + stripe_nr++) + pointers[stripe_nr] = pointers[stripe_nr + 1]; + pointers[rbio->nr_data - 1] = p; + + /* Xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, sectorsize); + + } + + /* + * No matter if this is a RMW or recovery, we should have all + * failed sectors repaired in the vertical stripe, thus they are now + * uptodate. + * Especially if we determine to cache the rbio, we need to + * have at least all data sectors uptodate. + */ + if (rbio->faila >= 0) { + sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr); + sector->uptodate = 1; + } + if (rbio->failb >= 0) { + sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr); + sector->uptodate = 1; + } + +cleanup: + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); +} + /* * all parity reconstruction happens here. We've read in everything * we can find from the drives and this does the heavy lifting of @@ -1893,13 +2031,10 @@ fail: */ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int sectornr, stripe; - void **pointers; - void **unmap_array; - int faila = -1, failb = -1; + int sectornr; + void **pointers = NULL; + void **unmap_array = NULL; blk_status_t err; - int i; /* * This array stores the pointer for each sector, thus it has the extra @@ -1908,7 +2043,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = BLK_STS_RESOURCE; - goto cleanup_io; + goto cleanup; } /* @@ -1918,11 +2053,12 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!unmap_array) { err = BLK_STS_RESOURCE; - goto cleanup_pointers; + goto cleanup; } - faila = rbio->faila; - failb = rbio->failb; + /* Make sure faila and fail b are in order. */ + if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb) + swap(rbio->faila, rbio->failb); if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { @@ -1933,138 +2069,15 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. 
- */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, &rbio->dbitmap)) - continue; - - /* - * Setup our array of pointers with sectors from each stripe - * - * NOTE: store a duplicate array of pointers to preserve the - * pointer order - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * If we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } - ASSERT(sector->page); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - unmap_array[stripe] = pointers[stripe]; - } - - /* All raid6 handling here */ - if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* Single failure, rebuild from parity raid5 style */ - if (failb < 0) { - if (faila == rbio->nr_data) { - /* - * Just the P stripe has failed, without - * a bad data or Q stripe. - * TODO, we should redo the xor here. - */ - err = BLK_STS_IOERR; - goto cleanup; - } - /* - * a single failure in raid6 is rebuilt - * in the pstripe code below - */ - goto pstripe; - } - - /* make sure our ps and qs are in order */ - if (faila > failb) - swap(faila, failb); - - /* if the q stripe is failed, do a pstripe reconstruction - * from the xors. - * If both the q stripe and the P stripe are failed, we're - * here due to a crc mismatch and we can't give them the - * data they want - */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) { - err = BLK_STS_IOERR; - goto cleanup; - } - /* - * otherwise we have one bad data stripe and - * a good P stripe. raid5! - */ - goto pstripe; - } - - if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->real_stripes, - sectorsize, faila, pointers); - } else { - raid6_2data_recov(rbio->real_stripes, - sectorsize, faila, failb, - pointers); - } - } else { - void *p; - - /* rebuild from P stripe here (raid5 or raid6) */ - BUG_ON(failb != -1); -pstripe: - /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); - - /* rearrange the pointer array */ - p = pointers[faila]; - for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) - pointers[stripe] = pointers[stripe + 1]; - pointers[rbio->nr_data - 1] = p; - - /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - } - - /* - * No matter if this is a RMW or recovery, we should have all - * failed sectors repaired, thus they are now uptodate. - * Especially if we determine to cache the rbio, we need to - * have at least all data sectors uptodate. 
- */ - for (i = 0; i < rbio->stripe_nsectors; i++) { - if (faila != -1) { - sector = rbio_stripe_sector(rbio, faila, i); - sector->uptodate = 1; - } - if (failb != -1) { - sector = rbio_stripe_sector(rbio, failb, i); - sector->uptodate = 1; - } - } - for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) - kunmap_local(unmap_array[stripe]); - } + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + recover_vertical(rbio, sectornr, pointers, unmap_array); err = BLK_STS_OK; + cleanup: kfree(unmap_array); -cleanup_pointers: kfree(pointers); -cleanup_io: /* * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a * valid rbio which is consistent with ondisk content, thus such a -- cgit From 30e3c897f4a8d070d4fa079f6dade4b3e3cb74ad Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:02 +0800 Subject: btrfs: raid56: extract the pq generation code into a helper Currently finish_rmw() will update the P/Q stripes before submitting the writes. It's done behind a for(;;) loop and is a little congested indent-wise, so extract the code into a helper called generate_pq_vertical(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 90 +++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 34d51d2ef746..66dd4e9752a9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1193,6 +1193,48 @@ not_found: trace_info->stripe_nr = -1; } +/* Generate PQ for one vertical stripe. */ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +{ + void **pointers = rbio->finish_pointers; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct sector_ptr *sector; + int stripe; + const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; + + /* First collect one sector from each data stripe */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; + } + + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + + if (has_qstripe) { + /* + * RAID6, add the qstripe and call the library function + * to fill in our p/q + */ + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; + + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, + pointers); + } else { + /* raid5 */ + memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + } + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); +} + /* * this is called from one of two situations. We either * have a full stripe from the higher layers, or we've read all @@ -1204,28 +1246,17 @@ not_found: static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; - const u32 sectorsize = bioc->fs_info->sectorsize; - void **pointers = rbio->finish_pointers; - int nr_data = rbio->nr_data; /* The total sector number inside the full stripe. */ int total_sector_nr; int stripe; /* Sector number inside a stripe.
*/ int sectornr; - bool has_qstripe; struct bio_list bio_list; struct bio *bio; int ret; bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) - has_qstripe = false; - else if (rbio->real_stripes - rbio->nr_data == 2) - has_qstripe = true; - else - BUG(); - /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); @@ -1258,41 +1289,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - } - - /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; - - if (has_qstripe) { - /* - * RAID6, add the qstripe and call the library function - * to fill in our p/q - */ - sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; - - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - for (stripe = stripe - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); /* * Start writing. Make bios for everything from the higher layers (the -- cgit From d31968d9b6ac684e7e64be0d46eb0f5fe38d1841 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:03 +0800 Subject: btrfs: raid56: extract the recovery bio list build code into a helper This new helper will be also utilized in the incoming refactor of recovery path. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 64 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 66dd4e9752a9..9234d3423b31 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2134,30 +2134,14 @@ static void raid_recover_end_io_work(struct work_struct *work) __raid_recover_end_io(rbio); } -/* - * reads everything we need off the disk to reconstruct - * the parity. endio handlers trigger final reconstruction - * when the IO is done. - * - * This is used both for reads from the higher layers and for - * parity construction required to finish a rmw cycle. - */ -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - atomic_set(&rbio->error, 0); - + ASSERT(bio_list_size(bio_list) == 0); /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. 
@@ -2180,11 +2164,45 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret < 0) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + + return -EIO; +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->error, 0); + + ret = recover_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; bios_to_read = bio_list_size(&bio_list); if (!bios_to_read) { -- cgit From ec936b0354e2d716e977e16165d188db044696b7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:04 +0800 Subject: btrfs: raid56: extract sector recovery code into a helper This includes extra changes: - The allocation for unmap_array[] and pointers[] Now we allocate them in one go, and free them together. - Remove @err Use errno_to_blk_status(ret) instead. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 59 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9234d3423b31..da76e72383c3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2022,36 +2022,24 @@ cleanup: kunmap_local(unmap_array[stripe_nr]); } -/* - * all parity reconstruction happens here. We've read in everything - * we can find from the drives and this does the heavy lifting of - * sorting the good from the bad. - */ -static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +static int recover_sectors(struct btrfs_raid_bio *rbio) { - int sectornr; void **pointers = NULL; void **unmap_array = NULL; - blk_status_t err; + int sectornr; + int ret = 0; /* - * This array stores the pointer for each sector, thus it has the extra - * pgoff value added from each sector + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!pointers) { - err = BLK_STS_RESOURCE; - goto cleanup; - } - - /* - * Store copy of pointers that does not get reordered during - * reconstruction so that kunmap_local works. - */ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!unmap_array) { - err = BLK_STS_RESOURCE; - goto cleanup; + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; } /* Make sure faila and fail b are in order. */ @@ -2070,11 +2058,22 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) recover_vertical(rbio, sectornr, pointers, unmap_array); - err = BLK_STS_OK; - -cleanup: - kfree(unmap_array); +out: kfree(pointers); + kfree(unmap_array); + return ret; +} + +/* + * all parity reconstruction happens here. 
We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. + */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = recover_sectors(rbio); /* * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a @@ -2098,13 +2097,13 @@ cleanup: * Cache this rbio iff the above read reconstruction is * executed without problems. */ - if (err == BLK_STS_OK && rbio->failb < 0) + if (!ret && rbio->failb < 0) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - rbio_orig_end_io(rbio, err); - } else if (err == BLK_STS_OK) { + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } else if (!ret) { rbio->faila = -1; rbio->failb = -1; @@ -2115,7 +2114,7 @@ cleanup: else BUG(); } else { - rbio_orig_end_io(rbio, err); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } } -- cgit From d817ce35d24a53c6736ac68e759ed83135ff7c3b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:05 +0800 Subject: btrfs: raid56: switch recovery path to a single function Currently btrfs uses end_io functions to jump between different stages of recovery. For example, we go through the following functions: - raid56_bio_end_io() This handles the read for all the sectors (except the missing device). - __raid_recover_end_io() This does the real work, it's called inside the delayed work function raid_recover_end_io_work(). This one recovery path involves at least 3 different functions, which is a big burden for readers. This patch will change the behavior by: - Introduce a unified recovery entrance, recover_rbio() - Use a submit-and-wait method So the workflow is not interrupted by the endio function jump. This doesn't bring a performance change, but reduces the burden on reviewers. - Run the main function in the rmw_workers workqueue Now raid56_parity_recover() only needs to set up the work, and queue the work using start_async_work(). Now readers only need to do one function jump (start_async_work()) to find out the main entrance of the recovery path. Furthermore, the recover_rbio() function can easily be reused by other paths. The old recovery path is still utilized by the degraded write path. It will be cleaned up when we have migrated the write path. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 144 ++++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/raid56.h | 2 + 2 files changed, 119 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index da76e72383c3..2dd87ceda9db 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -67,7 +67,6 @@ struct sector_ptr { static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct work_struct *work); -static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); @@ -752,6 +751,8 @@ out: return ret; } +static void recover_rbio_work_locked(struct work_struct *work); + /* * called as rmw or parity rebuild is completed. If the plug list has more
If the plug list has more * rbios waiting for this stripe, the next one on the list will be started @@ -809,10 +810,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock_irqrestore(&h->lock, flags); if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { steal_rbio(rbio, next); - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); start_async_work(next, rmw_work); @@ -989,6 +990,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, } bio_list_init(&rbio->bio_list); + init_waitqueue_head(&rbio->io_wait); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); @@ -1519,6 +1521,39 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } +static void raid_wait_read_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (bio->bi_status) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(rbio, bio); + + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_read_end_io; + + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -2176,6 +2211,79 @@ error: return -EIO; } +static int recover_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; + + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set faila/b correctly. + */ + ASSERT(rbio->faila >= 0 || rbio->failb >= 0); + bio_list_init(&bio_list); + + /* + * Reset error to 0, as we will later increase error for missing + * devices. + */ + atomic_set(&rbio->error, 0); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + + ret = recover_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We have more errors than our tolerance during the read. 
*/ + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + + ret = recover_sectors(rbio); + +out: + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + + return ret; +} + +static void recover_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } +} + +static void recover_rbio_work_locked(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} + /* * reads everything we need off the disk to reconstruct * the parity. endio handlers trigger final reconstruction @@ -2264,7 +2372,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); - goto out_end_bio; + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_READ_REBUILD; @@ -2278,7 +2387,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, (u64)bio->bi_iter.bi_size, bioc->map_type); free_raid_bio(rbio); bio->bi_status = BLK_STS_IOERR; - goto out_end_bio; + bio_endio(bio); + return; } /* @@ -2298,18 +2408,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - if (lock_stripe_add(rbio)) - return; - - /* - * This adds our rbio to the list of rbios that will be handled after - * the current lock owner is done. - */ - __raid56_parity_recover(rbio); - return; - -out_end_bio: - bio_endio(bio); + start_async_work(rbio, recover_rbio_work); } static void rmw_work(struct work_struct *work) @@ -2320,14 +2419,6 @@ static void rmw_work(struct work_struct *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio; - - rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); -} - /* * The following code is used to scrub/replace the parity stripe * @@ -2818,6 +2909,5 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) { - if (!lock_stripe_add(rbio)) - start_async_work(rbio, read_rebuild_work); + start_async_work(rbio, recover_rbio_work); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 91d5c0adad15..445e833fcfcf 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -95,6 +95,8 @@ struct btrfs_raid_bio { atomic_t error; + wait_queue_head_t io_wait; + struct work_struct end_io_work; /* Bitmap to record which horizontal stripe has data */ -- cgit From 509c27aa2fb69b4475d05d0ac5c5c4206e28d5d6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:06 +0800 Subject: btrfs: raid56: extract the rmw bio list build code into a helper The helper will later be used to refactor the whole RMW path. 
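For illustration, a minimal userspace C sketch of the assembly-helper pattern these patches keep applying: the helper only fills a caller-provided list and cleans up after itself on error, while submission stays with the caller. The toy list type and all names here are invented for the example and are not the kernel's bio_list API:

#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int sector; };
struct item_list { struct item *head; };

static void list_init(struct item_list *l) { l->head = NULL; }
static void list_push(struct item_list *l, struct item *it) { it->next = l->head; l->head = it; }
static struct item *list_pop(struct item_list *l)
{
	struct item *it = l->head;

	if (it)
		l->head = it->next;
	return it;
}

/* Assemble-only helper: on failure it frees everything it queued and
 * reports the error; it never "submits" anything itself. */
static int assemble_read_items(struct item_list *list, int nr_sectors)
{
	for (int i = 0; i < nr_sectors; i++) {
		struct item *it = malloc(sizeof(*it));

		if (!it)
			goto error;
		it->sector = i;
		list_push(list, it);
	}
	return 0;
error:
	for (struct item *it; (it = list_pop(list)); )
		free(it);
	return -1;
}

int main(void)
{
	struct item_list list;

	list_init(&list);
	if (assemble_read_items(&list, 4) < 0)
		return 1;
	/* Submission stays in the caller, as in the kernel refactor. */
	for (struct item *it; (it = list_pop(&list)); ) {
		printf("submit read for sector %d\n", it->sector);
		free(it);
	}
	return 0;
}

Splitting assembly from submission this way is what later lets one caller submit synchronously (submit-and-wait) and another from a work item, without duplicating the loop.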
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 56 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2dd87ceda9db..88e2204c9f28 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1595,28 +1595,16 @@ static void raid56_rmw_end_io_work(struct work_struct *work) validate_rbio_for_rmw(rbio); } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - index_rbio_pages(rbio); + ASSERT(bio_list_size(bio_list) == 0); - atomic_set(&rbio->error, 0); /* Build a list of bios to read all the missing data sectors. */ for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; total_sector_nr++) { @@ -1641,11 +1629,43 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) goto cleanup; } + return 0; + +cleanup: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; +} + +/* + * the stripe must be locked by the caller. It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + index_rbio_pages(rbio); + + atomic_set(&rbio->error, 0); + + ret = rmw_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; bios_to_read = bio_list_size(&bio_list); if (!bios_to_read) { -- cgit From 6486d21c99cb46666c11632e4d595568863b965c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:07 +0800 Subject: btrfs: raid56: extract rwm write bios assembly into a helper The helper will be later used to refactor the rmw write path. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 135 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 88e2204c9f28..5e5bfc2c21fc 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1237,65 +1237,23 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) kunmap_local(pointers[stripe]); } -/* - * this is called from one of two situations. We either - * have a full stripe from the higher layers, or we've read all - * the missing bits off disk. - * - * This will calculate the parity and then send down any - * changed blocks. - */ -static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - struct btrfs_io_context *bioc = rbio->bioc; + struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; - int stripe; - /* Sector number inside a stripe. 
*/ int sectornr; - struct bio_list bio_list; - struct bio *bio; + int stripe; int ret; - bio_list_init(&bio_list); + ASSERT(bio_list_size(bio_list) == 0); /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); - /* at this point we either have a full stripe, - * or we've read the full stripe from the drive. - * recalculate the parity and write the new results. - * - * We're not allowed to add any new bios to the - * bio list here, anyone else that wants to - * change this stripe needs to do their own rmw. - */ - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - - atomic_set(&rbio->error, 0); - /* - * now that we've set rmw_locked, run through the - * bio list one last time and map the page pointers - * - * We don't cache full rbios because we're assuming - * the higher layers are unlikely to use this area of - * the disk again soon. If they do use it again, - * hopefully they will send another full bio. - */ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) - generate_pq_vertical(rbio, sectornr); - - /* - * Start writing. Make bios for everything from the higher layers (the + * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; @@ -1317,15 +1275,16 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; + goto error; } - if (likely(!bioc->num_tgtdevs)) - goto write_data; + if (likely(!rbio->bioc->num_tgtdevs)) + return 0; + /* Make a copy for the replace target device. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; @@ -1333,7 +1292,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; - if (!bioc->tgtdev_map[stripe]) { + if (!rbio->bioc->tgtdev_map[stripe]) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. @@ -1355,14 +1314,78 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, + ret = rbio_add_io_sector(rbio, bio_list, sector, rbio->bioc->tgtdev_map[stripe], sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; + goto error; } -write_data: + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return -EIO; +} + +/* + * this is called from one of two situations. We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. + */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ + /* The total sector number inside the full stripe. */ + /* Sector number inside a stripe. */ + int sectornr; + struct bio_list bio_list; + struct bio *bio; + int ret; + + bio_list_init(&bio_list); + + /* We should have at least one data sector. 
*/ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + + /* at this point we either have a full stripe, + * or we've read the full stripe from the drive. + * recalculate the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->error, 0); + + /* + * now that we've set rmw_locked, run through the + * bio list one last time and map the page pointers + * + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + index_rbio_pages(rbio); + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; + atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); BUG_ON(atomic_read(&rbio->stripes_pending) == 0); -- cgit From 5eb30ee26fa4dbd2d31f50ee3b4212933f86cb57 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:08 +0800 Subject: btrfs: raid56: introduce the main entrance for RMW path The new entrance will be called rmw_rbio(); it will have a streamlined workflow using the submit-and-wait method. Thus there are no weird jumps between tons of functions, which makes it way more reader friendly and will make later expansion easier, as it's now a straight workflow and the timing is much clearer. Unfortunately we cannot yet migrate the RMW path to use this new entrance, as we still need extra work to address the plug and the unlock_stripe() function. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/raid56.h | 5 ++ 2 files changed, 166 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5e5bfc2c21fc..8fd633f01d9e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1252,6 +1252,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + /* + * Reset errors, as we may have errors inherited from the degraded + * write. + */ + atomic_set(&rbio->error, 0); + rbio->faila = -1; + rbio->failb = -1; + /* * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. @@ -1665,6 +1673,19 @@ cleanup: return ret; } +static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) +{ + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; + + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + if (ret < 0) + return ret; + + index_stripe_sectors(rbio); + return 0; +} + /* * the stripe must be locked by the caller.
It will * unlock after all the writes are done @@ -2454,6 +2475,146 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, start_async_work(rbio, recover_rbio_work); } +static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; + + bio_list_init(&bio_list); + atomic_set(&rbio->error, 0); + + ret = rmw_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + return ret; +out: + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + + return ret; +} + +static void raid_wait_write_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + blk_status_t err = bio->bi_status; + + if (err) + fail_bio_stripe(rbio, bio); + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_write_end_io; + + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + +int rmw_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + int sectornr; + int ret = 0; + + /* + * Allocate the pages for parity first, as P/Q pages will always be + * needed for both full-stripe and sub-stripe writes. + */ + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) + return ret; + + /* Full stripe write, can write the full stripe right now. */ + if (rbio_is_full(rbio)) + goto write; + /* + * Now we're doing sub-stripe write, also need all data stripes to do + * the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return ret; + + atomic_set(&rbio->error, 0); + index_rbio_pages(rbio); + + ret = rmw_read_and_wait(rbio); + if (ret < 0) + return ret; + + /* Too many read errors, beyond our tolerance. */ + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) + return ret; + + /* Have read failures but under tolerance, needs recovery. */ + if (rbio->faila >= 0 || rbio->failb >= 0) { + ret = recover_rbio(rbio); + if (ret < 0) + return ret; + } +write: + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->error, 0); + + index_rbio_pages(rbio); + + /* + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + return ret; + + /* We should have at least one bio assembled. 
*/ + ASSERT(bio_list_size(&bio_list)); + submit_write_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We have more errors than our tolerance during the read. */ + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) + ret = -EIO; + return ret; +} + static void rmw_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 445e833fcfcf..0e77c77c5dba 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -185,4 +185,9 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +/* + * Placeholder definition to avoid warning, will be removed when + * the full write path is migrated. + */ +int rmw_rbio(struct btrfs_raid_bio *rbio); #endif -- cgit From 93723095b5d54b923abf07459998bcb9bbac8ba6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:09 +0800 Subject: btrfs: raid56: switch write path to rmw_rbio() This includes the following changes: - Implement a new raid_unplug() function Now we don't need a workqueue to run the plug, as all it does is queue a rmw_rbio_work() call, which can be executed without sleeping. - Implement a rmw_rbio_work_locked() helper This is for unlock_stripe(), which is already holding the full stripe lock. - Remove all the old functions This already shows how complex the old code was, as we ended up removing the following functions: * rmw_work() * validate_rbio_for_rmw() * raid56_rmw_end_io_work() * raid56_rmw_stripe() * full_stripe_write() * partial_stripe_write() * __raid56_parity_write() * run_plug() * unplug_work() * btrfs_raid_unplug() * __raid56_parity_recover() * raid_recover_end_io_work() - Unexport rmw_rbio() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 350 +++++++----------------------------------------------- fs/btrfs/raid56.h | 5 - 2 files changed, 42 insertions(+), 313 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8fd633f01d9e..ffedbfde95e0 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -64,9 +64,9 @@ struct sector_ptr { unsigned int uptodate:8; }; -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct work_struct *work); +static void rmw_rbio_work(struct work_struct *work); +static void rmw_rbio_work_locked(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); @@ -816,7 +816,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); - start_async_work(next, rmw_work); + start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); start_async_work(next, scrub_parity_work); @@ -1108,23 +1108,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, return 0; } -/* - * while we're doing the read/modify/write cycle, we could - * have errors in reading pages off the disk. This checks - * for errors and if we're not able to read the page it'll - * trigger parity reconstruction.
The rmw will be finished - * after we've reconstructed the failed stripes - */ -static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) -{ - if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->real_stripes - 1); - __raid56_parity_recover(rbio); - } else { - finish_rmw(rbio); - } -} - static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; @@ -1601,31 +1584,6 @@ static void raid56_bio_end_io(struct bio *bio) &rbio->end_io_work); } -/* - * End io handler for the read phase of the RMW cycle. All the bios here are - * physical stripe bios we've read from the disk so we can recalculate the - * parity of the stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_rmw_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - rbio_orig_end_io(rbio, BLK_STS_IOERR); - return; - } - - /* - * This will normally call finish_rmw to start our write but if there - * are any failed stripes we'll reconstruct from parity first. - */ - validate_rbio_for_rmw(rbio); -} - static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -1686,122 +1644,6 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) return 0; } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) -{ - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - struct bio *bio; - - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - index_rbio_pages(rbio); - - atomic_set(&rbio->error, 0); - - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } - - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; - - if (trace_raid56_read_partial_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_read_partial(rbio, bio, &trace_info); - } - submit_bio(bio); - } - /* the actual write will happen once the reads are done */ - return 0; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return -EIO; - -finish: - validate_rbio_for_rmw(rbio); - return 0; -} - -/* - * if the upper layers pass in a full stripe, we thank them by only allocating - * enough pages to hold the parity, and sending it all down quickly. - */ -static int full_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = alloc_rbio_parity_pages(rbio); - if (ret) - return ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - finish_rmw(rbio); - return 0; -} - -/* - * partial stripe writes get handed over to async helpers. 
- * We're really hoping to merge a few more writes into this - * rbio before calculating new parity - */ -static int partial_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - start_async_work(rbio, rmw_work); - return 0; -} - -/* - * sometimes while we were reading from the drive to - * recalculate parity, enough new bios come into create - * a full stripe. So we do a check here to see if we can - * go directly to finish_rmw - */ -static int __raid56_parity_write(struct btrfs_raid_bio *rbio) -{ - /* head off into rmw land if we don't have a full stripe */ - if (!rbio_is_full(rbio)) - return partial_stripe_write(rbio); - return full_stripe_write(rbio); -} - /* * We use plugging call backs to collect full stripes. * Any time we get a partial stripe write while plugged @@ -1836,28 +1678,22 @@ static int plug_cmp(void *priv, const struct list_head *a, return 0; } -static void run_plug(struct btrfs_plug_cb *plug) +static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) { + struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); struct btrfs_raid_bio *cur; struct btrfs_raid_bio *last = NULL; - /* - * sort our plug list then try to merge - * everything we can in hopes of creating full - * stripes. - */ list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { cur = list_entry(plug->rbio_list.next, struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { - int ret; - - /* we have a full stripe, send it down */ - ret = full_stripe_write(cur); - BUG_ON(ret); + /* We have a full stripe, queue it down. */ + start_async_work(cur, rmw_rbio_work); continue; } if (last) { @@ -1865,42 +1701,16 @@ static void run_plug(struct btrfs_plug_cb *plug) merge_rbio(last, cur); free_raid_bio(cur); continue; - } - __raid56_parity_write(last); + start_async_work(last, rmw_rbio_work); } last = cur; } - if (last) { - __raid56_parity_write(last); - } + if (last) + start_async_work(last, rmw_rbio_work); kfree(plug); } -/* - * if the unplug comes from schedule, we have to push the - * work off to a helper thread - */ -static void unplug_work(struct work_struct *work) -{ - struct btrfs_plug_cb *plug; - plug = container_of(work, struct btrfs_plug_cb, work); - run_plug(plug); -} - -static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct btrfs_plug_cb *plug; - plug = container_of(cb, struct btrfs_plug_cb, cb); - - if (from_schedule) { - INIT_WORK(&plug->work, unplug_work); - queue_work(plug->info->rmw_workers, &plug->work); - return; - } - run_plug(plug); -} - /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. 
*/ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { @@ -1948,19 +1758,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) rbio_add_bio(rbio, bio); /* - * don't plug on full rbios, just get them out the door + * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) { - ret = full_stripe_write(rbio); - if (ret) { - free_raid_bio(rbio); - goto fail; - } - return; - } + if (rbio_is_full(rbio)) + goto queue_rbio; - cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); if (cb) { plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) { @@ -1968,13 +1772,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - } else { - ret = __raid56_parity_write(rbio); - if (ret) { - free_raid_bio(rbio); - goto fail; - } + return; } +queue_rbio: + /* + * Either we don't have any existing plug, or we're doing a full stripe, + * can queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); return; @@ -2217,21 +2022,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } } -/* - * This is called only for stripes we've read from disk to reconstruct the - * parity. - */ -static void raid_recover_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - rbio_orig_end_io(rbio, BLK_STS_IOERR); - else - __raid_recover_end_io(rbio); -} - static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -2348,79 +2138,6 @@ static void recover_rbio_work_locked(struct work_struct *work) rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } -/* - * reads everything we need off the disk to reconstruct - * the parity. endio handlers trigger final reconstruction - * when the IO is done. - * - * This is used both for reads from the higher layers and for - * parity construction required to finish a rmw cycle. - */ -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) -{ - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - struct bio *bio; - - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - atomic_set(&rbio->error, 0); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * we might have no bios to read just because the pages - * were up to date, or we might have no bios to read because - * the devices were gone. - */ - if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { - __raid_recover_end_io(rbio); - return 0; - } else { - goto cleanup; - } - } - - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. 
- */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; - - if (trace_raid56_scrub_read_recover_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); - } - submit_bio(bio); - } - - return 0; - -cleanup: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return -EIO; -} - /* * the main entry point for reads from the higher layers. This * is really only called when the normal read path had a failure, @@ -2529,7 +2246,7 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, } } -int rmw_rbio(struct btrfs_raid_bio *rbio) +static int rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; @@ -2615,12 +2332,29 @@ write: return ret; } -static void rmw_work(struct work_struct *work) +static void rmw_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } +} + +static void rmw_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_rmw_stripe(rbio); + + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } /* diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0e77c77c5dba..445e833fcfcf 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -185,9 +185,4 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); -/* - * Placeholder definition to avoid warning, will be removed when - * the full write path is migrated. - */ -int rmw_rbio(struct btrfs_raid_bio *rbio); #endif -- cgit From cb3450b7d7d0af6ed6ff60e174129938914083ab Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:10 +0800 Subject: btrfs: raid56: extract scrub read bio list assembly code into a helper Just like what we did for write/recovery, also extract the read bio assembly code into a helper for scrub. The differences between the three are: - rmw_assemble_read_bios() only submits reads for missing sectors Thus it will skip cached sectors, but will also read sectors which are not covered by any full stripe (for cache usage) - recover_assemble_read_bios() reads every sector which has not failed - scrub_assemble_read_bios() has an extra check for vertical stripes It's mostly the same as rmw_assemble_read_bios(), but will skip sectors which are not covered by a vertical stripe.
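Conceptually, the three helpers are one loop parameterized by a skip condition. A hedged userspace C sketch of that view; every struct, field, and value below is invented for the example and does not match the kernel's data structures:

#include <stdbool.h>
#include <stdio.h>

enum mode { RMW, RECOVER, SCRUB };

/* Stand-ins for the per-sector state the real code consults. */
struct sector { bool cached; bool failed; bool in_vertical_stripe; };

/* Each mode reads everything except what its own rules let it skip. */
static bool skip_sector(enum mode m, const struct sector *s)
{
	switch (m) {
	case RMW:	return s->cached;	/* cached data can be reused */
	case RECOVER:	return s->failed;	/* cannot read a failed device */
	case SCRUB:	return s->cached || !s->in_vertical_stripe;
	}
	return true;
}

int main(void)
{
	struct sector stripe[3] = {
		{ .cached = true,  .failed = false, .in_vertical_stripe = true  },
		{ .cached = false, .failed = true,  .in_vertical_stripe = true  },
		{ .cached = false, .failed = false, .in_vertical_stripe = false },
	};

	for (int m = RMW; m <= SCRUB; m++)
		for (int i = 0; i < 3; i++)
			if (!skip_sector(m, &stripe[i]))
				printf("mode %d: read sector %d\n", m, i);
	return 0;
}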
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index ffedbfde95e0..fcac70ff5a78 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2707,21 +2707,15 @@ static void raid56_parity_scrub_end_io_work(struct work_struct *work) validate_rbio_for_parity_scrub(rbio); } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_essential_pages(rbio); - if (ret) - goto cleanup; + ASSERT(bio_list_size(bio_list) == 0); - atomic_set(&rbio->error, 0); /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2750,11 +2744,35 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; +} + +static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->error, 0); + ret = scrub_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; bios_to_read = bio_list_size(&bio_list); if (!bios_to_read) { -- cgit From 6bfd0133bee27737db415c530617cb015274d21f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:11 +0800 Subject: btrfs: raid56: switch scrub path to use a single function This switch involves the following changes: - Make finish_parity_scrub() only submit the write bios It will no longer call rbio_orig_end_io(), and now it will return an error. - Add a new helper, recover_scrub_rbio(), to handle recovery It just does extra scrub-related checks and then calls recover_sectors(). - Rename raid56_parity_scrub_stripe() to scrub_rbio() - Rename scrub_parity_work() to scrub_rbio_work_locked(), to follow the existing naming scheme.
- Delete unused functions Including: * finish_rmw() * raid_write_end_io() * raid56_bio_end_io() * __raid_recover_end_io() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 397 +++++++++++------------------------------------------- 1 file changed, 79 insertions(+), 318 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fcac70ff5a78..629237102189 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -64,7 +64,6 @@ struct sector_ptr { unsigned int uptodate:8; }; -static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); @@ -72,9 +71,8 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check); -static void scrub_parity_work(struct work_struct *work); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { @@ -819,7 +817,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); - start_async_work(next, scrub_parity_work); + start_async_work(next, scrub_rbio_work_locked); } goto done_nolock; @@ -880,35 +878,6 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) rbio_endio_bio_list(extra, err); } -/* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - int max_errors; - - if (err) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = BLK_STS_OK; - - /* OK, we have read all the stripes we need to. */ - max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bioc->max_errors; - if (atomic_read(&rbio->error) > max_errors) - err = BLK_STS_IOERR; - - rbio_orig_end_io(rbio, err); -} - /* * Get a sector pointer specified by its @stripe_nr and @sector_nr. * @@ -1319,87 +1288,6 @@ error: return -EIO; } -/* - * this is called from one of two situations. We either - * have a full stripe from the higher layers, or we've read all - * the missing bits off disk. - * - * This will calculate the parity and then send down any - * changed blocks. - */ -static noinline void finish_rmw(struct btrfs_raid_bio *rbio) -{ - /* The total sector number inside the full stripe. */ - /* Sector number inside a stripe. */ - int sectornr; - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); - - /* We should have at least one data sector. */ - ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); - - /* at this point we either have a full stripe, - * or we've read the full stripe from the drive. - * recalculate the parity and write the new results. - * - * We're not allowed to add any new bios to the - * bio list here, anyone else that wants to - * change this stripe needs to do their own rmw. 
- */ - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - - atomic_set(&rbio->error, 0); - - /* - * now that we've set rmw_locked, run through the - * bio list one last time and map the page pointers - * - * We don't cache full rbios because we're assuming - * the higher layers are unlikely to use this area of - * the disk again soon. If they do use it again, - * hopefully they will send another full bio. - */ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) - generate_pq_vertical(rbio, sectornr); - - ret = rmw_assemble_write_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); -} - /* * helper to find the stripe number for a given bio. Used to figure out which * stripe has failed. This expects the bio to correspond to a physical disk, @@ -1568,22 +1456,6 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, } } -static void raid56_bio_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - - bio_put(bio); - - if (atomic_dec_and_test(&rbio->stripes_pending)) - queue_work(rbio->bioc->fs_info->endio_raid56_workers, - &rbio->end_io_work); -} - static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -1968,60 +1840,6 @@ out: return ret; } -/* - * all parity reconstruction happens here. We've read in everything - * we can find from the drives and this does the heavy lifting of - * sorting the good from the bad. - */ -static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = recover_sectors(rbio); - - /* - * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a - * valid rbio which is consistent with ondisk content, thus such a - * valid rbio can be cached to avoid further disk reads. - */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - /* - * - In case of two failures, where rbio->failb != -1: - * - * Do not cache this rbio since the above read reconstruction - * (raid6_datap_recov() or raid6_2data_recov()) may have - * changed some content of stripes which are not identical to - * on-disk content any more, otherwise, a later write/recover - * may steal stripe_pages from this rbio and end up with - * corruptions or rebuild failures. - * - * - In case of single failure, where rbio->failb == -1: - * - * Cache this rbio iff the above read reconstruction is - * executed without problems. 
- */ - if (!ret && rbio->failb < 0) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } else if (!ret) { - rbio->faila = -1; - rbio->failb = -1; - - if (rbio->operation == BTRFS_RBIO_WRITE) - finish_rmw(rbio); - else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) - finish_parity_scrub(rbio, 0); - else - BUG(); - } else { - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } -} - static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -2449,8 +2267,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2493,7 +2310,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) - goto cleanup; + return -ENOMEM; p_sector.pgoff = 0; p_sector.uptodate = 1; @@ -2503,7 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!q_sector.page) { __free_page(p_sector.page); p_sector.page = NULL; - goto cleanup; + return -ENOMEM; } q_sector.pgoff = 0; q_sector.uptodate = 1; @@ -2590,33 +2407,13 @@ writeback: } submit_write: - nr_data = bio_list_size(&bio_list); - if (!nr_data) { - /* Every parity is right */ - rbio_orig_end_io(rbio, BLK_STS_OK); - return; - } - - atomic_set(&rbio->stripes_pending, nr_data); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_scrub_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; + submit_write_bios(rbio, &bio_list); + return 0; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); + return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2626,85 +2423,51 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) return 0; } -/* - * While we're doing the parity check and repair, we could have errors - * in reading pages off the disk. This checks for errors and if we're - * not able to read the page it'll trigger parity reconstruction. The - * parity scrub will be finished after we've reconstructed the failed - * stripes - */ -static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; - - if (rbio->faila >= 0 || rbio->failb >= 0) { - int dfail = 0, failp = -1; - - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; - - if (is_data_stripe(rbio, rbio->failb)) - dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; - - /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. - * (In the case of RAID5, we can not repair anything) - */ - if (dfail > rbio->bioc->max_errors - 1) - goto cleanup; + int dfail = 0, failp = -1; + int ret; - /* - * If all data is good, only parity is correctly, just - * repair the parity. 
- */ - if (dfail == 0) { - finish_parity_scrub(rbio, 0); - return; - } + /* No error case should be already handled by the caller. */ + ASSERT(rbio->faila >= 0 || rbio->failb >= 0); - /* - * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. - */ - if (failp != rbio->scrubp) - goto cleanup; + if (is_data_stripe(rbio, rbio->faila)) + dfail++; + else if (is_parity_stripe(rbio->faila)) + failp = rbio->faila; - __raid_recover_end_io(rbio); - } else { - finish_parity_scrub(rbio, 1); - } - return; + if (is_data_stripe(rbio, rbio->failb)) + dfail++; + else if (is_parity_stripe(rbio->failb)) + failp = rbio->failb; -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); -} + /* + * Because we can not use a scrubbing parity to repair + * the data, so the capability of the repair is declined. + * (In the case of RAID5, we can not repair anything) + */ + if (dfail > rbio->bioc->max_errors - 1) + return -EIO; -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_parity_scrub_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); + /* + * If all data is good, only parity is correctly, just + * repair the parity. + */ + if (dfail == 0) + return 0; /* - * This will normally call finish_rmw to start our write, but if there - * are any failed stripes we'll reconstruct from parity first + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity + * is scrubbing parity, luckily, use the other one to repair + * the data, or we can not repair the data stripe. */ - validate_rbio_for_parity_scrub(rbio); + if (failp != rbio->scrubp) + return -EIO; + + /* We have some corrupted sectors, need to repair them. */ + ret = recover_sectors(rbio); + return ret; } static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, @@ -2756,9 +2519,9 @@ error: return ret; } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static int scrub_rbio(struct btrfs_raid_bio *rbio) { - int bios_to_read = 0; + bool need_check = false; struct bio_list bio_list; int ret; struct bio *bio; @@ -2774,61 +2537,59 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) if (ret < 0) goto cleanup; - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + ret = -EIO; + goto cleanup; + } /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. 
+ * No error during read, can finish the scrub and need to verify the + * P/Q sectors; */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; + if (atomic_read(&rbio->error) == 0) { + need_check = true; + goto finish; + } - if (trace_raid56_scrub_read_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; + /* We have some failures, need to recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) + goto cleanup; - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read(rbio, bio, &trace_info); - } - submit_bio(bio); - } - /* the actual write will happen once the reads are done */ - return; +finish: + /* + * We have every sector properly prepared. Can finish the scrub + * and writeback the good content. + */ + ret = finish_parity_scrub(rbio, need_check); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) + ret = -EIO; + return ret; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); - return; - -finish: - validate_rbio_for_parity_scrub(rbio); + return ret; } -static void scrub_parity_work(struct work_struct *work) +static void scrub_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_parity_scrub_stripe(rbio); + ret = scrub_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) - start_async_work(rbio, scrub_parity_work); + start_async_work(rbio, scrub_rbio_work_locked); } /* The following code is used for dev replace of a missing RAID 5/6 device. */ -- cgit From 1a1a285139707f2430b3833103e152f07c66e63e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:12 +0800 Subject: btrfs: remove the unused endio_raid56_workers and btrfs_raid_bio::end_io_work Since we have switched all raid56 workloads to the submit-and-wait method, there is no use for the btrfs_fs_info::endio_raid56_workers workqueue and btrfs_raid_bio::end_io_work. Remove them to save some memory.
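The submit-and-wait scheme that made this workqueue unnecessary can be modeled in userspace with a condition variable standing in for the rbio wait queue. A rough sketch only; pthreads replace the kernel's atomics, wait_event() and wake_up(), and none of this is kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_wait = PTHREAD_COND_INITIALIZER;
static int stripes_pending;

/* Plays the role of the bio end-io callback: drop the pending count
 * and wake the submitter once the last "I/O" finishes. */
static void *fake_endio(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	if (--stripes_pending == 0)
		pthread_cond_signal(&io_wait);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t io[4];

	stripes_pending = 4;	/* "submit" four I/Os */
	for (int i = 0; i < 4; i++)
		pthread_create(&io[i], NULL, fake_endio, NULL);

	/* The submitter just waits; no separate completion work item. */
	pthread_mutex_lock(&lock);
	while (stripes_pending > 0)
		pthread_cond_wait(&io_wait, &lock);
	pthread_mutex_unlock(&lock);

	for (int i = 0; i < 4; i++)
		pthread_join(io[i], NULL);
	printf("all I/O done, continue in the submitting context\n");
	return 0;
}

Because completion handling now continues in the submitter's own context, there is no longer a hand-off to a dedicated endio workqueue, which is exactly why it could be deleted.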
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +----- fs/btrfs/fs.h | 1 - fs/btrfs/raid56.h | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f5f793af12a0..e5687bdff150 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2097,8 +2097,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); - if (fs_info->endio_raid56_workers) - destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); if (fs_info->compressed_write_workers) @@ -2304,8 +2302,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = alloc_workqueue("btrfs-endio-meta", flags, max_active); - fs_info->endio_raid56_workers = - alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, @@ -2327,7 +2323,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->qgroup_rescan_workers && diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index c7f2a512fba2..a749367e5ae2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -532,7 +532,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *flush_workers; struct workqueue_struct *endio_workers; struct workqueue_struct *endio_meta_workers; - struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 445e833fcfcf..9fae97b7a2a5 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -97,8 +97,6 @@ struct btrfs_raid_bio { wait_queue_head_t io_wait; - struct work_struct end_io_work; - /* Bitmap to record which horizontal stripe has data */ unsigned long dbitmap; -- cgit From 61ce908a3c260fbe37826a6cbd56636abeffcd28 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:41 +0000 Subject: btrfs: send: avoid unnecessary path allocations when finding extent clone When looking for an extent clone, at find_extent_clone(), we start by allocating a path and then check for cases where we can't have clones and exit immediately in those cases. It's a waste of time to allocate the path before those cases, so reorder the logic so that we check for those cases before allocating the path. 
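The reordering follows the usual early-return shape: do every cheap check before paying for an allocation. A small hypothetical C example of the same structure (the function and values are invented, not btrfs code):

#include <stdio.h>
#include <stdlib.h>

static int process(long offset, long size)
{
	/* Cheap rejections first: no allocation is wasted on them. */
	if (offset >= size)
		return 0;

	/* Allocate only once we know the work will actually happen. */
	char *buf = malloc(64);

	if (!buf)
		return -1;
	snprintf(buf, 64, "work on [%ld, %ld)", offset, size);
	puts(buf);
	free(buf);
	return 0;
}

int main(void)
{
	process(10, 5);		/* early return, no malloc */
	process(0, 5);		/* allocates and does the work */
	return 0;
}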
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3befc0d2d866..d1388d5dc951 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1365,40 +1365,35 @@ static int find_extent_clone(struct send_ctx *sctx, int compressed; u32 i; - tmp_path = alloc_path_for_send(); - if (!tmp_path) - return -ENOMEM; - - /* We only use this path under the commit sem */ - tmp_path->need_commit_sem = 0; - if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. * I at least had this in combination with snapshotting while * writing large files. */ - ret = 0; - goto out; + return 0; } - fi = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_file_extent_item); + fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -ENOENT; - goto out; - } - compressed = btrfs_file_extent_compression(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + return -ENOENT; - num_bytes = btrfs_file_extent_num_bytes(eb, fi); disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == 0) { - ret = -ENOENT; - goto out; - } + if (disk_byte == 0) + return -ENOENT; + + compressed = btrfs_file_extent_compression(eb, fi); + num_bytes = btrfs_file_extent_num_bytes(eb, fi); logical = disk_byte + btrfs_file_extent_offset(eb, fi); + tmp_path = alloc_path_for_send(); + if (!tmp_path) + return -ENOMEM; + + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + down_read(&fs_info->commit_root_sem); ret = extent_from_logical(fs_info, disk_byte, tmp_path, &found_key, &flags); -- cgit From d3f41317f0fedac6b0a3de68b3dab66b71923dd8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:42 +0000 Subject: btrfs: send: update comment at find_extent_clone() We have this unclear comment at find_extent_clone() about extents starting at a file offset greater than or equal to the i_size of the inode. It's not really informative and it's misleading, since it mentions the author found such extents with snapshots and large files. Such extents are a result of fallocate with FALLOC_FL_KEEP_SIZE and there is no relation to snapshots or large files (all write paths update the i_size before inserting a new file extent item). So update the comment to be precise about it and why we don't bother looking for clone sources in that case. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d1388d5dc951..4ce5c154f6d7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1365,14 +1365,14 @@ static int find_extent_clone(struct send_ctx *sctx, int compressed; u32 i; - if (data_offset >= ino_size) { - /* - * There may be extents that lie behind the file's size. - * I at least had this in combination with snapshotting while - * writing large files. - */ + /* + * With fallocate we can get prealloc extents beyond the inode's i_size, + * so we don't do anything here because clone operations can not clone + * to a range beyond i_size without increasing the i_size of the + * destination inode.
+ */ + if (data_offset >= ino_size) return 0; - } fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(eb, fi); -- cgit From 344174a1a68a55599d3a264db47d96583ff66473 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:43 +0000 Subject: btrfs: send: drop unnecessary backref context field initializations At find_extent_clone() we are initializing to zero the 'found_itself' and 'found' fields of the backref context before we use it, but we have already initialized the structure to zeroes when we declared it on the stack, so it's pointless to initialize those fields and they are unnecessarily increasing the object text size with two "mov" instructions (x86_64). Similarly, make the 'extent_len' initialization clearer by using an if-then-else instead of a double assignment to it in case the extent's end crosses the i_size boundary. Before this change: $ size fs/btrfs/send.o text data bss dec hex filename 68694 4252 16 72962 11d02 fs/btrfs/send.o After this change: $ size fs/btrfs/send.o text data bss dec hex filename 68678 4252 16 72946 11cf2 fs/btrfs/send.o Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ce5c154f6d7..7d289bdc6de1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1432,11 +1432,8 @@ static int find_extent_clone(struct send_ctx *sctx, } backref_ctx.sctx = sctx; - backref_ctx.found = 0; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; - backref_ctx.found_itself = 0; - backref_ctx.extent_len = num_bytes; /* * The last extent of a file may be too large due to page alignment. @@ -1445,6 +1442,8 @@ static int find_extent_clone(struct send_ctx *sctx, */ if (data_offset + num_bytes >= ino_size) backref_ctx.extent_len = ino_size - data_offset; + else + backref_ctx.extent_len = num_bytes; /* * Now collect all backrefs. -- cgit From 22a3c0ac8ed0043af209a15928ae4c4855b0a4c4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:44 +0000 Subject: btrfs: send: avoid unnecessary backref lookups when finding clone source At find_extent_clone(), unless we are given an inline extent, a file extent item that represents a hole, or an extent that starts beyond the i_size, we always do backref walking to look for clone sources, unless we have more than SEND_MAX_EXTENT_REFS (64) known references on the extent. However, if we know we only have one reference in the extent item and only one clone source (the send root), then it's pointless to do the backref walking to search for clone sources, as we can't clone from any other root. So skip the backref walking in that case. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT # Create an extent tree that's not too small and none of the # extents is shared. for ((i = 1; i <= 50000; i++)); do xfs_io -f -c "pwrite 0 4K" $MNT/file_$i > /dev/null echo -ne "\r$i files created..."
done echo btrfs subvolume snapshot -r $MNT $MNT/snap start=$(date +%s%N) btrfs send $MNT/snap > /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo -e "\nsend took $dur milliseconds" umount $MNT Before this change: send took 5389 milliseconds After this change: send took 4519 milliseconds (-16.1%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7d289bdc6de1..d9e01b66b32b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1354,6 +1354,7 @@ static int find_extent_clone(struct send_ctx *sctx, u64 disk_byte; u64 num_bytes; u64 extent_item_pos; + u64 extent_refs; u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; @@ -1408,14 +1409,22 @@ static int find_extent_clone(struct send_ctx *sctx, ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], struct btrfs_extent_item); + extent_refs = btrfs_extent_refs(tmp_path->nodes[0], ei); /* * Backreference walking (iterate_extent_inodes() below) is currently * too expensive when an extent has a large number of references, both * in time spent and used memory. So for now just fallback to write * operations instead of clone operations when an extent has more than * a certain amount of references. + * + * Also, if we have only one reference and only the send root as a clone + * source - meaning no clone roots were given in the struct + * btrfs_ioctl_send_args passed to the send ioctl - then it's our + * reference and there's no point in doing backref walking which is + * expensive, so exit early. */ - if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) { + if ((extent_refs == 1 && sctx->clone_roots_cnt == 1) || + extent_refs > SEND_MAX_EXTENT_REFS) { ret = -ENOENT; goto out; } -- cgit From c7499a64dcf62bf27968b0e9cce353c490b9ada1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:45 +0000 Subject: btrfs: send: optimize clone detection to increase extent sharing Currently send does not make the best decisions when it comes to deciding between multiple clone sources, which results in clone operations for partial extent ranges, which has the following disadvantages: 1) We get less shared extents at the destination; 2) We have to read more data during the send operation and emit more write commands. Besides not being optimal behaviour, it also breaks user expectations and is often reported by users, with a recent example in the Link tag at the bottom of this change log. Part of the reason for this non-optimal behaviour is that the backref walking code does not provide information about the length of the file extent items that were found for each backref, so send is blind about which backref is the best to choose as a cloning source. The other existing reasons are just silliness, namely always preferring the inode with the lowest number when multiple are found for the same root, and, when we can clone from multiple roots, always preferring the send root over any of the other clone roots. This does not make any sense since any inode or root is fine and as good as any other inode/root. Fix this by making backref walking pass information about the number of bytes referenced by each file extent item and then have send's backref callback pick the inode with the highest number of bytes for each root. Finally, select the root from which we can clone the most bytes.
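
For illustration, here is a small self-contained C sketch of that selection policy; struct clone_candidate and pick_clone_source() are made up for this log and only mirror the num_bytes comparison and the early break in the patch:

  #include <stdint.h>
  #include <stdio.h>

  struct clone_candidate {
      uint64_t root_id;
      uint64_t num_bytes;    /* bytes of the extent this backref covers */
  };

  static const struct clone_candidate *
  pick_clone_source(const struct clone_candidate *c, int nr, uint64_t extent_len)
  {
      const struct clone_candidate *best = NULL;

      for (int i = 0; i < nr; i++) {
          if (c[i].num_bytes == 0)
              continue;    /* no usable reference from this root */
          if (!best || c[i].num_bytes > best->num_bytes) {
              best = &c[i];
              /* Covers the full extent: cannot do better, stop. */
              if (best->num_bytes >= extent_len)
                  break;
          }
      }
      return best;
  }

  int main(void)
  {
      struct clone_candidate c[] = {
          { .root_id = 5,   .num_bytes = 1 << 20 },
          { .root_id = 257, .num_bytes = 2 << 20 },
      };
      const struct clone_candidate *best = pick_clone_source(c, 2, 2 << 20);

      printf("clone from root %llu\n", (unsigned long long)best->root_id);
      return 0;
  }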
Example reproducer: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT xfs_io -f -c "pwrite -S 0xab -b 2M 0 2M" $MNT/foo cp --reflink=always $MNT/foo $MNT/bar cp --reflink=always $MNT/foo $MNT/baz sync # Overwrite the second half of file foo. xfs_io -c "pwrite -S 0xcd -b 1M 1M 1M" $MNT/foo sync echo echo "*** fiemap in the original filesystem ***" echo xfs_io -c "fiemap -v" $MNT/foo xfs_io -c "fiemap -v" $MNT/bar xfs_io -c "fiemap -v" $MNT/baz echo btrfs filesystem du $MNT btrfs subvolume snapshot -r $MNT $MNT/snap btrfs send -f /tmp/send_stream $MNT/snap umount $MNT mkfs.btrfs -f $DEV &> /dev/null mount $DEV $MNT btrfs receive -f /tmp/send_stream $MNT echo echo "*** fiemap in the new filesystem ***" echo xfs_io -r -c "fiemap -v" $MNT/snap/foo xfs_io -r -c "fiemap -v" $MNT/snap/bar xfs_io -r -c "fiemap -v" $MNT/snap/baz echo btrfs filesystem du $MNT rm -f /tmp/send_stream rm -f /tmp/snap.fssum umount $MNT Before this change: $ ./test.sh (...) *** fiemap in the original filesystem *** /mnt/sdi/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 Total Exclusive Set shared Filename 2.00MiB 1.00MiB - /mnt/sdi/foo 2.00MiB 0.00B - /mnt/sdi/bar 2.00MiB 0.00B - /mnt/sdi/baz 6.00MiB 1.00MiB 2.00MiB /mnt/sdi Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap' At subvol /mnt/sdi/snap At subvol snap *** fiemap in the new filesystem *** /mnt/sdi/snap/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/snap/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 32768..34815 2048 0x1 Total Exclusive Set shared Filename 2.00MiB 0.00B - /mnt/sdi/snap/foo 2.00MiB 1.00MiB - /mnt/sdi/snap/bar 2.00MiB 1.00MiB - /mnt/sdi/snap/baz 6.00MiB 2.00MiB - /mnt/sdi/snap 6.00MiB 2.00MiB 2.00MiB /mnt/sdi We end up with two 1M extents that are not shared for files bar and baz. After this change: $ ./test.sh (...) 
*** fiemap in the original filesystem *** /mnt/sdi/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 Total Exclusive Set shared Filename 2.00MiB 1.00MiB - /mnt/sdi/foo 2.00MiB 0.00B - /mnt/sdi/bar 2.00MiB 0.00B - /mnt/sdi/baz 6.00MiB 1.00MiB 2.00MiB /mnt/sdi Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap' At subvol /mnt/sdi/snap At subvol snap *** fiemap in the new filesystem *** /mnt/sdi/snap/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x2001 /mnt/sdi/snap/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x2001 Total Exclusive Set shared Filename 2.00MiB 0.00B - /mnt/sdi/snap/foo 2.00MiB 0.00B - /mnt/sdi/snap/bar 2.00MiB 0.00B - /mnt/sdi/snap/baz 6.00MiB 0.00B - /mnt/sdi/snap 6.00MiB 0.00B 3.00MiB /mnt/sdi Now there's a much better sharing, files bar and baz share 1M of the extent of file foo and the second extent of files bar and baz is shared between themselves. This will later be turned into a test case for fstests. Link: https://lore.kernel.org/linux-btrfs/20221008005704.795b44b0@crass-HP-ZBook-15-G2/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 9 +++++---- fs/btrfs/backref.h | 4 ++-- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/send.c | 49 +++++++++++++++++++++++++++++++++---------------- 4 files changed, 42 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04cb608e7cfb..9be11a342de5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -27,6 +27,7 @@ struct extent_inode_elem { u64 inum; u64 offset; + u64 num_bytes; struct extent_inode_elem *next; }; @@ -37,6 +38,7 @@ static int check_extent_in_eb(const struct btrfs_key *key, struct extent_inode_elem **eie, bool ignore_offset) { + const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); u64 offset = 0; struct extent_inode_elem *e; @@ -45,10 +47,8 @@ static int check_extent_in_eb(const struct btrfs_key *key, !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; - u64 data_len; data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); if (extent_item_pos < data_offset || extent_item_pos >= data_offset + data_len) @@ -63,6 +63,7 @@ static int check_extent_in_eb(const struct btrfs_key *key, e->next = *eie; e->inum = key->objectid; e->offset = key->offset + offset; + e->num_bytes = data_len; *eie = e; return 0; @@ -2265,7 +2266,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu", extent_item_objectid, eie->inum, eie->offset, root); - ret = iterate(eie->inum, eie->offset, root, ctx); + ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx); if (ret) { btrfs_debug(fs_info, "stopping iteration for %llu due to ret=%d", @@ -2357,7 +2358,7 @@ out: return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx) { struct btrfs_data_container *inodes = ctx; const size_t c 
= 3 * sizeof(u64); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2dd68f87dca5..8d3598155f3b 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -75,8 +75,8 @@ struct btrfs_backref_share_check_ctx { int prev_extents_cache_slot; }; -typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, - void *ctx); +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *ctx); struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8c89e63ef2e8..3c22573bfe0d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -809,8 +809,8 @@ nomem: return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, - void *warn_ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) { u32 nlink; int ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9e01b66b32b..49759cd9eecb 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -76,7 +76,7 @@ struct clone_root { struct btrfs_root *root; u64 ino; u64 offset; - + u64 num_bytes; u64 found_refs; }; @@ -1273,7 +1273,8 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) * Called for every backref that is found for the current extent. * Results are collected in sctx->clone_roots->ino/offset/found_refs */ -static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) +static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, + void *ctx_) { struct backref_ctx *bctx = ctx_; struct clone_root *found; @@ -1318,15 +1319,17 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) bctx->found++; found->found_refs++; - if (ino < found->ino) { + + /* + * If the given backref refers to a file extent item with a larger + * number of bytes than what we found before, use the new one so that + * we clone more optimally and end up doing less writes and getting + * less exclusive, non-shared extents at the destination. + */ + if (num_bytes > found->num_bytes) { found->ino = ino; found->offset = offset; - } else if (found->ino == ino) { - /* - * same extent found more then once in the same file. - */ - if (found->offset > offset + bctx->extent_len) - found->offset = offset; + found->num_bytes = num_bytes; } return 0; @@ -1437,6 +1440,7 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root = sctx->clone_roots + i; cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; + cur_clone_root->num_bytes = 0; cur_clone_root->found_refs = 0; } @@ -1506,14 +1510,27 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { - if (sctx->clone_roots[i].found_refs) { - if (!cur_clone_root) - cur_clone_root = sctx->clone_roots + i; - else if (sctx->clone_roots[i].root == sctx->send_root) - /* prefer clones from send_root over others */ - cur_clone_root = sctx->clone_roots + i; - } + struct clone_root *clone_root = &sctx->clone_roots[i]; + if (clone_root->found_refs == 0) + continue; + + /* + * Choose the root from which we can clone more bytes, to + * minimize write operations and therefore have more extent + * sharing at the destination (the same as in the source). 
+ */ + if (!cur_clone_root || + clone_root->num_bytes > cur_clone_root->num_bytes) { + cur_clone_root = clone_root; + + /* + * We found an optimal clone candidate (any inode from + * any root is fine), so we're done. + */ + if (clone_root->num_bytes >= backref_ctx.extent_len) + break; + } } if (cur_clone_root) { -- cgit From 6ce6ba534418132f4c727d5707fe2794c797299c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:46 +0000 Subject: btrfs: use a single argument for extent offset in backref walking functions The interface for find_parent_nodes() has two extent offset related arguments: 1) One u64 pointer argument for the extent offset; 2) One boolean argument to tell if the extent offset should be ignored or not. These are confusing, because the extent offset pointer can be NULL and in some cases callers pass a NULL value as a way to tell the backref walking code to ignore offsets in file extent items (and simply consider all file extent items that point to the target data extent). The boolean argument was added in commit c995ab3cda3f ("btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents"), but it was never really necessary, since it was enough to arrange for a NULL value to be passed as the "extent_item_pos" argument of find_parent_nodes(). The arguments are also passed to functions called by find_parent_nodes() and respective helper functions, which further makes everything more complicated than needed. Then we have several backref walking related functions that end up calling find_parent_nodes(), either directly or through some other function that they call, and for many we have to use an "extent_item_pos" (u64) argument and a boolean "ignore_offset" argument too. This is confusing and not really necessary. So use a single argument to specify the extent offset, as a simple u64 and not as a pointer, but using a special value of (u64)-1, defined as a documented constant, to indicate when the extent offset should be ignored. This is also preparation work for the upcoming patches in the series that add other arguments to find_parent_nodes() and other related functions that use it.
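
As a generic, stand-alone sketch of this cleanup (names invented, not the btrfs API): a single u64 with a documented all-ones sentinel replaces the nullable pointer plus boolean flag:

  #include <stdint.h>
  #include <stdio.h>

  /* Plays the BTRFS_IGNORE_EXTENT_OFFSET role: "no offset filtering". */
  #define IGNORE_OFFSET ((uint64_t)-1)

  /* Old shape: two ways to say "ignore" (NULL pointer or flag set). */
  static int walk_old(const uint64_t *offset, int ignore_offset)
  {
      return (!offset || ignore_offset) ? -1 : (int)*offset;
  }

  /* New shape: one argument, one documented sentinel value. */
  static int walk_new(uint64_t offset)
  {
      return (offset == IGNORE_OFFSET) ? -1 : (int)offset;
  }

  int main(void)
  {
      uint64_t off = 42;

      printf("old: %d %d\n", walk_old(&off, 0), walk_old(NULL, 0));
      printf("new: %d %d\n", walk_new(off), walk_new(IGNORE_OFFSET));
      return 0;
  }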
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 87 +++++++++++++++++++++++++-------------------------- fs/btrfs/backref.h | 12 +++++-- fs/btrfs/relocation.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/send.c | 2 +- 5 files changed, 54 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9be11a342de5..432064ee788e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -35,15 +35,13 @@ static int check_extent_in_eb(const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) + struct extent_inode_elem **eie) { const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); u64 offset = 0; struct extent_inode_elem *e; - if (!ignore_offset && - !btrfs_file_extent_compression(eb, fi) && + if (!btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -81,8 +79,7 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) static int find_extent_in_eb(const struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) + struct extent_inode_elem **eie) { u64 disk_byte; struct btrfs_key key; @@ -111,7 +108,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb, if (disk_byte != wanted_disk_byte) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); if (ret < 0) return ret; } @@ -450,9 +447,9 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, - int level, u64 time_seq, const u64 *extent_item_pos, - bool ignore_offset) + int level, u64 time_seq, u64 extent_item_pos) { + const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); int ret = 0; int slot; struct extent_buffer *eb; @@ -528,10 +525,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, count++; else goto next; - if (extent_item_pos) { + if (!ignore_offset) { ret = check_extent_in_eb(&key, eb, fi, - *extent_item_pos, - &eie, ignore_offset); + extent_item_pos, &eie); if (ret < 0) break; } @@ -541,7 +537,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && extent_item_pos) { + if (!ret && !ignore_offset) { while (old->next) old = old->next; old->next = eie; @@ -570,7 +566,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, bool ignore_offset) + u64 extent_item_pos) { struct btrfs_root *root; struct extent_buffer *eb; @@ -664,7 +660,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos, ignore_offset); + time_seq, extent_item_pos); out: btrfs_put_root(root); out_free: @@ -712,8 +708,8 @@ static void free_leaf_list(struct ulist *ulist) static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, - const u64 *extent_item_pos, - struct 
share_check *sc, bool ignore_offset) + u64 extent_item_pos, + struct share_check *sc) { int err; int ret = 0; @@ -756,8 +752,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, goto out; } err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, - ref, parents, extent_item_pos, - ignore_offset); + ref, parents, extent_item_pos); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -1340,18 +1335,21 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * * Otherwise this returns 0 for success and <0 for an error. * - * If ignore_offset is set to false, only extent refs whose offsets match - * extent_item_pos are returned. If true, every extent ref is returned - * and extent_item_pos is ignored. + * @extent_item_pos is meaningful only if we are dealing with a data extent. + * If its value is not BTRFS_IGNORE_EXTENT_OFFSET, then only collect references + * from file extent items that refer to a section of the data extent that + * contains @extent_item_pos. If its value is BTRFS_IGNORE_EXTENT_OFFSET then + * collect references for every file extent item that points to the data extent. * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, - struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc, bool ignore_offset) + struct ulist *roots, u64 extent_item_pos, + struct share_check *sc) { + const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct btrfs_path *path; @@ -1539,7 +1537,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, sc, ignore_offset); + extent_item_pos, sc); if (ret) goto out; @@ -1573,8 +1571,7 @@ again: goto out; } if (ref->count && ref->parent) { - if (extent_item_pos && !ref->inode_list && - ref->level == 0) { + if (!ignore_offset && !ref->inode_list && ref->level == 0) { struct extent_buffer *eb; eb = read_tree_block(fs_info, ref->parent, 0, @@ -1592,7 +1589,7 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); ret = find_extent_in_eb(eb, bytenr, - *extent_item_pos, &eie, ignore_offset); + extent_item_pos, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); @@ -1611,7 +1608,7 @@ again: (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && extent_item_pos) { + if (!ret && !ignore_offset) { /* * We've recorded that parent, so we must extend * its inode list here. 
@@ -1665,7 +1662,7 @@ out: int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset) + u64 extent_item_pos) { int ret; @@ -1674,7 +1671,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL, ignore_offset); + *leafs, NULL, extent_item_pos, NULL); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1698,8 +1695,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, - bool ignore_offset) + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1718,7 +1714,8 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL, ignore_offset); + tmp, *roots, BTRFS_IGNORE_EXTENT_OFFSET, + NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1745,8 +1742,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!trans && !skip_commit_root_sem) down_read(&fs_info->commit_root_sem); - ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots, false); + ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, time_seq, roots); if (!trans && !skip_commit_root_sem) up_read(&fs_info->commit_root_sem); return ret; @@ -1845,7 +1841,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool cached; ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, - NULL, NULL, &shared, false); + NULL, BTRFS_IGNORE_EXTENT_OFFSET, &shared); if (ret == BACKREF_FOUND_SHARED || ret == BACKREF_FOUND_NOT_SHARED) { /* If shared must return 1, otherwise return 0. 
*/ @@ -2286,8 +2282,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_item_pos, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) + iterate_extent_inodes_t *iterate, void *ctx) { int ret; struct btrfs_trans_handle *trans = NULL; @@ -2318,16 +2313,14 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, - &extent_item_pos, ignore_offset); + seq_elem.seq, &refs, extent_item_pos); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - seq_elem.seq, &roots, - ignore_offset); + seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -2395,10 +2388,14 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; - extent_item_pos = logical - found_key.objectid; + if (ignore_offset) + extent_item_pos = BTRFS_IGNORE_EXTENT_OFFSET; + else + extent_item_pos = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - build_ino_list, ctx, ignore_offset); + build_ino_list, ctx); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 8d3598155f3b..2eb99f23cc8f 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -12,6 +12,13 @@ #include "disk-io.h" #include "extent_io.h" +/* + * Pass to backref walking functions to tell them to include references from + * all file extent items that point to the target data extent, regardless if + * they refer to the whole extent or just sections of it (bookend extents). 
+ */ +#define BTRFS_IGNORE_EXTENT_OFFSET ((u64)-1) + struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; @@ -92,8 +99,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset); + iterate_extent_inodes_t *iterate, void *ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, @@ -104,7 +110,7 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset); + u64 extent_item_pos); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d119986d1599..45690f7b5900 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3417,7 +3417,7 @@ int add_data_references(struct reloc_control *rc, btrfs_release_path(path); ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, - 0, &leaves, NULL, true); + 0, &leaves, BTRFS_IGNORE_EXTENT_OFFSET); if (ret < 0) return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3c22573bfe0d..6c7dc89709fc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -969,7 +969,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, - scrub_print_warning_inode, &swarn, false); + scrub_print_warning_inode, &swarn); } out: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 49759cd9eecb..6bf06939f891 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1467,7 +1467,7 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - &backref_ctx, false); + &backref_ctx); if (ret < 0) goto out; -- cgit From a2c8d27e5ee810b7149b42b88ddf7298e5b8dfe0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:47 +0000 Subject: btrfs: use a structure to pass arguments to backref walking functions The public backref walking functions have quite a lot of arguments that are passed down the call stack to find_parent_nodes(), the core function of the backref walking code. The next patches in the series will need to add even more arguments to these functions that should be passed not only to find_parent_nodes(), but also to other functions used by the latter (directly or even lower in the call stack). So create a structure to hold all these arguments and state used by the main backref walking function, find_parent_nodes(), and use it as the argument for the public backref walking functions iterate_extent_inodes(), btrfs_find_all_leafs() and btrfs_find_all_roots().
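
A stand-alone sketch of the idea (struct walk_ctx and do_walk() are invented for this log): bundle the long argument list into a context object that is zero-initialized at the call site, so future fields do not ripple through every function signature in the chain:

  #include <stdint.h>
  #include <stdio.h>

  struct walk_ctx {
      uint64_t bytenr;             /* extent to walk, set by the caller */
      uint64_t extent_item_pos;    /* optional offset filter */
      int ignore_extent_item_pos;  /* disable the filter entirely */
      /* Results filled in by the walker. */
      unsigned long nr_refs;
  };

  /* One pointer travels down the whole call chain instead of 5+ arguments. */
  static int do_walk(struct walk_ctx *ctx)
  {
      /* Pretend result, just to show the fields being consumed and filled. */
      ctx->nr_refs = ctx->ignore_extent_item_pos ? 2 : 1;
      return 0;
  }

  int main(void)
  {
      struct walk_ctx ctx = { 0 };    /* same zero-init idiom as the patch */

      ctx.bytenr = 13631488;
      ctx.ignore_extent_item_pos = 1;
      if (do_walk(&ctx) == 0)
          printf("extent %llu: %lu references\n",
                 (unsigned long long)ctx.bytenr, ctx.nr_refs);
      return 0;
  }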
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 343 ++++++++++++++++++++++-------------------- fs/btrfs/backref.h | 76 ++++++++-- fs/btrfs/qgroup.c | 38 +++-- fs/btrfs/relocation.c | 19 ++- fs/btrfs/scrub.c | 14 +- fs/btrfs/send.c | 15 +- fs/btrfs/tests/qgroup-tests.c | 50 ++++-- 7 files changed, 328 insertions(+), 227 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 432064ee788e..a1a00a7bdba5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -444,12 +444,12 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) return 0; } -static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, +static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, - int level, u64 time_seq, u64 extent_item_pos) + int level) { - const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); int ret = 0; int slot; struct extent_buffer *eb; @@ -484,10 +484,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); } while (!ret && count < ref->count) { @@ -508,10 +508,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); @@ -525,9 +525,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, count++; else goto next; - if (!ignore_offset) { + if (!ctx->ignore_extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, - extent_item_pos, &eie); + ctx->extent_item_pos, &eie); if (ret < 0) break; } @@ -537,7 +537,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && !ignore_offset) { + if (!ret && !ctx->ignore_extent_item_pos) { while (old->next) old = old->next; old->next = eie; @@ -545,10 +545,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else - ret = btrfs_next_old_item(root, path, time_seq); + ret = btrfs_next_old_item(root, path, ctx->time_seq); } if (ret > 0) @@ -562,11 +562,10 @@ next: * resolve an indirect backref in the form (root_id, key, level) * to a logical address */ -static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - struct prelim_ref *ref, struct ulist *parents, - u64 extent_item_pos) + struct prelim_ref *ref, struct ulist *parents) { struct btrfs_root *root; struct extent_buffer *eb; 
@@ -584,9 +583,9 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, * here. */ if (path->search_commit_root) - root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id); + root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id); else - root = btrfs_get_fs_root(fs_info, ref->root_id, false); + root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; @@ -598,17 +597,17 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_is_testing(fs_info)) { + if (btrfs_is_testing(ctx->fs_info)) { ret = -ENOENT; goto out; } if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == BTRFS_SEQ_LAST) + else if (ctx->time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else - root_level = btrfs_old_root_level(root, time_seq); + root_level = btrfs_old_root_level(root, ctx->time_seq); if (root_level + 1 == level) goto out; @@ -636,12 +635,12 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &search_key, path, time_seq); + ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); - btrfs_debug(fs_info, + btrfs_debug(ctx->fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", ref->root_id, level, ref->count, ret, ref->key_for_search.objectid, ref->key_for_search.type, @@ -659,8 +658,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos); + ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level); out: btrfs_put_root(root); out_free: @@ -705,10 +703,9 @@ static void free_leaf_list(struct ulist *ulist) * rbtree as they are encountered. The new backrefs are subsequently * resolved as above. */ -static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - u64 extent_item_pos, struct share_check *sc) { int err; @@ -751,14 +748,13 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, - ref, parents, extent_item_pos); + err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. */ if (err == -ENOENT) { - prelim_ref_insert(fs_info, &preftrees->direct, ref, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; } else if (err) { @@ -787,7 +783,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; new_ref->inode_list = unode_aux_to_inode_list(node); - prelim_ref_insert(fs_info, &preftrees->direct, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, new_ref, NULL); } @@ -795,7 +791,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. 
*/ - prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); ulist_reinit(parents); cond_resched(); @@ -1325,32 +1321,18 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * - * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and - * behave much like trans == NULL case, the difference only lies in it will not - * commit root. - * The special case is for qgroup to search roots in commit_transaction(). - * - * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a - * shared extent is detected. + * @ctx: Backref walking context object, must be not NULL. + * @sc: If !NULL, then immediately return BACKREF_FOUND_SHARED when a + * shared extent is detected. * * Otherwise this returns 0 for success and <0 for an error. * - * @extent_item_pos is meaningful only if we are dealing with a data extent. - * If its value is not BTRFS_IGNORE_EXTENT_OFFSET, then only collect references - * from file extent items that refer to a section of the data extent that - * contains @extent_item_pos. If its value is BTRFS_IGNORE_EXTENT_OFFSET then - * collect references for every file extent item that points to the data extent. - * * FIXME some caching might speed things up */ -static int find_parent_nodes(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist *refs, - struct ulist *roots, u64 extent_item_pos, +static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, struct share_check *sc) { - const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); - struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); + struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1368,11 +1350,11 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, /* Roots ulist is not needed when using a sharedness check context. */ if (sc) - ASSERT(roots == NULL); + ASSERT(ctx->roots == NULL); - key.objectid = bytenr; + key.objectid = ctx->bytenr; key.offset = (u64)-1; - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1380,12 +1362,12 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) { + if (!ctx->trans) { path->search_commit_root = 1; path->skip_locking = 1; } - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; again: @@ -1401,17 +1383,17 @@ again: goto out; } - if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != BTRFS_SEQ_LAST) { + if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) && + ctx->time_seq != BTRFS_SEQ_LAST) { /* * We have a specific time_seq we care about and trans which * means we have the path lock, we need to grab the ref head and * lock it so we have a consistent view of the refs at the given * time. 
*/ - delayed_refs = &trans->transaction->delayed_refs; + delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -1429,7 +1411,7 @@ again: goto again; } spin_unlock(&delayed_refs->lock); - ret = add_delayed_refs(fs_info, head, time_seq, + ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq, &preftrees, sc); mutex_unlock(&head->mutex); if (ret) @@ -1447,14 +1429,14 @@ again: leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid == bytenr && + if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(fs_info, path, bytenr, + ret = add_inline_refs(ctx->fs_info, path, ctx->bytenr, &info_level, &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(root, path, bytenr, info_level, + ret = add_keyed_refs(root, path, ctx->bytenr, info_level, &preftrees, sc); if (ret) goto out; @@ -1483,7 +1465,7 @@ again: * extent item pointing to the data extent) is shared, that is, if any * of the extent buffers in the path is referenced by other trees. */ - if (sc && bytenr == sc->data_bytenr) { + if (sc && ctx->bytenr == sc->data_bytenr) { /* * If our data extent is from a generation more recent than the * last generation used to snapshot the root, then we know that @@ -1530,14 +1512,13 @@ again: btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); + ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); - ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, sc); + ret = resolve_indirect_refs(ctx, path, &preftrees, sc); if (ret) goto out; @@ -1564,17 +1545,18 @@ again: * e.g. different offsets would not be merged, * and would retain their original ref->count < 0. */ - if (roots && ref->count && ref->root_id && ref->parent == 0) { + if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ - ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) goto out; } if (ref->count && ref->parent) { - if (!ignore_offset && !ref->inode_list && ref->level == 0) { + if (!ctx->ignore_extent_item_pos && !ref->inode_list && + ref->level == 0) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref->parent, 0, + eb = read_tree_block(ctx->fs_info, ref->parent, 0, 0, ref->level, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); @@ -1588,8 +1570,8 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); - ret = find_extent_in_eb(eb, bytenr, - extent_item_pos, &eie); + ret = find_extent_in_eb(eb, ctx->bytenr, + ctx->extent_item_pos, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); @@ -1603,12 +1585,12 @@ again: */ eie = NULL; } - ret = ulist_add_merge_ptr(refs, ref->parent, + ret = ulist_add_merge_ptr(ctx->refs, ref->parent, ref->inode_list, (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && !ignore_offset) { + if (!ret && !ctx->ignore_extent_item_pos) { /* * We've recorded that parent, so we must extend * its inode list here. 
@@ -1652,28 +1634,29 @@ out: } /* - * Finds all leafs with a reference to the specified combination of bytenr and - * offset. key_list_head will point to a list of corresponding keys (caller must - * free each list element). The leafs will be stored in the leafs ulist, which - * must be freed with ulist_free. + * Finds all leaves with a reference to the specified combination of + * @ctx->bytenr and @ctx->extent_item_pos. The bytenrs of the found leaves are + * added to the ulist at @ctx->refs, and that ulist is allocated by this + * function. The caller should free the ulist with free_leaf_list() if + * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is + * enough. * - * returns 0 on success, <0 on error + * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. */ -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - u64 extent_item_pos) +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) { int ret; - *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL); + ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - free_leaf_list(*leafs); + free_leaf_list(ctx->refs); + ctx->refs = NULL; return ret; } @@ -1681,7 +1664,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, } /* - * walk all backrefs for a given extent to find all roots that reference this + * Walk all backrefs for a given extent to find all roots that reference this * extent. Walking a backref means finding all extents that reference this * extent and in turn walk the backrefs of those, too. Naturally this is a * recursive process, but here it is implemented in an iterative fashion: We @@ -1689,62 +1672,74 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * list. In turn, we find all referencing extents for those, further appending * to the list. The way we iterate the list allows adding more elements after * the current while iterating. The process stops when we reach the end of the - * list. Found roots are added to the roots list. + * list. + * + * Found roots are added to @ctx->roots, which is allocated by this function and + * @ctx->roots should be NULL when calling this function. This function also + * requires @ctx->refs to be NULL, as it uses it for allocating a ulist to do + * temporary work, and frees it before returning. * - * returns 0 on success, < 0 on error. + * Returns 0 on success, < 0 on error. On error @ctx->roots is always NULL.
*/ -static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) +static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { - struct ulist *tmp; - struct ulist_node *node = NULL; + const u64 orig_bytenr = ctx->bytenr; + const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; struct ulist_iterator uiter; - int ret; + int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) + ASSERT(ctx->refs == NULL); + ASSERT(ctx->roots == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; - *roots = ulist_alloc(GFP_NOFS); - if (!*roots) { - ulist_free(tmp); + + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; return -ENOMEM; } + ctx->ignore_extent_item_pos = true; + ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, BTRFS_IGNORE_EXTENT_OFFSET, - NULL); + struct ulist_node *node; + + ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - ulist_free(tmp); - ulist_free(*roots); - *roots = NULL; - return ret; + ulist_free(ctx->roots); + ctx->roots = NULL; + break; } - node = ulist_next(tmp, &uiter); + ret = 0; + node = ulist_next(ctx->refs, &uiter); if (!node) break; - bytenr = node->val; + ctx->bytenr = node->val; cond_resched(); } - ulist_free(tmp); - return 0; + ulist_free(ctx->refs); + ctx->refs = NULL; + ctx->bytenr = orig_bytenr; + ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos; + + return ret; } -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem) { int ret; - if (!trans && !skip_commit_root_sem) - down_read(&fs_info->commit_root_sem); - ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, time_seq, roots); - if (!trans && !skip_commit_root_sem) - up_read(&fs_info->commit_root_sem); + if (!ctx->trans && !skip_commit_root_sem) + down_read(&ctx->fs_info->commit_root_sem); + ret = btrfs_find_all_roots_safe(ctx); + if (!ctx->trans && !skip_commit_root_sem) + up_read(&ctx->fs_info->commit_root_sem); return ret; } @@ -1794,6 +1789,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct btrfs_backref_share_check_ctx *ctx) { + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; @@ -1830,8 +1826,14 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, down_read(&fs_info->commit_root_sem); } else { btrfs_get_tree_mod_seq(fs_info, &elem); + walk_ctx.time_seq = elem.seq; } + walk_ctx.ignore_extent_item_pos = true; + walk_ctx.trans = trans; + walk_ctx.fs_info = fs_info; + walk_ctx.refs = &ctx->refs; + /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); @@ -1840,8 +1842,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool is_shared; bool cached; - ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, - NULL, BTRFS_IGNORE_EXTENT_OFFSET, &shared); + walk_ctx.bytenr = bytenr; + ret = find_parent_nodes(&walk_ctx, &shared); if (ret == BACKREF_FOUND_SHARED || ret == BACKREF_FOUND_NOT_SHARED) { /* If shared must return 1, otherwise return 0. 
*/ @@ -2279,73 +2281,81 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, * the given parameters. * when the iterator function returns a non-zero value, iteration stops. */ -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, u64 extent_item_pos, - int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx) +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx) { int ret; - struct btrfs_trans_handle *trans = NULL; - struct ulist *refs = NULL; - struct ulist *roots = NULL; - struct ulist_node *ref_node = NULL; - struct ulist_node *root_node = NULL; + struct ulist *refs; + struct ulist_node *ref_node; struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; - struct ulist_iterator root_uiter; - btrfs_debug(fs_info, "resolving all inodes for extent %llu", - extent_item_objectid); + btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu", + ctx->bytenr); + + ASSERT(ctx->trans == NULL); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->tree_root); + struct btrfs_trans_handle *trans; + + trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); trans = NULL; } + ctx->trans = trans; } - if (trans) - btrfs_get_tree_mod_seq(fs_info, &seq_elem); - else - down_read(&fs_info->commit_root_sem); + if (ctx->trans) { + btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem); + ctx->time_seq = seq_elem.seq; + } else { + down_read(&ctx->fs_info->commit_root_sem); + } - ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, extent_item_pos); + ret = btrfs_find_all_leafs(ctx); if (ret) goto out; + refs = ctx->refs; + ctx->refs = NULL; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - seq_elem.seq, &roots); + struct ulist_node *root_node; + struct ulist_iterator root_uiter; + + ctx->bytenr = ref_node->val; + ret = btrfs_find_all_roots_safe(ctx); if (ret) break; + ULIST_ITER_INIT(&root_uiter); - while (!ret && (root_node = ulist_next(roots, &root_uiter))) { - btrfs_debug(fs_info, + while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { + btrfs_debug(ctx->fs_info, "root %llu references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); - ret = iterate_leaf_refs(fs_info, + ret = iterate_leaf_refs(ctx->fs_info, (struct extent_inode_elem *) (uintptr_t)ref_node->aux, - root_node->val, - extent_item_objectid, - iterate, ctx); + root_node->val, ctx->bytenr, + iterate, user_ctx); } - ulist_free(roots); + ulist_free(ctx->roots); + ctx->roots = NULL; } free_leaf_list(refs); out: - if (trans) { - btrfs_put_tree_mod_seq(fs_info, &seq_elem); - btrfs_end_transaction(trans); + if (ctx->trans) { + btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem); + btrfs_end_transaction(ctx->trans); + ctx->trans = NULL; } else { - up_read(&fs_info->commit_root_sem); + up_read(&ctx->fs_info->commit_root_sem); } return ret; @@ -2375,8 +2385,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, bool ignore_offset) { + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; int ret; - u64 extent_item_pos; u64 flags = 0; struct btrfs_key found_key; int search_commit_root = path->search_commit_root; @@ -2388,16 
+2398,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; + walk_ctx.bytenr = found_key.objectid; if (ignore_offset) - extent_item_pos = BTRFS_IGNORE_EXTENT_OFFSET; + walk_ctx.ignore_extent_item_pos = true; else - extent_item_pos = logical - found_key.objectid; + walk_ctx.extent_item_pos = logical - found_key.objectid; + walk_ctx.fs_info = fs_info; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, search_commit_root, - build_ino_list, ctx); - - return ret; + return iterate_extent_inodes(&walk_ctx, search_commit_root, + build_ino_list, ctx); } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2eb99f23cc8f..ce700dfcc016 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -13,11 +13,63 @@ #include "extent_io.h" /* - * Pass to backref walking functions to tell them to include references from - * all file extent items that point to the target data extent, regardless if - * they refer to the whole extent or just sections of it (bookend extents). + * Context and arguments for backref walking functions. Some of the fields are + * to be filled by the caller of such functions while other are filled by the + * functions themselves, as described below. */ -#define BTRFS_IGNORE_EXTENT_OFFSET ((u64)-1) +struct btrfs_backref_walk_ctx { + /* + * The address of the extent for which we are doing backref walking. + * Can be either a data extent or a metadata extent. + * + * Must always be set by the top level caller. + */ + u64 bytenr; + /* + * Offset relative to the target extent. This is only used for data + * extents, and it's meaningful because we can have file extent items + * that point only to a section of a data extent ("bookend" extents), + * and we want to filter out any that don't point to a section of the + * data extent containing the given offset. + * + * Must always be set by the top level caller. + */ + u64 extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then references from + * all file extent items that point to the data extent are considered, + * @extent_item_pos is ignored. + */ + bool ignore_extent_item_pos; + /* A valid transaction handle or NULL. */ + struct btrfs_trans_handle *trans; + /* + * The file system's info object, can not be NULL. + * + * Must always be set by the top level caller. + */ + struct btrfs_fs_info *fs_info; + /* + * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the + * caller joined the tree mod log to get a consistent view of b+trees + * while we do backref walking, or BTRFS_SEQ_LAST. + * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses + * commit roots when searching b+trees - this is a special case for + * qgroups used during a transaction commit. + */ + u64 time_seq; + /* + * Used to collect the bytenr of metadata extents that point to the + * target extent. + */ + struct ulist *refs; + /* + * List used to collect the IDs of the roots from which the target + * extent is accessible. Can be NULL in case the caller does not care + * about collecting root IDs. 
+ */ + struct ulist *roots; +}; struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -96,10 +148,9 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_key *key, struct btrfs_extent_item *ei, u32 item_size, u64 *out_root, u8 *out_level); -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, - u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx); +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, @@ -107,13 +158,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - u64 extent_item_pos); -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx); +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 75dd964ca46c..24c013c61a94 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1794,8 +1794,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { - struct ulist *old_root; - u64 bytenr = qrecord->bytenr; + struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; /* @@ -1822,8 +1821,10 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, - true); + ctx.bytenr = qrecord->bytenr; + ctx.fs_info = trans->fs_info; + + ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { qgroup_mark_inconsistent(trans->fs_info); btrfs_warn(trans->fs_info, @@ -1839,7 +1840,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * * So modifying qrecord->old_roots is safe here */ - qrecord->old_roots = old_root; + qrecord->old_roots = ctx.roots; return 0; } @@ -2750,17 +2751,22 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + + ctx.bytenr = record->bytenr; + ctx.fs_info = fs_info; + /* * Old roots should be searched when inserting qgroup * extent record */ if (WARN_ON(!record->old_roots)) { /* Search commit root to find old_roots */ - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, - &record->old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + record->old_roots = ctx.roots; + ctx.roots = NULL; } /* Free the reserved data space */ @@ -2773,10 +2779,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) * which doesn't lock tree or delayed_refs and search * current root. It's safe inside commit_transaction(). 
*/ - ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); + ctx.trans = trans; + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + new_roots = ctx.roots; if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); ulist_del(record->old_roots, qgroup_to_skip, @@ -3242,7 +3249,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; - struct ulist *roots = NULL; u64 num_bytes; bool done; int slot; @@ -3292,6 +3298,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -3301,13 +3309,15 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, else num_bytes = found.offset; - ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots, false); + ctx.bytenr = found.objectid; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ ret = btrfs_qgroup_account_extent(trans, found.objectid, - num_bytes, NULL, roots); + num_bytes, NULL, ctx.roots); if (ret < 0) goto out; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 45690f7b5900..2ecca24e1001 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3408,24 +3408,27 @@ int add_data_references(struct reloc_control *rc, struct btrfs_path *path, struct rb_root *blocks) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct ulist *leaves = NULL; + struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; struct ulist_node *ref_node = NULL; - const u32 blocksize = fs_info->nodesize; + const u32 blocksize = rc->extent_root->fs_info->nodesize; int ret = 0; btrfs_release_path(path); - ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, - 0, &leaves, BTRFS_IGNORE_EXTENT_OFFSET); + + ctx.bytenr = extent_key->objectid; + ctx.ignore_extent_item_pos = true; + ctx.fs_info = rc->extent_root->fs_info; + + ret = btrfs_find_all_leafs(&ctx); if (ret < 0) return ret; ULIST_ITER_INIT(&leaf_uiter); - while ((ref_node = ulist_next(leaves, &leaf_uiter))) { + while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); + eb = read_tree_block(ctx.fs_info, ref_node->val, 0, 0, 0, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; @@ -3441,7 +3444,7 @@ int add_data_references(struct reloc_control *rc, } if (ret < 0) free_block_list(blocks); - ulist_free(leaves); + ulist_free(ctx.refs); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6c7dc89709fc..288a97992aa4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -909,7 +909,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) struct btrfs_extent_item *ei; struct scrub_warning swarn; unsigned long ptr = 0; - u64 extent_item_pos; u64 flags = 0; u64 ref_root; u32 item_size; @@ -941,7 +940,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (ret < 0) goto out; - extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; @@ -964,12 +962,18 @@ static void 
scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); btrfs_release_path(path); } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_release_path(path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = swarn.logical - found_key.objectid; + ctx.fs_info = fs_info; + swarn.path = path; swarn.dev = dev; - iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, - scrub_print_warning_inode, &swarn); + + iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } out: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6bf06939f891..0c9a9933341e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1356,12 +1356,12 @@ static int find_extent_clone(struct send_ctx *sctx, u64 logical; u64 disk_byte; u64 num_bytes; - u64 extent_item_pos; u64 extent_refs; u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx backref_ctx = {0}; + struct backref_ctx backref_ctx = { 0 }; + struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; @@ -1461,14 +1461,13 @@ static int find_extent_clone(struct send_ctx *sctx, /* * Now collect all backrefs. */ + backref_walk_ctx.bytenr = found_key.objectid; if (compressed == BTRFS_COMPRESS_NONE) - extent_item_pos = logical - found_key.objectid; - else - extent_item_pos = 0; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, __iterate_backrefs, - &backref_ctx); + backref_walk_ctx.extent_item_pos = logical - found_key.objectid; + backref_walk_ctx.fs_info = fs_info; + ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, + &backref_ctx); if (ret < 0) goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 65b65d55d1f6..3fc8dc3fd980 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -205,6 +205,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, static int test_no_shared_qgroup(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -220,16 +221,22 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + /* * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. 
*/ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); @@ -238,12 +245,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -262,11 +271,13 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_item(root, nodesize, nodesize); if (ret) { @@ -274,12 +285,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -304,6 +317,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, static int test_multiple_refs(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -324,11 +338,17 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); @@ -337,12 +357,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -357,11 +379,13 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); @@ -370,12 +394,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); 
return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -396,11 +422,13 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); @@ -409,12 +437,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); -- cgit From 1baea6f18abf34037169d3b0c585356abb395376 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:48 +0000 Subject: btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() At iterate_extent_inodes() we collect a ulist of leaves for a given extent with a call to btrfs_find_all_leafs() and then we enter a loop where we iterate over all the collected leaves. Each iteration of that loop does a call to btrfs_find_all_roots_safe(), to determine all roots from which a leaf is accessible, and that results in allocating and releasing a ulist to store the root IDs. Instead of allocating and releasing the roots ulist on every iteration, allocate a ulist before entering the loop and keep using it on each iteration, reinitializing the ulist at the end of each iteration. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a1a00a7bdba5..dc276ce3afe0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1674,32 +1674,36 @@ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) * the current while iterating. The process stops when we reach the end of the * list. * - * Found roots are added to @ctx->roots, which is allocated by this function and - * @ctx->roots should be NULL when calling this function. This function also - * requires @ctx->refs to be NULL, as it uses it for allocating a ulist to do - * temporary work, and frees it before returning. + * Found roots are added to @ctx->roots, which is allocated by this function if + * it points to NULL, in which case the caller is responsible for freeing it + * after it's not needed anymore. + * This function requires @ctx->refs to be NULL, as it uses it for allocating a + * ulist to do temporary work, and frees it before returning. * - * Returns 0 on success, < 0 on error. On error @ctx->roots is always NULL. + * Returns 0 on success, < 0 on error. 
*/ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { const u64 orig_bytenr = ctx->bytenr; const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; + bool roots_ulist_allocated = false; struct ulist_iterator uiter; int ret = 0; ASSERT(ctx->refs == NULL); - ASSERT(ctx->roots == NULL); ctx->refs = ulist_alloc(GFP_NOFS); if (!ctx->refs) return -ENOMEM; - ctx->roots = ulist_alloc(GFP_NOFS); if (!ctx->roots) { - ulist_free(ctx->refs); - ctx->refs = NULL; - return -ENOMEM; + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; + return -ENOMEM; + } + roots_ulist_allocated = true; } ctx->ignore_extent_item_pos = true; @@ -1710,8 +1714,10 @@ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - ulist_free(ctx->roots); - ctx->roots = NULL; + if (roots_ulist_allocated) { + ulist_free(ctx->roots); + ctx->roots = NULL; + } break; } ret = 0; @@ -2295,6 +2301,11 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ctx->bytenr); ASSERT(ctx->trans == NULL); + ASSERT(ctx->roots == NULL); + + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) + return -ENOMEM; if (!search_commit_root) { struct btrfs_trans_handle *trans; @@ -2302,8 +2313,11 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && - PTR_ERR(trans) != -EROFS) + PTR_ERR(trans) != -EROFS) { + ulist_free(ctx->roots); + ctx->roots = NULL; return PTR_ERR(trans); + } trans = NULL; } ctx->trans = trans; @@ -2344,8 +2358,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, root_node->val, ctx->bytenr, iterate, user_ctx); } - ulist_free(ctx->roots); - ctx->roots = NULL; + ulist_reinit(ctx->roots); } free_leaf_list(refs); @@ -2358,6 +2371,9 @@ out: up_read(&ctx->fs_info->commit_root_sem); } + ulist_free(ctx->roots); + ctx->roots = NULL; + return ret; } -- cgit From fa104a879073f3974d673ad8c609d986042cf666 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:49 +0000 Subject: btrfs: constify ulist parameter of ulist_next() The ulist_next() iterator function does not need to change the given ulist so make it const. This will allow the next patch in the series to pass a ulist to a function that does not need, and should not, modify the ulist. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ulist.c | 2 +- fs/btrfs/ulist.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 13af1e41f9d7..33606025513d 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -266,7 +266,7 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) * It is allowed to call ulist_add during an enumeration. Newly added items * are guaranteed to show up in the running enumeration. 
*/ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter) { struct ulist_node *node; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 02fda0a2d4ce..b2cef187ea8e 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -66,7 +66,7 @@ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, #endif } -struct ulist_node *ulist_next(struct ulist *ulist, +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter); #define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) -- cgit From 66d04209e5a8d3fc47900de6bd0c319790a52b3e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:50 +0000 Subject: btrfs: send: cache leaf to roots mapping during backref walking During a send operation, when doing backref walking to determine which inodes/offsets/roots we can clone from, the most repetitive and expensive step is to map each leaf that has file extent items pointing to the target data extent to the IDs of the roots from which the leaves are accessible, which happens at iterate_extent_inodes(). That step requires finding every parent node of a leaf, then the parent of each parent, and so on until we reach a root node. So it's a naturally expensive operation, and repetitive because each leaf can have hundreds of file extent items (for a nodesize of 16K, that can be slightly over 200 file extent items). There's also temporal locality, as we process all file extent items from a leaf before moving on to the next leaf. This change caches the mapping of leaves to root IDs, to avoid repeating those computations over and over again. The cache is limited to a maximum of 128 entries, with each entry being a struct with a size of 128 bytes, so the maximum cache size is 16K plus any nodes internally allocated by the maple tree that is used to index pointers to those structs. The cache is invalidated whenever we detect that relocation happened since we started filling the cache, because if relocation happened then extent buffers for leaves and nodes of the trees used by a send operation may have been reallocated. This cache also allows for another important optimization that is introduced in the next patch in the series.
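To make the cache's behavior easier to follow before reading the interleaved diff, here is the lookup path condensed from the send.c hunk further below (a sketch only: the function's real signature and the root ID array handling are simplified here):

	static bool backref_cache_lookup(struct send_ctx *sctx, u64 leaf_bytenr,
					 struct backref_cache_entry **entry_ret)
	{
		struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
		const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
		struct backref_cache_entry *entry;

		if (sctx->backref_cache.size == 0)
			return false;

		/*
		 * Relocation may have reallocated the extent buffers backing
		 * the cached leaves and nodes, so drop the whole cache.
		 */
		if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
			empty_backref_cache(sctx);
			return false;
		}

		entry = mtree_load(&sctx->backref_cache.entries, key);
		if (!entry)
			return false;

		/* Keep the LRU list ordered from oldest to most recently used. */
		list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
		*entry_ret = entry;
		return true;
	}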
This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source Performance test results are in the changelog of patch 17/17. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 52 +++++++++++---- fs/btrfs/backref.h | 11 ++++ fs/btrfs/send.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index dc276ce3afe0..9dacf487017d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2303,21 +2303,14 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->trans == NULL); ASSERT(ctx->roots == NULL); - ctx->roots = ulist_alloc(GFP_NOFS); - if (!ctx->roots) - return -ENOMEM; - if (!search_commit_root) { struct btrfs_trans_handle *trans; trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && - PTR_ERR(trans) != -EROFS) { - ulist_free(ctx->roots); - ctx->roots = NULL; + PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); - } trans = NULL; } ctx->trans = trans; @@ -2338,23 +2331,58 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + const u64 leaf_bytenr = ref_node->val; struct ulist_node *root_node; struct ulist_iterator root_uiter; + struct extent_inode_elem *inode_list; + + inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux; + + if (ctx->cache_lookup) { + const u64 *root_ids; + int root_count; + bool cached; + + cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx, + &root_ids, &root_count); + if (cached) { + for (int i = 0; i < root_count; i++) { + ret = iterate_leaf_refs(ctx->fs_info, + inode_list, + root_ids[i], + leaf_bytenr, + iterate, + user_ctx); + if (ret) + break; + } + continue; + } + } + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ret = -ENOMEM; + break; + } + } - ctx->bytenr = ref_node->val; + ctx->bytenr = leaf_bytenr; ret = btrfs_find_all_roots_safe(ctx); if (ret) break; + if (ctx->cache_store) + ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx); + ULIST_ITER_INIT(&root_uiter); while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { 
btrfs_debug(ctx->fs_info, "root %llu references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); - ret = iterate_leaf_refs(ctx->fs_info, - (struct extent_inode_elem *) - (uintptr_t)ref_node->aux, + ret = iterate_leaf_refs(ctx->fs_info, inode_list, root_node->val, ctx->bytenr, iterate, user_ctx); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ce700dfcc016..5005f579f235 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -69,6 +69,17 @@ struct btrfs_backref_walk_ctx { * about collecting root IDs. */ struct ulist *roots; + /* + * Used by iterate_extent_inodes(). Lookup and store functions for an + * optional cache which maps the logical address (bytenr) of leaves + * to an array of root IDs. + */ + bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, + const u64 **root_ids_ret, int *root_count_ret); + void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, + void *user_ctx); + /* Context object to pass to @cache_lookup and @cache_store. */ + void *user_ctx; }; struct inode_fs_paths { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0c9a9933341e..fa94bd550564 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -83,6 +83,39 @@ struct clone_root { #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +/* + * Limit the root_ids array of struct backref_cache_entry to 12 elements. + * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * The most common case is to have a single root for cloning, which corresponds + * to the send root. Having the user specify more than 11 clone roots is not + * common, and in such rare cases we simply don't use caching if the number of + * cloning roots that lead down to a leaf is more than 12. + */ +#define SEND_MAX_BACKREF_CACHE_ROOTS 12 + +/* + * Max number of entries in the cache. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding + * maple tree's internal nodes, is 16K. + */ +#define SEND_MAX_BACKREF_CACHE_SIZE 128 + +/* + * A backref cache entry maps a leaf to a list of IDs of roots from which the + * leaf is accessible and we can use for clone operations. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on + * x86_64). + */ +struct backref_cache_entry { + /* List to link to the cache's lru list. */ + struct list_head list; + /* The key for this entry in the cache. */ + u64 key; + u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; + /* Number of valid elements in the root_ids array. */ + int num_roots; +}; + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -251,6 +284,14 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; + + struct { + u64 last_reloc_trans; + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. 
*/ + int size; + } backref_cache; }; struct pending_dir_move { @@ -1335,6 +1376,142 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, return 0; } +static void empty_backref_cache(struct send_ctx *sctx) +{ + struct backref_cache_entry *entry; + struct backref_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) + kfree(entry); + + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mtree_destroy(&sctx->backref_cache.entries); + sctx->backref_cache.size = 0; +} + +static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + const u64 **root_ids_ret, int *root_count_ret) +{ + struct send_ctx *sctx = ctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct backref_cache_entry *entry; + + if (sctx->backref_cache.size == 0) + return false; + + /* + * If relocation happened since we first filled the cache, then we must + * empty the cache and can not use it, because even though we operate on + * read-only roots, their leaves and nodes may have been reallocated and + * now be used for different nodes/leaves of the same tree or some other + * tree. + * + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { + empty_backref_cache(sctx); + return false; + } + + entry = mtree_load(&sctx->backref_cache.entries, key); + if (!entry) + return false; + + *root_ids_ret = entry->root_ids; + *root_count_ret = entry->num_roots; + list_move_tail(&entry->list, &sctx->backref_cache.lru_list); + + return true; +} + +static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + void *ctx) +{ + struct send_ctx *sctx = ctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct backref_cache_entry *new_entry; + struct ulist_iterator uiter; + struct ulist_node *node; + int ret; + + /* + * We're called while holding a transaction handle or while holding + * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a + * NOFS allocation. + */ + new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); + /* No worries, cache is optional. */ + if (!new_entry) + return; + + new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->num_roots = 0; + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(root_ids, &uiter)) != NULL) { + const u64 root_id = node->val; + struct clone_root *root; + + root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, + sctx->clone_roots_cnt, sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!root) + continue; + + /* Too many roots, just exit, no worries as caching is optional. */ + if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { + kfree(new_entry); + return; + } + + new_entry->root_ids[new_entry->num_roots] = root_id; + new_entry->num_roots++; + } + + /* + * We may have not added any roots to the new cache entry, which means + * none of the roots is part of the list of roots from which we are + * allowed to clone. Cache the new entry as it's still useful to avoid + * backref walking to determine which roots have a path to the leaf. 
+ */ + + if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { + struct backref_cache_entry *lru_entry; + struct backref_cache_entry *mt_entry; + + lru_entry = list_first_entry(&sctx->backref_cache.lru_list, + struct backref_cache_entry, list); + mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); + ASSERT(mt_entry == lru_entry); + list_del(&mt_entry->list); + kfree(mt_entry); + sctx->backref_cache.size--; + } + + ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, + new_entry, GFP_NOFS); + ASSERT(ret == 0 || ret == -ENOMEM); + if (ret) { + /* Caching is optional, no worries. */ + kfree(new_entry); + return; + } + + list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); + + /* + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (sctx->backref_cache.size == 0) + sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; + + sctx->backref_cache.size++; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1465,6 +1642,9 @@ static int find_extent_clone(struct send_ctx *sctx, if (compressed == BTRFS_COMPRESS_NONE) backref_walk_ctx.extent_item_pos = logical - found_key.objectid; backref_walk_ctx.fs_info = fs_info; + backref_walk_ctx.cache_lookup = lookup_backref_cache; + backref_walk_ctx.cache_store = store_backref_cache; + backref_walk_ctx.user_ctx = sctx; ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, &backref_ctx); @@ -7891,6 +8071,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mt_init(&sctx->backref_cache.entries); + sctx->flags = arg->flags; if (arg->flags & BTRFS_SEND_FLAG_VERSION) { @@ -8153,6 +8336,8 @@ out: close_current_inode(sctx); + empty_backref_cache(sctx); + kfree(sctx); } -- cgit From 88ffb665c894b1929b30b09e05506ff359d9fb89 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:51 +0000 Subject: btrfs: send: skip unnecessary backref iterations When looking for a clone source for an extent, we are iterating over all the backreferences for an extent. This is often a waste of time, because once we find a good clone source we could stop immediately instead of continuing backref walking, which is expensive. Basically what happens currently is this: 1) Call iterate_extent_inodes() to iterate over all the backreferences; 2) It calls btrfs_find_all_leafs() which in turn calls the main function to walk over backrefs and collect them - find_parent_nodes(); 3) Then we collect all the references for our target data extent from the extent tree (and delayed refs if any), add them to the rb trees, resolve all the indirect backreferences and search for all the file extent items in fs trees, building a list of inodes for each one of them (struct extent_inode_elem); 4) Then back at iterate_extent_inodes() we find all the roots associated to each found leaf, and call the callback __iterate_backrefs defined at send.c for each inode in the inode list associated to each leaf. 
Sometimes one of the first backreferences we find in a fs tree is optimal to satisfy the clone operation that send wants to perform, and in that case we could stop immediately and avoid resolving all the remaining indirect backreferences (search fs trees for the respective file extent items, etc). This is possible if, when we find a fs tree leaf with a file extent item, we are able to know all the roots that can lead to the leaf - something we can now do thanks to the previous patch in the series, which adds a cache that maps leaves to a list of roots. So we can now short-circuit backref walking during send, by having the callback we pass to iterate_extent_inodes() be called when we find a file extent item for an indirect backreference, and have it return a special value when it finds a suitable backreference and does not need to look for more backreferences. This change does that; a condensed sketch of the callback contract follows the patch list below. This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source Performance test results are in the changelog of patch 17/17.
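The sketch mentioned above, condensed from the send.c changes in the diff below (illustrative only: the filtering of unusable references and the candidate tracking are elided):

	/* Called for every backref that is found for the current extent. */
	static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
				    void *ctx_)
	{
		struct backref_ctx *bctx = ctx_;

		/* (reference filtering and candidate tracking elided) */

		/*
		 * A reference covering the whole extent is as good as it gets,
		 * so tell the backref walking code to stop iterating.
		 */
		if (num_bytes >= bctx->extent_len)
			return BTRFS_ITERATE_EXTENT_INODES_STOP;

		return 0;
	}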
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 73 ++++++++++++++++++++++++++++++++---------------- fs/btrfs/backref.h | 44 ++++++++++++++++++++++++----- fs/btrfs/send.c | 81 +++++++++++++++++++++++++++--------------------------- 3 files changed, 128 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9dacf487017d..bb59ba68d7ee 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -31,15 +31,18 @@ struct extent_inode_elem { struct extent_inode_elem *next; }; -static int check_extent_in_eb(const struct btrfs_key *key, +static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, - u64 extent_item_pos, struct extent_inode_elem **eie) { const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); - u64 offset = 0; + u64 offset = key->offset; struct extent_inode_elem *e; + const u64 *root_ids; + int root_count; + bool cached; if (!btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && @@ -48,19 +51,38 @@ static int check_extent_in_eb(const struct btrfs_key *key, data_offset = btrfs_file_extent_offset(eb, fi); - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) + if (ctx->extent_item_pos < data_offset || + ctx->extent_item_pos >= data_offset + data_len) return 1; - offset = extent_item_pos - data_offset; + offset += ctx->extent_item_pos - data_offset; + } + + if (!ctx->indirect_ref_iterator || !ctx->cache_lookup) + goto add_inode_elem; + + cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids, + &root_count); + if (!cached) + goto add_inode_elem; + + for (int i = 0; i < root_count; i++) { + int ret; + + ret = ctx->indirect_ref_iterator(key->objectid, offset, + data_len, root_ids[i], + ctx->user_ctx); + if (ret) + return ret; } +add_inode_elem: e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) return -ENOMEM; e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + offset; + e->offset = offset; e->num_bytes = data_len; *eie = e; @@ -77,8 +99,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) } } -static int find_extent_in_eb(const struct extent_buffer *eb, - u64 wanted_disk_byte, u64 extent_item_pos, +static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct extent_buffer *eb, struct extent_inode_elem **eie) { u64 disk_byte; @@ -105,11 +127,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb, continue; /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != wanted_disk_byte) + if (disk_byte != ctx->bytenr) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); - if (ret < 0) + ret = check_extent_in_eb(ctx, &key, eb, fi, eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) return ret; } @@ -526,9 +548,9 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, else goto next; if (!ctx->ignore_extent_item_pos) { - ret = check_extent_in_eb(&key, eb, fi, - ctx->extent_item_pos, &eie); - if (ret < 0) + ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) break; } if (ret > 0) @@ -551,10 +573,11 @@ next: ret = btrfs_next_old_item(root, path, ctx->time_seq); } - if (ret > 0) - ret = 0; - else if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); + else if (ret > 0) 
+ ret = 0; + return ret; } @@ -1570,12 +1593,12 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); - ret = find_extent_in_eb(eb, ctx->bytenr, - ctx->extent_item_pos, &eie); + ret = find_extent_in_eb(ctx, eb, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); - if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) goto out; ref->inode_list = eie; /* @@ -1628,7 +1651,7 @@ out: prelim_release(&preftrees.indirect); prelim_release(&preftrees.indirect_missing_keys); - if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); return ret; } @@ -1654,7 +1677,8 @@ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) return -ENOMEM; ret = find_parent_nodes(ctx, NULL); - if (ret < 0 && ret != -ENOENT) { + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + (ret < 0 && ret != -ENOENT)) { free_leaf_list(ctx->refs); ctx->refs = NULL; return ret; @@ -2402,6 +2426,9 @@ out: ulist_free(ctx->roots); ctx->roots = NULL; + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP) + ret = 0; + return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 5005f579f235..a80d1e8be718 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -12,6 +12,25 @@ #include "disk-io.h" #include "extent_io.h" +/* + * Used by implementations of iterate_extent_inodes_t (see definition below) to + * signal that backref iteration can stop immediately and no error happened. + * The value must be non-negative and must not be 0, 1 (which is a common return + * value from things like btrfs_search_slot() and used internally in the backref + * walking code) and different from BACKREF_FOUND_SHARED and + * BACKREF_FOUND_NOT_SHARED + */ +#define BTRFS_ITERATE_EXTENT_INODES_STOP 5 + +/* + * Should return 0 if no errors happened and iteration of backrefs should + * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero + * value to immediately stop iteration and possibly signal an error back to + * the caller. + */ +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *ctx); + /* * Context and arguments for backref walking functions. Some of the fields are * to be filled by the caller of such functions while other are filled by the @@ -70,15 +89,29 @@ struct btrfs_backref_walk_ctx { */ struct ulist *roots; /* - * Used by iterate_extent_inodes(). Lookup and store functions for an - * optional cache which maps the logical address (bytenr) of leaves - * to an array of root IDs. + * Used by iterate_extent_inodes() and the main backref walk code + * (find_parent_nodes()). Lookup and store functions for an optional + * cache which maps the logical address (bytenr) of leaves to an array + * of root IDs. */ bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, const u64 **root_ids_ret, int *root_count_ret); void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, void *user_ctx); - /* Context object to pass to @cache_lookup and @cache_store. */ + /* + * If this is not NULL, then the backref walking code will call this + * for each indirect data extent reference as soon as it finds one, + * before collecting all the remaining backrefs and before resolving + * indirect backrefs. This allows for the caller to terminate backref + * walking as soon as it finds one backref that matches some specific + * criteria. The @cache_lookup and @cache_store callbacks should not + * be NULL in order to use this callback. 
+ */ + iterate_extent_inodes_t *indirect_ref_iterator; + /* + * Context object to pass to the @cache_lookup, @cache_store and + * @indirect_ref_iterator callbacks. + */ void *user_ctx; }; @@ -145,9 +178,6 @@ struct btrfs_backref_share_check_ctx { int prev_extents_cache_slot; }; -typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, - u64 root, void *ctx); - struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fa94bd550564..f453f6406564 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -77,7 +77,7 @@ struct clone_root { u64 ino; u64 offset; u64 num_bytes; - u64 found_refs; + bool found_ref; }; #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 @@ -1281,9 +1281,6 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - - /* Just to check for bugs in backref resolving */ - int found_itself; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1312,33 +1309,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) /* * Called for every backref that is found for the current extent. - * Results are collected in sctx->clone_roots->ino/offset/found_refs + * Results are collected in sctx->clone_roots->ino/offset. */ -static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, - void *ctx_) +static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, + void *ctx_) { struct backref_ctx *bctx = ctx_; - struct clone_root *found; + struct clone_root *clone_root; /* First check if the root is in the list of accepted clone sources */ - found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, - bctx->sctx->clone_roots_cnt, - sizeof(struct clone_root), - __clone_root_cmp_bsearch); - if (!found) + clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!clone_root) return 0; - if (found->root == bctx->sctx->send_root && + /* This is our own reference, bail out as we can't clone from it. */ + if (clone_root->root == bctx->sctx->send_root && ino == bctx->cur_objectid && - offset == bctx->cur_offset) { - bctx->found_itself = 1; - } + offset == bctx->cur_offset) + return 0; /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. */ - if (found->root == bctx->sctx->send_root) { + if (clone_root->root == bctx->sctx->send_root) { /* * If the source inode was not yet processed we can't issue a * clone operation, as the source extent does not exist yet at @@ -1359,7 +1356,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, } bctx->found++; - found->found_refs++; + clone_root->found_ref = true; /* * If the given backref refers to a file extent item with a larger @@ -1367,10 +1364,17 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, * we clone more optimally and end up doing less writes and getting * less exclusive, non-shared extents at the destination. */ - if (num_bytes > found->num_bytes) { - found->ino = ino; - found->offset = offset; - found->num_bytes = num_bytes; + if (num_bytes > clone_root->num_bytes) { + clone_root->ino = ino; + clone_root->offset = offset; + clone_root->num_bytes = num_bytes; + + /* + * Found a perfect candidate, so there's no need to continue + * backref walking. 
+ */ + if (num_bytes >= bctx->extent_len) + return BTRFS_ITERATE_EXTENT_INODES_STOP; } return 0; @@ -1392,7 +1396,8 @@ static void empty_backref_cache(struct send_ctx *sctx) static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { - struct send_ctx *sctx = ctx; + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; struct backref_cache_entry *entry; @@ -1430,7 +1435,8 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, void *ctx) { - struct send_ctx *sctx = ctx; + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct backref_cache_entry *new_entry; struct ulist_iterator uiter; @@ -1618,7 +1624,7 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; cur_clone_root->num_bytes = 0; - cur_clone_root->found_refs = 0; + cur_clone_root->found_ref = false; } backref_ctx.sctx = sctx; @@ -1628,7 +1634,7 @@ static int find_extent_clone(struct send_ctx *sctx, /* * The last extent of a file may be too large due to page alignment. * We need to adjust extent_len in this case so that the checks in - * __iterate_backrefs work. + * iterate_backrefs() work. */ if (data_offset + num_bytes >= ino_size) backref_ctx.extent_len = ino_size - data_offset; @@ -1644,9 +1650,10 @@ static int find_extent_clone(struct send_ctx *sctx, backref_walk_ctx.fs_info = fs_info; backref_walk_ctx.cache_lookup = lookup_backref_cache; backref_walk_ctx.cache_store = store_backref_cache; - backref_walk_ctx.user_ctx = sctx; + backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.user_ctx = &backref_ctx; - ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) goto out; @@ -1671,27 +1678,21 @@ static int find_extent_clone(struct send_ctx *sctx, } up_read(&fs_info->commit_root_sem); - if (!backref_ctx.found_itself) { - /* found a bug in backref code? */ - ret = -EIO; - btrfs_err(fs_info, - "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu", - ino, data_offset, disk_byte, found_key.objectid); - goto out; - } - btrfs_debug(fs_info, "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", data_offset, ino, num_bytes, logical); - if (!backref_ctx.found) + if (!backref_ctx.found) { btrfs_debug(fs_info, "no clones found"); + ret = -ENOENT; + goto out; + } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { struct clone_root *clone_root = &sctx->clone_roots[i]; - if (clone_root->found_refs == 0) + if (!clone_root->found_ref) continue; /* -- cgit From f73853c7168aef0e071c160979af05a148691e61 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:52 +0000 Subject: btrfs: send: avoid double extent tree search when finding clone source At find_extent_clone() we search twice for the extent item corresponding to the data extent that the current file extent item points to: 1) Once with a call to extent_from_logical(); 2) Once again during backref walking, through iterate_extent_inodes() which eventually leads to find_parent_nodes() where we will search again the extent tree for the same extent item.
The extent tree can be huge, so doing this one extra search for every extent we want to send adds up and it's expensive. The first call is there since the send code was introduced and it accomplishes two things: 1) Check that the extent is flagged as a data extent in the extent tree. But it cannot be anything else, otherwise we wouldn't have a file extent item in the send root pointing to it. This was probably added to catch bugs in the early days, when send was still young and the interaction with everything else was far from perfect; 2) Check how many direct references there are on the extent, and if there are too many (more than SEND_MAX_EXTENT_REFS), avoid doing the backref walking as it may take too long and slow down send. So improve on this by having a callback in the backref walking code that is called when it finds the extent item in the extent tree, and have those checks done in the callback (a condensed sketch of it follows the patch list below). When the callback returns anything different from 0, it stops the backref walking code. This way we do a single search on the extent tree for the extent item of our data extent. Also, before this change we were only checking the number of references on the data extent against SEND_MAX_EXTENT_REFS, but after starting backref walking we will end up resolving backrefs for extent buffers in the path from a leaf having a file extent item pointing to our data extent, up to the roots of the trees from which the extent buffer is accessible, due to shared subtrees resulting from snapshotting. We were therefore allowing for the possibility of send taking too long due to some node in the path from the leaf to a root node being shared too many times. After this change we check for reference counts being greater than SEND_MAX_EXTENT_REFS for both data extents and metadata extents. This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source Performance test results are in the changelog of patch 17/17.
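The condensed sketch of the new callback, derived from the send.c hunk below (illustrative only: the early exit for the single-reference, no-extra-clone-roots case is elided):

	static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
				     const struct extent_buffer *leaf, void *ctx)
	{
		const u64 refs = btrfs_extent_refs(leaf, ei);

		/*
		 * Backref walking is too expensive for extents with many
		 * references, both in time and memory, so returning an error
		 * here stops the walk and makes send fall back to write
		 * operations instead of clone operations.
		 */
		if (refs > SEND_MAX_EXTENT_REFS)
			return -ENOENT;

		return 0;
	}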
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 31 +++++++++------- fs/btrfs/backref.h | 9 +++-- fs/btrfs/send.c | 103 ++++++++++++++++++++++++----------------------------- 3 files changed, 73 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bb59ba68d7ee..33056c4c0528 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1003,8 +1003,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_inline_refs(const struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, +static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, int *info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -1029,6 +1029,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + + if (ctx->check_extent_item) { + ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx); + if (ret) + return ret; + } + flags = btrfs_extent_flags(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -1064,9 +1071,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = add_direct_ref(fs_info, preftrees, + ret = add_direct_ref(ctx->fs_info, preftrees, *info_level + 1, offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -1075,14 +1082,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(fs_info, preftrees, 0, offset, - bytenr, count, sc, GFP_NOFS); + ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset, + ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = add_indirect_ref(fs_info, preftrees, offset, + ret = add_indirect_ref(ctx->fs_info, preftrees, offset, NULL, *info_level + 1, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -1104,8 +1111,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, + ret = add_indirect_ref(ctx->fs_info, preftrees, root, + &key, 0, ctx->bytenr, count, sc, GFP_NOFS); break; @@ -1455,8 +1462,8 @@ again: if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(ctx->fs_info, path, ctx->bytenr, - &info_level, &preftrees, sc); + ret = add_inline_refs(ctx, path, &info_level, + &preftrees, sc); if (ret) goto out; ret = add_keyed_refs(root, path, ctx->bytenr, info_level, diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a80d1e8be718..1bd5a15c7f9e 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -109,9 +109,14 @@ struct btrfs_backref_walk_ctx { */ iterate_extent_inodes_t *indirect_ref_iterator; /* - * Context object to pass to the @cache_lookup, @cache_store and - * @indirect_ref_iterator callbacks. + * If this is not NULL, then the backref walking code will call this for + * each extent item it's meant to process before it actually starts + * processing it. 
If this returns anything other than 0, then it stops + * the backref walking code immediately. */ + int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *user_ctx); + /* Context object to pass to the callbacks defined above. */ void *user_ctx; }; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f453f6406564..516b80637bfb 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1281,6 +1281,9 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; + + /* The bytenr the file extent item we are processing refers to. */ + u64 bytenr; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1518,6 +1521,43 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, sctx->backref_cache.size++; } +static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *ctx) +{ + const u64 refs = btrfs_extent_refs(leaf, ei); + const struct backref_ctx *bctx = ctx; + const struct send_ctx *sctx = bctx->sctx; + + if (bytenr == bctx->bytenr) { + const u64 flags = btrfs_extent_flags(leaf, ei); + + if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return -EUCLEAN; + + /* + * If we have only one reference and only the send root as a + * clone source - meaning no clone roots were given in the + * struct btrfs_ioctl_send_args passed to the send ioctl - then + * it's our reference and there's no point in doing backref + * walking which is expensive, so exit early. + */ + if (refs == 1 && sctx->clone_roots_cnt == 1) + return -ENOENT; + } + + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fallback to write + * operations instead of clone operations when an extent has more than + * a certain amount of references. + */ + if (refs > SEND_MAX_EXTENT_REFS) + return -ENOENT; + + return 0; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. 
The function makes @@ -1539,16 +1579,11 @@ static int find_extent_clone(struct send_ctx *sctx, u64 logical; u64 disk_byte; u64 num_bytes; - u64 extent_refs; - u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; struct backref_ctx backref_ctx = { 0 }; struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; - struct btrfs_key found_key; - struct btrfs_path *tmp_path; - struct btrfs_extent_item *ei; int compressed; u32 i; @@ -1574,48 +1609,6 @@ static int find_extent_clone(struct send_ctx *sctx, num_bytes = btrfs_file_extent_num_bytes(eb, fi); logical = disk_byte + btrfs_file_extent_offset(eb, fi); - tmp_path = alloc_path_for_send(); - if (!tmp_path) - return -ENOMEM; - - /* We only use this path under the commit sem */ - tmp_path->need_commit_sem = 0; - - down_read(&fs_info->commit_root_sem); - ret = extent_from_logical(fs_info, disk_byte, tmp_path, - &found_key, &flags); - up_read(&fs_info->commit_root_sem); - - if (ret < 0) - goto out; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = -EIO; - goto out; - } - - ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], - struct btrfs_extent_item); - extent_refs = btrfs_extent_refs(tmp_path->nodes[0], ei); - /* - * Backreference walking (iterate_extent_inodes() below) is currently - * too expensive when an extent has a large number of references, both - * in time spent and used memory. So for now just fallback to write - * operations instead of clone operations when an extent has more than - * a certain amount of references. - * - * Also, if we have only one reference and only the send root as a clone - * source - meaning no clone roots were given in the struct - * btrfs_ioctl_send_args passed to the send ioctl - then it's our - * reference and there's no point in doing backref walking which is - * expensive, so exit early. - */ - if ((extent_refs == 1 && sctx->clone_roots_cnt == 1) || - extent_refs > SEND_MAX_EXTENT_REFS) { - ret = -ENOENT; - goto out; - } - btrfs_release_path(tmp_path); - /* * Setup the clone roots. */ @@ -1630,6 +1623,7 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx.sctx = sctx; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; + backref_ctx.bytenr = disk_byte; /* * The last extent of a file may be too large due to page alignment. @@ -1644,19 +1638,20 @@ static int find_extent_clone(struct send_ctx *sctx, /* * Now collect all backrefs. */ - backref_walk_ctx.bytenr = found_key.objectid; + backref_walk_ctx.bytenr = disk_byte; if (compressed == BTRFS_COMPRESS_NONE) - backref_walk_ctx.extent_item_pos = logical - found_key.objectid; + backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); backref_walk_ctx.fs_info = fs_info; backref_walk_ctx.cache_lookup = lookup_backref_cache; backref_walk_ctx.cache_store = store_backref_cache; backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.check_extent_item = check_extent_item; backref_walk_ctx.user_ctx = &backref_ctx; ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -1673,8 +1668,7 @@ static int find_extent_clone(struct send_ctx *sctx, * was already reallocated after the relocation. 
*/ up_read(&fs_info->commit_root_sem); - ret = -ENOENT; - goto out; + return -ENOENT; } up_read(&fs_info->commit_root_sem); @@ -1684,8 +1678,7 @@ static int find_extent_clone(struct send_ctx *sctx, if (!backref_ctx.found) { btrfs_debug(fs_info, "no clones found"); - ret = -ENOENT; - goto out; + return -ENOENT; } cur_clone_root = NULL; @@ -1720,8 +1713,6 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -ENOENT; } -out: - btrfs_free_path(tmp_path); return ret; } -- cgit From adf0241868bd46c7be8012ef99cb88c7f47c16ce Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:53 +0000 Subject: btrfs: send: skip resolution of our own backref when finding clone source When doing backref walking to determine a source range to clone from, it is worthless to collect and resolve our own data backref, as we can't obviously use it as a clone source and it represents the range we want to clone into. Collecting the backref implies doing the extra work to resolve it, doing the search for a file extent item in a subvolume tree, etc. Skipping the data backref is valid as long as we only have the send root as the single clone root, otherwise the leaf with the file extent item may be accessible from another clone root due to shared subtrees created by snapshots, and therefore we have to collect the backref and resolve it. So add a callback to the backref walking code to guide it to skip data backrefs. This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source The following test was run on non-debug kernel (Debian's default kernel config) before and after applying the patchset: $ cat test-send-many-shared-extents.sh #!/bin/bash DEV=/dev/sdh MNT=/mnt/sdh umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT num_files=50000 num_clones_per_file=50 for ((i = 1; i <= $num_files; i++)); do xfs_io -f -c "pwrite 0 64K" $MNT/file_$i > /dev/null echo -ne "\r$i files created..." 
done echo btrfs subvolume snapshot -r $MNT $MNT/snap1 cloned=0 for ((i = 1; i <= $num_clones_per_file; i++)); do for ((j = 1; j <= $num_files; j++)); do cp --reflink=always $MNT/file_$j $MNT/file_${j}_clone_${i} cloned=$((cloned + 1)) echo -ne "\r$cloned / $((num_files * num_clones_per_file)) clone operations" done done echo btrfs subvolume snapshot -r $MNT $MNT/snap2 # Unmount and mount again to clear all cached metadata (and data). umount $DEV mount $DEV $MNT start=$(date +%s%N) btrfs send $MNT/snap2 > /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000000 )) echo -e "\nFull send took $dur seconds" # Unmount and mount again to clear all cached metadata (and data). umount $DEV mount $DEV $MNT start=$(date +%s%N) btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000000 )) echo -e "\nIncremental send took $dur seconds" umount $MNT Before applying the patchset: (...) Full send took 1108 seconds (...) Incremental send took 1135 seconds After applying the whole patchset: (...) Full send took 268 seconds (-75.8%) (...) Incremental send took 316 seconds (-72.2%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 35 +++++++++++++++++++++-------------- fs/btrfs/backref.h | 9 +++++++++ fs/btrfs/send.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 33056c4c0528..430974cf3b96 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1111,10 +1111,12 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(ctx->fs_info, preftrees, root, - &key, 0, ctx->bytenr, count, - sc, GFP_NOFS); - + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(ctx->fs_info, preftrees, + root, &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1133,8 +1135,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. 
*/ -static int add_keyed_refs(struct btrfs_root *extent_root, - struct btrfs_path *path, u64 bytenr, +static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *extent_root, + struct btrfs_path *path, int info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -1157,7 +1160,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != bytenr) + if (key.objectid != ctx->bytenr) break; if (key.type < BTRFS_TREE_BLOCK_REF_KEY) continue; @@ -1169,7 +1172,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, /* SHARED DIRECT METADATA backref */ ret = add_direct_ref(fs_info, preftrees, info_level + 1, key.offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ @@ -1180,14 +1183,14 @@ static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); ret = add_direct_ref(fs_info, preftrees, 0, - key.offset, bytenr, count, + key.offset, ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: /* NORMAL INDIRECT METADATA backref */ ret = add_indirect_ref(fs_info, preftrees, key.offset, - NULL, info_level + 1, bytenr, + NULL, info_level + 1, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { @@ -1211,9 +1214,13 @@ static int add_keyed_refs(struct btrfs_root *extent_root, } root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, - sc, GFP_NOFS); + + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1466,7 +1473,7 @@ again: &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(root, path, ctx->bytenr, info_level, + ret = add_keyed_refs(ctx, root, path, info_level, &preftrees, sc); if (ret) goto out; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 1bd5a15c7f9e..ef6bbea3f456 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -116,6 +116,15 @@ struct btrfs_backref_walk_ctx { */ int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, const struct extent_buffer *leaf, void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this for + * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before + * processing that data ref. If this callback return false, then it will + * ignore this data ref and it will never resolve the indirect data ref, + * saving time searching for leaves in a fs tree with file extent items + * matching the data ref. + */ + bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx); /* Context object to pass to the callbacks defined above. */ void *user_ctx; }; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 516b80637bfb..383bc8a5cb6c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1284,6 +1284,10 @@ struct backref_ctx { /* The bytenr the file extent item we are processing refers to. */ u64 bytenr; + /* The owner (root id) of the data backref for the current extent. */ + u64 backref_owner; + /* The offset of the data backref for the current extent. 
*/ + u64 backref_offset; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1558,6 +1562,18 @@ static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, return 0; } +static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) +{ + const struct backref_ctx *bctx = ctx; + + if (ino == bctx->cur_objectid && + root == bctx->backref_owner && + offset == bctx->backref_offset) + return true; + + return false; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1624,6 +1640,12 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; backref_ctx.bytenr = disk_byte; + /* + * Use the header owner and not the send root's id, because in case of a + * snapshot we can have shared subtrees. + */ + backref_ctx.backref_owner = btrfs_header_owner(eb); + backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1648,6 +1670,17 @@ static int find_extent_clone(struct send_ctx *sctx, backref_walk_ctx.check_extent_item = check_extent_item; backref_walk_ctx.user_ctx = &backref_ctx; + /* + * If we have a single clone root, then it's the send root and we can tell + * the backref walking code to skip our own backref and not resolve it, + * since we can not use it for cloning - the source and destination + * ranges can't overlap and in case the leaf is shared through a subtree + * due to snapshots, we can't use those other roots since they are not + * in the list of clone roots. + */ + if (sctx->clone_roots_cnt == 1) + backref_walk_ctx.skip_data_ref = skip_self_data_ref; + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) -- cgit From e2a041657774102b184cc41369a53be3d024e8ee Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:54 +0000 Subject: btrfs: send: bump the extent reference count limit for backref walking After the previous patchset which is comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source we now have much better performance when doing backref walking in the send code, so we can increase the current limit
from 64 to 1024 references. This limit is still a bit conservative because there are still edge cases where backref walking will be too slow, spending a lot of CPU time, doing IO to read b+tree nodes/leaves, and consuming a lot of memory. The goal is to eventually get rid of any limit, but for now bump it as it benefits users with extents shared more than 64 times and up to 1024 times, allowing for more deduplication at the destination without having to run a dedupe tool after a receive. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 383bc8a5cb6c..67f7c698ade3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -39,7 +39,7 @@ * avoid hitting limitations of the backreference walking code (taking a lot of * time and using too much memory for extents with large number of references). */ -#define SEND_MAX_EXTENT_REFS 64 +#define SEND_MAX_EXTENT_REFS 1024 /* * A fs_path is a helper to dynamically build path names with unknown size. -- cgit From 9e5e6d4e2e5e37e28bf2be85fa08761e33aa5efb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 31 Oct 2022 20:33:40 +0100 Subject: btrfs: zlib: use copy_page for full page copy The copy_page helper may use an optimized version for full page copy (e.g. on s390 there's a special instruction for that); there's one more call site left to convert. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b4f44662cda7..c5275cb23837 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -155,8 +155,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap_local_page(in_page); - memcpy(workspace->buf + i * PAGE_SIZE, - data_in, PAGE_SIZE); + copy_page(workspace->buf + i * PAGE_SIZE, + data_in); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; -- cgit From fd463ac4616e69c7306c680dabfe857a2f50fc69 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 31 Oct 2022 20:33:42 +0100 Subject: btrfs: zoned: use helper to check a power of two zone size We have a 64bit compatible helper to check if a value is a power of two, use it instead of open coding it. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 13034c3bbb65..95dec93374ea 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -394,8 +394,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_sectors = bdev_zone_sectors(bdev); } - /* Check if it's power of 2 (see is_power_of_2) */ - ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); + ASSERT(is_power_of_two_u64(zone_sectors)); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; /* We reject devices with a zone size larger than 8GB */ -- cgit From 0d7764ff58b4b45c39eb03f2c74a819c1a88fa7b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 31 Oct 2022 20:33:44 +0100 Subject: btrfs: convert btrfs_block_group::needs_free_space to runtime flag We already have flags in the block group to track various status bits; convert needs_free_space as well and reduce the size of btrfs_block_group.
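For reference, the conversion follows the standard bit-flag pattern; sketched with the names introduced in the diff below, the dedicated integer turns into set_bit()/test_bit()/clear_bit() calls on the shared runtime_flags word:

        /* Mark the block group when it is created. */
        set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);

        /* Test before touching the free space tree. */
        if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
                ret = __add_block_group_free_space(trans, block_group, path);

        /* Clear once the block group has been added. */
        clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);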
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/block-group.h | 8 ++------ fs/btrfs/free-space-tree.c | 10 +++++----- fs/btrfs/tests/free-space-tree-tests.c | 2 +- 4 files changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ea5fcb665daa..708d843daa72 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2553,7 +2553,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->global_root_id = calculate_global_root_id(fs_info, cache->start); if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) - cache->needs_free_space = 1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); ret = btrfs_load_block_group_zone_info(cache, true); if (ret) { diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 6c970a486b68..4cee23c11938 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -55,6 +55,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + /* Does the block group need to be added to the free space tree? */ + BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, }; enum btrfs_caching_type { @@ -208,12 +210,6 @@ struct btrfs_block_group { /* Lock for free space tree operations. */ struct mutex free_space_lock; - /* - * Does the block group need to be added to the free space tree? - * Protected by free_space_lock. - */ - int needs_free_space; - /* Flag indicating this block group is placed on a sequential zone */ bool seq_zone; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 869d062d6765..c667e878ef1a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -808,7 +808,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, u32 flags; int ret; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; @@ -1001,7 +1001,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, u32 flags; int ret; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; @@ -1304,7 +1304,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, { int ret; - block_group->needs_free_space = 0; + clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); ret = add_new_free_space_info(trans, block_group, path); if (ret) @@ -1326,7 +1326,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans, return 0; mutex_lock(&block_group->free_space_lock); - if (!block_group->needs_free_space) + if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) goto out; path = btrfs_alloc_path(); @@ -1359,7 +1359,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { /* We never added this block group to the free space tree. 
*/ return 0; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 53a17b1d1744..b61972046feb 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -471,7 +471,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, } cache->bitmap_low_thresh = 0; cache->bitmap_high_thresh = (u32)-1; - cache->needs_free_space = 1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); cache->fs_info = root->fs_info; btrfs_init_dummy_trans(&trans, root->fs_info); -- cgit From 961f5b8bf48a463ac5fe5b13143426d79eb41817 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 31 Oct 2022 20:33:46 +0100 Subject: btrfs: convert btrfs_block_group::seq_zone to runtime flag In zoned mode the sequential status of a zone can also be tracked in the runtime flags of the block group. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 5 ++--- fs/btrfs/zoned.c | 7 ++++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 4cee23c11938..a02ea76fd6cf 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -57,6 +57,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Indicate that the block group is placed on a sequential zone */ + BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, }; enum btrfs_caching_type { @@ -210,9 +212,6 @@ struct btrfs_block_group { /* Lock for free space tree operations. */ struct mutex free_space_lock; - /* Flag indicating this block group is placed on a sequential zone */ - bool seq_zone; - /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 95dec93374ea..a759668477bb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1437,7 +1437,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (num_sequential > 0) - cache->seq_zone = true; + set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { /* Zone capacity is always zone size in emulation */ @@ -1649,7 +1649,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!cache) return false; - ret = cache->seq_zone; + ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; @@ -2154,7 +2154,8 @@ static void btrfs_zone_finish_endio_workfn(struct work_struct *work) void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { - if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) || + eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) return; if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { -- cgit From 19af6a7d345acc885f970d57577fa80ca4ad3d98 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:02:32 +0200 Subject: btrfs: change how repair action is passed to btrfs_repair_one_sector There's a function pointer passed to btrfs_repair_one_sector that will submit the right bio for repair. However there are only two callbacks, for buffered and for direct IO.
This can be simplified to a bool-based switch that calls either function, as the indirect calls are an unnecessary abstraction in this case. This allows removing the submit_bio_hook_t typedef. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +++ fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 14 +++++++++----- fs/btrfs/extent_io.h | 6 +----- fs/btrfs/inode.c | 9 ++++----- 5 files changed, 18 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d21c30bf7053..fa0c72cabd8f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -414,6 +414,9 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e66fa18dbcf7..cf3dc7e501ec 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -196,7 +196,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) refcount_inc(&cb->pending_ios); ret = btrfs_repair_one_sector(inode, bbio, offset, bv.bv_page, bv.bv_offset, - btrfs_submit_data_read_bio); + true); if (ret) { refcount_dec(&cb->pending_ios); status = errno_to_blk_status(ret); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2ec989b83f54..44ff41304247 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -797,7 +797,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook) + bool submit_buffered) { u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; @@ -856,11 +856,15 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, failrec->this_mirror); /* - * At this point we have a bio, so any errors from submit_bio_hook() - * will be handled by the endio on the repair_bio, so we can't return an + * At this point we have a bio, so any errors from bio submission will + * be handled by the endio on the repair_bio, so we can't return an * error here.
*/ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); + if (submit_buffered) + btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); + else + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror, 0); + return BLK_STS_OK; } @@ -951,7 +955,7 @@ static void submit_data_read_repair(struct inode *inode, ret = btrfs_repair_one_sector(inode, failed_bbio, bio_offset + offset, page, pgoff + offset, - btrfs_submit_data_read_bio); + true); if (!ret) { /* * We have submitted the read repair, the page release diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a5ec1475988f..321680f229c6 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,10 +70,6 @@ struct extent_io_tree; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); -typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type); - typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 dio_file_offset); @@ -277,7 +273,7 @@ struct io_failure_record { int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook); + bool submit_buffered); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, unsigned int pg_offset); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1422f072a7b8..8cc1283fb347 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7923,9 +7923,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) bio_endio(&dip->bio); } -static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type) +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type) { struct btrfs_dio_private *dip = btrfs_bio(bio)->private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -7960,8 +7960,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, int ret; ret = btrfs_repair_one_sector(inode, bbio, offset, - bv.bv_page, bv.bv_offset, - submit_dio_repair_bio); + bv.bv_page, bv.bv_offset, false); if (ret) err = errno_to_blk_status(ret); } -- cgit From 7920b773bd8a458c05b2ba4581fe19c5704fffd7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:07:10 +0200 Subject: btrfs: drop parameter compression_type from btrfs_submit_dio_repair_bio Compression and direct io don't work together, so the compression parameter can be dropped after the previous patch changed the call to a direct one.
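Taken together with the previous patch, the repair submission path reduces to roughly this (a condensed sketch of the dispatch inside btrfs_repair_one_sector(), not the verbatim hunk):

        /* The submit_buffered flag replaces the old function pointer. */
        if (submit_buffered)
                btrfs_submit_data_read_bio(inode, repair_bio,
                                           failrec->this_mirror, 0);
        else
                btrfs_submit_dio_repair_bio(inode, repair_bio,
                                            failrec->this_mirror);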
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 4 +--- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index fa0c72cabd8f..62019d7c1cbd 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -414,9 +414,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type); +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 44ff41304247..cc05ae772fa5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -863,7 +863,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, if (submit_buffered) btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror, 0); + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); return BLK_STS_OK; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cc1283fb347..74ed0afe4c2f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7923,9 +7923,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) bio_endio(&dip->bio); } -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type) +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_dio_private *dip = btrfs_bio(bio)->private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- cgit From ab2072b2921eced166dbdec1d65b0284b6eba2b9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:22:19 +0200 Subject: btrfs: change how submit bio callback is passed to btrfs_wq_submit_bio There's a callback function parameter for btrfs_wq_submit_bio that can be one of: metadata, buffered data, direct io data. The callback abstraction is unnecessary as we have all functions available. Replace the parameter with a command that leads to a direct call in run_one_async_start. The called functions can then be simplified, and we can also remove the extent_submit_bio_start_t typedef.
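With the callback gone, the dispatch in run_one_async_start() becomes a plain switch on the command (a sketch matching the diff below):

        switch (async->submit_cmd) {
        case WQ_SUBMIT_METADATA:
                ret = btree_submit_bio_start(async->inode, async->bio,
                                             async->dio_file_offset);
                break;
        case WQ_SUBMIT_DATA:
                ret = btrfs_submit_bio_start(async->inode, async->bio,
                                             async->dio_file_offset);
                break;
        case WQ_SUBMIT_DATA_DIO:
                ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio,
                                                       async->dio_file_offset);
                break;
        }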
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 5 +++++ fs/btrfs/disk-io.c | 33 +++++++++++++++++++++++---------- fs/btrfs/disk-io.h | 12 ++++++++++-- fs/btrfs/extent_io.h | 3 --- fs/btrfs/inode.c | 15 +++++++-------- 5 files changed, 45 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 62019d7c1cbd..72cf235b7beb 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,6 +415,11 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset); +blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, + struct bio *bio, + u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e5687bdff150..394dfe5ff567 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -86,10 +86,10 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) struct async_submit_bio { struct inode *inode; struct bio *bio; - extent_submit_bio_start_t *submit_bio_start; + enum btrfs_wq_submit_cmd submit_cmd; int mirror_num; - /* Optional parameter for submit_bio_start used by direct io */ + /* Optional parameter for used by direct io */ u64 dio_file_offset; struct btrfs_work work; blk_status_t status; @@ -637,8 +637,22 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, - async->dio_file_offset); + switch (async->submit_cmd) { + case WQ_SUBMIT_METADATA: + ret = btree_submit_bio_start(async->inode, async->bio, + async->dio_file_offset); + break; + case WQ_SUBMIT_DATA: + ret = btrfs_submit_bio_start(async->inode, async->bio, + async->dio_file_offset); + + break; + case WQ_SUBMIT_DATA_DIO: + ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, + async->dio_file_offset); + + break; + } if (ret) async->status = ret; } @@ -689,8 +703,7 @@ static void run_one_async_free(struct btrfs_work *work) * - false in case of error */ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; @@ -702,7 +715,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; - async->submit_bio_start = submit_bio_start; + async->submit_cmd = cmd; btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); @@ -736,8 +749,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { /* * when we're called for a 
write, we're already in the async @@ -776,7 +789,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * happen in parallel across all CPUs. */ if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 6edc66b4b4d3..5998b2589830 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -113,9 +113,17 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); + +enum btrfs_wq_submit_cmd { + WQ_SUBMIT_METADATA, + WQ_SUBMIT_DATA, + WQ_SUBMIT_DATA_DIO, +}; + bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); +blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 321680f229c6..b3d4b568fe33 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,9 +70,6 @@ struct extent_io_tree; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); -typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, - struct bio *bio, u64 dio_file_offset); - #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 74ed0afe4c2f..06931c20ebfa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,8 +2550,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } @@ -2758,8 +2758,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(bi->root)) { if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btrfs_submit_bio_start)) + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); @@ -7967,9 +7966,9 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, - struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, + struct bio *bio, + u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); } @@ -8017,7 +8016,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, /* Check btrfs_submit_data_write_bio() for async submit rules */ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && btrfs_wq_submit_bio(inode, bio, 0, 
file_offset, - btrfs_submit_bio_start_direct_io)) + WQ_SUBMIT_DATA_DIO)) return; /* -- cgit From ad65ecf30b037549a17d2e402fac8d39242073d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:30:45 +0200 Subject: btrfs: simplify btree_submit_bio_start and btrfs_submit_bio_start parameters After previous patches the unused parameters can be removed from btree_submit_bio_start and btrfs_submit_bio_start as they don't need to conform to the extent_submit_bio_start_t typedef. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +-- fs/btrfs/disk-io.c | 11 +++-------- fs/btrfs/disk-io.h | 3 +-- fs/btrfs/inode.c | 3 +-- 4 files changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 72cf235b7beb..54bf002e0013 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,8 +415,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset); +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 394dfe5ff567..7bc0fee4da13 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -639,18 +639,14 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); switch (async->submit_cmd) { case WQ_SUBMIT_METADATA: - ret = btree_submit_bio_start(async->inode, async->bio, - async->dio_file_offset); + ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio, - async->dio_file_offset); - + ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, async->dio_file_offset); - break; } if (ret) @@ -749,8 +745,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btree_submit_bio_start(struct bio *bio) { /* * when we're called for a write, we're already in the async diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5998b2589830..d5b25fa8037b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -122,8 +122,7 @@ enum btrfs_wq_submit_cmd { bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); -blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset); +blk_status_t btree_submit_bio_start(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06931c20ebfa..cd9325f51ffd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,8 +2550,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ 
-blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } -- cgit From da67daab8dd679b9a10dee86520c530011342a8d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:40:36 +0200 Subject: btrfs: switch async_submit_bio::inode to btrfs_inode The async bio submit is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7bc0fee4da13..44806ccad43c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -84,7 +84,7 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) * just before they are sent down the IO stack. */ struct async_submit_bio { - struct inode *inode; + struct btrfs_inode *inode; struct bio *bio; enum btrfs_wq_submit_cmd submit_cmd; int mirror_num; @@ -642,11 +642,11 @@ static void run_one_async_start(struct btrfs_work *work) ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio); + ret = btrfs_submit_bio_start(&async->inode->vfs_inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, - async->dio_file_offset); + ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, + async->bio, async->dio_file_offset); break; } if (ret) @@ -665,7 +665,7 @@ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - struct inode *inode = async->inode; + struct btrfs_inode *inode = async->inode; struct btrfs_bio *bbio = btrfs_bio(async->bio); /* If an error occurred we just want to clean up the bio and move on */ @@ -680,7 +680,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. */ async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); + btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -708,7 +708,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, if (!async) return false; - async->inode = inode; + async->inode = BTRFS_I(inode); async->bio = bio; async->mirror_num = mirror_num; async->submit_cmd = cmd; -- cgit From 882681ac98837aa7683a59020c0c98d05479805f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_bio_start The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 54bf002e0013..4ec6a74dd6ba 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,7 +415,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio); +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44806ccad43c..3e42fab48e6b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -642,7 +642,7 @@ static void run_one_async_start(struct btrfs_work *work) ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(&async->inode->vfs_inode, async->bio); + ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cd9325f51ffd..aa55cf6e763e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,9 +2550,9 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio) +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + return btrfs_csum_one_bio(inode, bio, (u64)-1, false); } /* -- cgit From bfa17066822cead4fb8a6a5d4a214a2c527e4614 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_bio_start_direct_io The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4ec6a74dd6ba..a41d4f953bfa 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -416,7 +416,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); -blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3e42fab48e6b..5aacb88069a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -645,7 +645,7 @@ static void run_one_async_start(struct btrfs_work *work) ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, + ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, async->dio_file_offset); break; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa55cf6e763e..06ec84be4b51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7965,11 +7965,11 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); + return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); } static void btrfs_end_dio_bio(struct btrfs_bio *bbio) -- cgit From 5fcdadc2704534ef3a7b99aef65d54183c723493 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_wq_submit_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5aacb88069a7..65c168d6ed77 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -698,17 +698,17 @@ static void run_one_async_free(struct btrfs_work *work) * - true if the work has been succesfuly submitted * - false in case of error */ -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) return false; - async->inode = BTRFS_I(inode); + async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; async->submit_cmd = cmd; @@ -784,7 +784,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * happen in parallel across all CPUs. 
*/ if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) + btrfs_wq_submit_bio(BTRFS_I(inode), bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index d5b25fa8037b..65cf976b74e8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -120,7 +120,7 @@ enum btrfs_wq_submit_cmd { WQ_SUBMIT_DATA_DIO, }; -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); blk_status_t btree_submit_bio_start(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ec84be4b51..a7cbbfc22329 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2757,7 +2757,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(bi->root)) { if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) + btrfs_wq_submit_bio(bi, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); @@ -8014,7 +8014,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_wq_submit_bio(BTRFS_I(inode), bio, 0, file_offset, WQ_SUBMIT_DATA_DIO)) return; -- cgit From 644094fd285499b484e50a8e986965b550a0924e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_metadata_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent_io.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65c168d6ed77..72e3446d9840 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -766,9 +766,9 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t ret; @@ -783,8 +783,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * Kthread helpers are used to submit writes so that checksumming can * happen in parallel across all CPUs. 
*/ - if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(BTRFS_I(inode), bio, mirror_num, 0, WQ_SUBMIT_METADATA)) + if (should_async_write(fs_info, inode) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 65cf976b74e8..f2c507fd0e04 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -86,7 +86,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cc05ae772fa5..e770cbc5cb6a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -134,7 +134,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; if (!is_data_inode(inode)) - btrfs_submit_metadata_bio(inode, bio, mirror_num); + btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(inode, bio, mirror_num); else -- cgit From 535a7e5d6b7e04da0336c632c834a575e82b5082 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_data_write_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 17 ++++++++--------- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a41d4f953bfa..01fc62d39ed2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -411,7 +411,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e770cbc5cb6a..13fba51be32d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -136,7 +136,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) if (!is_data_inode(inode)) btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_submit_data_write_bio(inode, bio, mirror_num); + btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); else btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7cbbfc22329..538f55088bdf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2730,14 +2730,13 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) +void 
btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(bi, bio, + ret = extract_ordered_extent(inode, bio, page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); @@ -2753,14 +2752,14 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro * Csum items for reloc roots have already been cloned at this point, * so they are handled as part of the no-checksum case. */ - if (!(bi->flags & BTRFS_INODE_NODATASUM) && + if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(bi->root)) { - if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(bi, bio, mirror_num, 0, WQ_SUBMIT_DATA)) + !btrfs_is_data_reloc_root(inode->root)) { + if (!atomic_read(&inode->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; - ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); + ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; -- cgit From b762041629e73d8e24a10aae4bf256774fbbd082 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_data_read_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 5 +++-- fs/btrfs/inode.c | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 01fc62d39ed2..5406a2f817b2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -412,7 +412,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT_VALUE(size, bytes) size, bytes void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 13fba51be32d..ac69e5f1605c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -138,7 +138,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); else - btrfs_submit_data_read_bio(inode, bio, mirror_num, + btrfs_submit_data_read_bio(BTRFS_I(inode), bio, mirror_num, bio_ctrl->compress_type); /* The bio is owned by the end_io handler now */ @@ -861,7 +861,8 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * error here. 
*/ if (submit_buffered) - btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); + btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, + failrec->this_mirror, 0); else btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 538f55088bdf..e3bbd19edea3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2768,10 +2768,10 @@ void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int btrfs_submit_bio(fs_info, bio, mirror_num); } -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (compress_type != BTRFS_COMPRESS_NONE) { @@ -2779,7 +2779,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * btrfs_submit_compressed_read will handle completing the bio * if there were any errors, so just return here. */ - btrfs_submit_compressed_read(inode, bio, mirror_num); + btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); return; } @@ -2790,7 +2790,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * Lookup bio sums does extra checks around whether we need to csum or * not, which is why we ignore skip_sum here. */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; -- cgit From d781c1c315ce649002a9fd0387edc3ee93271743 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_dio_repair_bio The function is for internal interfaces so we should use the btrfs_inode. 
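A side effect worth noting: with a typed parameter, fs_info no longer has to be fetched through the superblock. A hedged one-function sketch (the function is hypothetical; both expressions appear verbatim in the surrounding diffs):

static struct btrfs_fs_info *sketch_fs_info(struct btrfs_inode *inode)
{
	/* Replaces the VFS route btrfs_sb(inode->vfs_inode.i_sb). */
	return inode->root->fs_info;
}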
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 3 ++- fs/btrfs/inode.c | 5 ++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5406a2f817b2..9fe1a11a2eb3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -414,7 +414,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ac69e5f1605c..bddda397e438 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -864,7 +864,8 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, failrec->this_mirror, 0); else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); + btrfs_submit_dio_repair_bio(BTRFS_I(inode), repair_bio, + failrec->this_mirror); return BLK_STS_OK; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e3bbd19edea3..f0b4f3a9245f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7920,15 +7920,14 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) bio_endio(&dip->bio); } -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); BUG_ON(bio_op(bio) == REQ_OP_WRITE); refcount_inc(&dip->refs); - btrfs_submit_bio(fs_info, bio, mirror_num); + btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, -- cgit From c5ca391b0dd82d4c0457159286bd19f2364a70be Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to submit_one_bio The function is for internal interfaces so we should use the btrfs_inode. 
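submit_one_bio receives no inode argument at all and recovers the owner from the bio. A simplified sketch of the lookup the diff below performs (function name hypothetical):

static struct btrfs_inode *sketch_bio_owner(struct bio *bio)
{
	struct bio_vec *bv = bio_first_bvec_all(bio);

	/*
	 * All pages queued on a buffered bio belong to one mapping,
	 * so the first bio_vec identifies the owning inode.
	 */
	return BTRFS_I(bv->bv_page->mapping->host);
}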
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bddda397e438..192f604ac7ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -117,7 +117,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct inode *inode; + struct btrfs_inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -125,7 +125,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = bv->bv_page->mapping->host; + inode = BTRFS_I(bv->bv_page->mapping->host); mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ @@ -133,12 +133,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - if (!is_data_inode(inode)) - btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); + if (!is_data_inode(&inode->vfs_inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); + btrfs_submit_data_write_bio(inode, bio, mirror_num); else - btrfs_submit_data_read_bio(BTRFS_I(inode), bio, mirror_num, + btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); /* The bio is owned by the end_io handler now */ -- cgit From d8f9268ece91e6fe887b444780787828890a8051 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_repair_one_sector The function is for internal interfaces so we should use the btrfs_inode. 
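While the series is mid-flight, converted functions still call not-yet-converted ones through the embedded VFS inode. A hedged sketch (legacy_helper is a stand-in, not a real btrfs function):

static void legacy_helper(struct inode *inode)
{
	/* Stands in for any helper still taking struct inode *. */
}

static void sketch_mixed_call(struct btrfs_inode *inode)
{
	/* Converted -> unconverted: unwrap the embedded inode ... */
	legacy_helper(&inode->vfs_inode);
	/* ... unconverted -> converted goes the other way via BTRFS_I(). */
}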
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 17 ++++++++--------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index cf3dc7e501ec..6f5ad0d6c409 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -194,7 +194,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) int ret; refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(inode, bbio, offset, + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, bv.bv_page, bv.bv_offset, true); if (ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 192f604ac7ea..05768f7f7872 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -795,13 +795,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, bool submit_buffered) { u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; @@ -812,7 +812,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); + failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); @@ -830,7 +830,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, btrfs_debug(fs_info, "failed to repair num_copies %d this_mirror %d failed_mirror %d", failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(inode, failrec); return -EIO; } @@ -851,7 +851,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, bio_add_page(repair_bio, page, failrec->len, pgoff); repair_bbio->iter = repair_bio->bi_iter; - btrfs_debug(btrfs_sb(inode->i_sb), + btrfs_debug(fs_info, "repair read error: submitting new read to mirror %d", failrec->this_mirror); @@ -861,11 +861,10 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * error here. 
*/ if (submit_buffered) - btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, + btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); else - btrfs_submit_dio_repair_bio(BTRFS_I(inode), repair_bio, - failrec->this_mirror); + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); return BLK_STS_OK; } @@ -955,7 +954,7 @@ static void submit_data_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bbio, + ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, bio_offset + offset, page, pgoff + offset, true); if (!ret) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b3d4b568fe33..805e262811b4 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -268,7 +268,7 @@ struct io_failure_record { int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, bool submit_buffered); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f0b4f3a9245f..4a2fec637714 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7953,7 +7953,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, } else { int ret; - ret = btrfs_repair_one_sector(inode, bbio, offset, + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, bv.bv_page, bv.bv_offset, false); if (ret) err = errno_to_blk_status(ret); -- cgit From e2884c3d445669c231fd8ca4c6fd160e729ec862 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: switch btrfs_dio_private::inode to btrfs_inode The btrfs_dio_private structure is for internal interfaces so we should use the btrfs_inode. 
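The same idea applies one level down, to struct members. A condensed sketch of the before/after access patterns (struct name and field set trimmed; not the real definition):

struct sketch_dio_private {
	struct btrfs_inode *inode;	/* was: struct inode * */
	u64 file_offset;
	u64 bytes;
};

static void sketch_dio_use(struct sketch_dio_private *dip)
{
	/* btrfs state is now one dereference shorter ... */
	struct btrfs_root *root = dip->inode->root;
	/* ... and the VFS view stays reachable when needed. */
	struct inode *vfs_inode = &dip->inode->vfs_inode;

	(void)root;
	(void)vfs_inode;
}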
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4a2fec637714..02100bb9c2a2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,7 +84,7 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct inode *inode; + struct btrfs_inode *inode; /* * Since DIO can use anonymous page, we cannot use page_offset() to @@ -7907,11 +7907,11 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + btrfs_mark_ordered_io_finished(dip->inode, NULL, dip->file_offset, dip->bytes, !dip->bio.bi_status); } else { - unlock_extent(&BTRFS_I(dip->inode)->io_tree, + unlock_extent(&dip->inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); } @@ -7934,7 +7934,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, struct btrfs_bio *bbio, const bool uptodate) { - struct inode *inode = dip->inode; + struct inode *inode = &dip->inode->vfs_inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); blk_status_t err = BLK_STS_OK; @@ -7977,9 +7977,9 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) blk_status_t err = bio->bi_status; if (err) - btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + btrfs_warn(dip->inode->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), + btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -7989,7 +7989,7 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) if (err) dip->bio.bi_status = err; - btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); + btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8056,7 +8056,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip->inode = inode; + dip->inode = BTRFS_I(inode); dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; refcount_set(&dip->refs, 1); -- cgit From bb41632ea7d252855f7a12f269afefc18a84e07b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_dio_bio The function is for internal interfaces so we should use the btrfs_inode. 
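The dio submit path mostly gains direct flag checks. A rough sketch of the shape the diff below gives it (heavily simplified; error handling and the async-submit branch omitted):

static void sketch_dio_submit(struct btrfs_inode *inode, struct bio *bio,
			      u64 file_offset)
{
	/* NODATASUM inodes skip checksumming entirely. */
	if (inode->flags & BTRFS_INODE_NODATASUM)
		goto map;

	/* Otherwise checksum synchronously before mapping the bio. */
	btrfs_csum_one_bio(inode, bio, file_offset, false);
map:
	btrfs_submit_bio(inode->root->fs_info, bio, 0);
}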
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 02100bb9c2a2..b1369f645f1b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7995,10 +7995,10 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) btrfs_dio_private_put(dip); } -static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, +static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, u64 file_offset, int async_submit) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_dio_private *dip = btrfs_bio(bio)->private; blk_status_t ret; @@ -8006,13 +8006,13 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (btrfs_op(bio) == BTRFS_MAP_READ) btrfs_bio(bio)->iter = bio->bi_iter; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) goto map; if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && - btrfs_wq_submit_bio(BTRFS_I(inode), bio, 0, file_offset, + if (async_submit && !atomic_read(&inode->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, WQ_SUBMIT_DATA_DIO)) return; @@ -8020,7 +8020,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); + ret = btrfs_csum_one_bio(inode, bio, file_offset, false); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; @@ -8142,7 +8142,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); + btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; -- cgit From d9dcae67b7fe1f8c7d974a7a002c81d9b02b7281 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_truncate The function is for internal interfaces so we should use the btrfs_inode. 
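The trade-off visible in this patch: purely-VFS state (i_size, i_nlink, timestamps) still lives in the embedded inode and must be spelled out. A tiny sketch (hypothetical function name; the access pattern is as in the diff):

static loff_t sketch_isize(const struct btrfs_inode *inode)
{
	/* Generic VFS fields are reached through vfs_inode. */
	return inode->vfs_inode.i_size;
}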
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1369f645f1b..f612c8b9d4bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -124,7 +124,7 @@ static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); -static int btrfs_truncate(struct inode *inode, bool skip_writeback); +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -5270,7 +5270,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode_dio_wait(inode); - ret = btrfs_truncate(inode, newsize == oldsize); + ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { int err; @@ -8632,16 +8632,16 @@ out_noreserve: return ret; } -static int btrfs_truncate(struct inode *inode, bool skip_writeback) +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) { struct btrfs_truncate_control control = { - .inode = BTRFS_I(inode), - .ino = btrfs_ino(BTRFS_I(inode)), + .inode = inode, + .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *rsv; int ret; struct btrfs_trans_handle *trans; @@ -8649,7 +8649,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + ret = btrfs_wait_ordered_range(&inode->vfs_inode, + inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) return ret; @@ -8708,34 +8709,32 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; - const u64 new_size = inode->i_size; + const u64 new_size = inode->vfs_inode.i_size; const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. 
*/ - btrfs_drop_extent_map_range(BTRFS_I(inode), + btrfs_drop_extent_map_range(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); - inode_sub_bytes(inode, control.sub_bytes); - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -8766,7 +8765,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -8774,14 +8773,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret2 = btrfs_update_inode(trans, root, inode); if (ret2 && !ret) ret = ret2; @@ -8807,7 +8806,7 @@ out: * extents beyond i_size to drop. */ if (control.extents_found > 0) - btrfs_set_inode_full_sync(BTRFS_I(inode)); + btrfs_set_inode_full_sync(inode); return ret; } -- cgit From 29b6352b1494bd7ec6a14bda087f8eb858d2fc1f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_inode_lock The function is for internal interfaces so we should use the btrfs_inode. 
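As a usage sketch of the converted locking API, here is a non-blocking shared acquisition in the style of the nowait paths (the function names and flags are real, the surrounding function is hypothetical, and the unlock signature is the one the following patches settle on):

static int sketch_try_shared_read(struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY);
	if (ret)	/* -EAGAIN: contended, caller may retry blocking */
		return ret;

	/* ... read-side work under inode_lock_shared() ... */

	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	return 0;
}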
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/defrag.c | 4 ++-- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/file.c | 16 ++++++++-------- fs/btrfs/inode.c | 14 +++++++------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/reflink.c | 2 +- fs/btrfs/relocation.c | 2 +- 8 files changed, 22 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9fe1a11a2eb3..a06d1c0a0cc2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -545,7 +545,7 @@ enum btrfs_ilock_type { ENUM_BIT(BTRFS_ILOCK_MMAP), }; -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 919dfe0f7e50..6aade4838927 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1295,7 +1295,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; btrfs_inode_unlock(inode, 0); @@ -1351,7 +1351,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = sectors_defragged; } if (do_compress) { - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c024f97de9e0..4edf44d8cd9e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1647,7 +1647,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * item->readdir_list. 
*/ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f8be9d629e75..b3b7b276cce0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1193,7 +1193,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; - ret = btrfs_inode_lock(inode, ilock_flags); + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret; @@ -1468,7 +1468,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ilock_flags |= BTRFS_ILOCK_SHARED; relock: - err = btrfs_inode_lock(inode, ilock_flags); + err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (err < 0) return err; @@ -1616,7 +1616,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, loff_t count; ssize_t ret; - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); count = encoded->len; ret = generic_write_checks_count(iocb, &count); if (ret == 0 && count != encoded->len) { @@ -1806,7 +1806,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -2596,7 +2596,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -3054,7 +3054,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3691,7 +3691,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); offset = find_desired_extent(BTRFS_I(inode), offset, whence); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); break; @@ -3748,7 +3748,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); again: /* * This is similar to what we do for direct IO writes, see the comment diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f612c8b9d4bc..9b0f84c85bef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -172,27 +172,27 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock_shared(inode)) + if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock_shared(inode); + 
inode_lock_shared(&inode->vfs_inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock(inode)) + if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock(inode); + inode_lock(&inode->vfs_inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) - down_write(&BTRFS_I(inode)->i_mmap_lock); + down_write(&inode->i_mmap_lock); return 0; } @@ -10529,7 +10529,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, file_accessed(iocb->ki_filp); - btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6f9f8e8dfea6..a912e1cb283d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2528,7 +2528,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); err = btrfs_delete_subvolume(dir, dentry); btrfs_inode_unlock(inode, 0); if (!err) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9d728107536e..c62c7fdd55d9 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -893,7 +893,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { lock_two_nondirectories(src_inode, dst_inode); btrfs_double_mmap_lock(src_inode, dst_inode); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2ecca24e1001..f34cb92f3151 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2874,7 +2874,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; - btrfs_inode_lock(&inode->vfs_inode, 0); + btrfs_inode_lock(inode, 0); for (nr = 0; nr < cluster->nr; nr++) { struct extent_state *cached_state = NULL; -- cgit From e5d4d75bd3241d0a5990060ef9601bf17adf9bb5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_inode_unlock The function is for internal interfaces so we should use the btrfs_inode. 
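BTRFS_ILOCK_MMAP pairs the VFS inode lock with the btrfs i_mmap_lock, and lock/unlock must be called with matching flags. A rough sketch of the pairing (hypothetical function; ordering as in the implementations quoted here):

static void sketch_mmap_critical_section(struct btrfs_inode *inode)
{
	/* Takes inode_lock(), then down_write(&inode->i_mmap_lock). */
	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);

	/* ... punch-hole / fallocate style extent manipulation ... */

	/* Releases i_mmap_lock first, then inode_unlock(). */
	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
}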
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/defrag.c | 8 ++++---- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/file.c | 32 ++++++++++++++++---------------- fs/btrfs/inode.c | 18 +++++++++--------- fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/reflink.c | 2 +- fs/btrfs/relocation.c | 2 +- 8 files changed, 36 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a06d1c0a0cc2..279bbbc59dc6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -546,7 +546,7 @@ enum btrfs_ilock_type { }; int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 6aade4838927..0a3c261b69c9 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1298,11 +1298,11 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, btrfs_inode_lock(BTRFS_I(inode), 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); break; } if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); break; } if (do_compress) @@ -1315,7 +1315,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (sectors_defragged > prev_sectors_defragged) balance_dirty_pages_ratelimited(inode->i_mapping); - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); @@ -1353,7 +1353,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (do_compress) { btrfs_inode_lock(BTRFS_I(inode), 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); } return ret; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 4edf44d8cd9e..0095c6e4c3d1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1646,7 +1646,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. */ - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); btrfs_inode_lock(BTRFS_I(inode), 0); mutex_lock(&delayed_node->mutex); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b3b7b276cce0..e6c93be91a06 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1428,7 +1428,7 @@ again: iocb->ki_pos += num_written; } out: - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return num_written ? 
num_written : ret; } @@ -1474,13 +1474,13 @@ relock: err = generic_write_checks(iocb, from); if (err <= 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return err; } err = btrfs_write_check(iocb, from, err); if (err < 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; } @@ -1491,13 +1491,13 @@ relock: */ if ((ilock_flags & BTRFS_ILOCK_SHARED) && pos + iov_iter_count(from) > i_size_read(inode)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ilock_flags &= ~BTRFS_ILOCK_SHARED; goto relock; } if (check_direct_IO(fs_info, from, pos)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } @@ -1528,7 +1528,7 @@ relock: * iocb, and that needs to lock the inode. So unlock it before calling * iomap_dio_complete() to avoid a deadlock. */ - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); if (IS_ERR_OR_NULL(dio)) err = PTR_ERR_OR_ZERO(dio); @@ -1635,7 +1635,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_do_encoded_write(iocb, from, encoded); out: - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); return ret; } @@ -1830,7 +1830,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -1933,7 +1933,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. 
*/ - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); @@ -2001,7 +2001,7 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -2644,7 +2644,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } } @@ -2743,7 +2743,7 @@ out_only_mutex: ret = ret2; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3104,7 +3104,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3202,7 +3202,7 @@ out_unlock: unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); return ret; } @@ -3693,7 +3693,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_HOLE: btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); offset = find_desired_extent(BTRFS_I(inode), offset, whence); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } @@ -3797,7 +3797,7 @@ again: goto again; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); return ret < 0 ? ret : read; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b0f84c85bef..60cc23997216 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -202,14 +202,14 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. 
*/ -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) - up_write(&BTRFS_I(inode)->i_mmap_lock); + up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) - inode_unlock_shared(inode); + inode_unlock_shared(&inode->vfs_inode); else - inode_unlock(inode); + inode_unlock(&inode->vfs_inode); } /* @@ -10277,7 +10277,7 @@ static ssize_t btrfs_encoded_read_inline( read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); unlock_extent(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; ret = copy_to_iter(tmp, count, iter); @@ -10480,7 +10480,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; unlock_extent(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; if (compressed) { @@ -10532,7 +10532,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return 0; } start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); @@ -10630,7 +10630,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (disk_bytenr == EXTENT_MAP_HOLE) { unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) @@ -10653,7 +10653,7 @@ out_unlock_extent: unlock_extent(io_tree, start, lockend, &cached_state); out_unlock_inode: if (!unlocked) - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a912e1cb283d..1832d3011d12 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1002,7 +1002,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); return error; } @@ -2530,14 +2530,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, btrfs_inode_lock(BTRFS_I(inode), 0); err = btrfs_delete_subvolume(dir, dentry); - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); if (!err) d_delete_notify(dir, dentry); out_dput: dput(dentry); out_unlock_dir: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c62c7fdd55d9..0474bbe39da7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -911,7 +911,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, out_unlock: if (same_inode) { - btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f34cb92f3151..d75b18e84285 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2894,7 +2894,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) break; } - 
btrfs_inode_unlock(&inode->vfs_inode, 0); + btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, -- cgit From 7152b425da54b6aa6ddfb0cbc0603614cc2400bd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_dirty_inode The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 60cc23997216..dc06511c359a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -286,7 +286,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } -static int btrfs_dirty_inode(struct inode *inode); +static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) @@ -5313,7 +5313,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (attr->ia_valid) { setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(inode); + err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); @@ -6144,21 +6144,21 @@ err: * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -static int btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret; - if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) + if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) return 0; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); @@ -6166,10 +6166,10 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); } btrfs_end_transaction(trans); - if (BTRFS_I(inode)->delayed_node) + if (inode->delayed_node) btrfs_balance_delayed_items(fs_info); return ret; @@ -6196,7 +6196,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; - return dirty ? btrfs_dirty_inode(inode) : 0; + return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } /* -- cgit From 82ca5a04f03fa1e74a5034ec61ac86d9c817a2b9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_add_delalloc_inodes The function is for internal interfaces so we should use the btrfs_inode. 
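Condensed from the diff below, the payoff is the removal of repeated BTRFS_I() wrapping at each field access (hypothetical function names; the field accesses are as in the patch):

/* Before: every access re-derives the btrfs inode. */
static void sketch_before(struct inode *inode)
{
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes))
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
}

/* After: the parameter type already says what it is. */
static void sketch_after(struct btrfs_inode *inode)
{
	if (list_empty(&inode->delalloc_inodes))
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&inode->runtime_flags);
}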
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc06511c359a..d9d7e11d8d9a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2369,16 +2369,14 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); + if (list_empty(&inode->delalloc_inodes)) { + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); @@ -2391,7 +2389,6 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, spin_unlock(&root->delalloc_lock); } - void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { @@ -2458,7 +2455,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); + btrfs_add_delalloc_inodes(root, BTRFS_I(inode)); spin_unlock(&BTRFS_I(inode)->lock); } -- cgit From 36eeaef5595dc09f1dc4c859d96ffb3cd69dee55 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:22:15 +0200 Subject: btrfs: switch btrfs_writepage_fixup::inode to btrfs_inode The btrfs_writepage_fixup structure is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d9d7e11d8d9a..06ea43460e90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2890,7 +2890,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2909,7 +2909,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; - inode = BTRFS_I(fixup->inode); + inode = fixup->inode; page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; @@ -3068,7 +3068,7 @@ int btrfs_writepage_cow_fixup(struct page *page) get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - fixup->inode = inode; + fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; -- cgit From 621af94af3342c9a8c3df34b4231d7707accd00e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_check_data_csum The function is for internal interfaces so we should use the btrfs_inode. 
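A usage sketch for the converted verifier, modeled on the per-sector loops in the callers below (simplified: assumes one full page sitting at bio offset 0; the sketch_* name is hypothetical):

static int sketch_verify_page(struct btrfs_inode *inode,
			      struct btrfs_bio *bbio, struct page *page)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u32 pgoff;

	for (pgoff = 0; pgoff < PAGE_SIZE; pgoff += sectorsize) {
		/*
		 * Non-zero return means a csum mismatch; the callee
		 * reports the error and zeroes the bad range.
		 */
		if (btrfs_check_data_csum(inode, bbio, pgoff, page, pgoff))
			return -EIO;
	}
	return 0;
}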
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 4 +--- fs/btrfs/compression.c | 2 +- fs/btrfs/inode.c | 15 +++++++-------- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 279bbbc59dc6..ba0dbdc91ec5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -421,13 +421,11 @@ blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6f5ad0d6c409..42e6dde2ad59 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -186,7 +186,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) u64 start = bbio->file_offset + offset; if (!status && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, + (!csum || !btrfs_check_data_csum(bi, bbio, offset, bv.bv_page, bv.bv_offset))) { btrfs_clean_io_failure(bi, start, bv.bv_page, bv.bv_offset); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ea43460e90..76e2bea2270a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3490,10 +3490,10 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of * When csum mismatch is detected, we will also report the error and fill the * corrupted range with zero. 
(Thus it needs the extra parameters) */ -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u32 len = fs_info->sectorsize; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; @@ -3507,8 +3507,7 @@ int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), - bbio->file_offset + bio_offset, + btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, @@ -3573,7 +3572,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); + ret = btrfs_check_data_csum(BTRFS_I(inode), bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -7943,8 +7942,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, u64 start = bbio->file_offset + offset; if (uptodate && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, - bv.bv_offset))) { + (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, + bv.bv_page, bv.bv_offset))) { btrfs_clean_io_failure(BTRFS_I(inode), start, bv.bv_page, bv.bv_offset); } else { @@ -10334,7 +10333,7 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + if (btrfs_check_data_csum(inode, bbio, bio_offset, bvec->bv_page, pgoff)) return BLK_STS_IOERR; bio_offset += sectorsize; -- cgit From e569b1d545511492d5fc1a062ef8b63fd1b78d84 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to __unlink_start_trans The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76e2bea2270a..ffb42560acd5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4415,9 +4415,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur. 
*/ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) +static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; /* * 1 for the possible orphan item @@ -4443,7 +4443,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(dir); + trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fscrypt_free; @@ -4857,7 +4857,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(dir); + trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_notrans; -- cgit From 3c4f91e23a87a486426db5e481ed315b1b2640f1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_delete_subvolume The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/inode.c | 10 +++++----- fs/btrfs/ioctl.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ba0dbdc91ec5..481c75c47fc4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -439,7 +439,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index); -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ffb42560acd5..5e2fd24b6951 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4698,10 +4698,10 @@ again: spin_unlock(&root->inode_lock); } -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4758,9 +4758,9 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + btrfs_record_snapshot_destroy(trans, dir); - ret = btrfs_unlink_subvol(trans, dir, dentry); + ret = btrfs_unlink_subvol(trans, &dir->vfs_inode, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4848,7 +4848,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(dir, dentry); + return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1832d3011d12..7a9e697d9931 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2529,7 +2529,7 @@ static noinline int 
btrfs_ioctl_snap_destroy(struct file *file, } btrfs_inode_lock(BTRFS_I(inode), 0); - err = btrfs_delete_subvolume(dir, dentry); + err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); btrfs_inode_unlock(BTRFS_I(inode), 0); if (!err) d_delete_notify(dir, dentry); -- cgit From 35da5a7edec32935135737608c9d9da24a419bab Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:47:06 +0200 Subject: btrfs: drop private_data parameter from extent_io_tree_init All callers except one pass NULL, so the parameter can be dropped and the inode::io_tree initialization can be open coded. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/extent-io-tree.c | 5 ++--- fs/btrfs/extent-io-tree.h | 3 +-- fs/btrfs/inode.c | 5 +++-- fs/btrfs/relocation.c | 3 +-- fs/btrfs/tests/btrfs-tests.c | 2 +- fs/btrfs/tests/extent-io-tests.c | 4 ++-- fs/btrfs/transaction.c | 4 ++-- fs/btrfs/volumes.c | 3 +-- 9 files changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 72e3446d9840..e602e0b4c25d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1044,9 +1044,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); + IO_TREE_ROOT_DIRTY_LOG_PAGES); extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE, NULL); + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -2250,7 +2250,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO, NULL); + IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); @@ -3074,7 +3074,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS, NULL); + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 2cdff74ff033..81365870576f 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -94,13 +94,12 @@ struct tree_entry { }; void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) + struct extent_io_tree *tree, unsigned int owner) { tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->private_data = private_data; + tree->private_data = NULL; tree->owner = owner; if (owner == IO_TREE_INODE_FILE_EXTENT) lockdep_set_class(&tree->lock, &file_extent_tree_class); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index d73ef24bad2e..0e16642c28a3 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -104,8 +104,7 @@ struct extent_state { }; void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data); + struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e2fd24b6951..29c6c0ec0016 100644 --- a/fs/btrfs/inode.c 
+++ b/fs/btrfs/inode.c @@ -8871,9 +8871,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + ei->io_tree.private_data = inode; extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT, NULL); + IO_TREE_INODE_FILE_EXTENT); ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d75b18e84285..1440cb332a5a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3928,8 +3928,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->dirty_subvol_roots); btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(fs_info, &rc->processed_blocks, - IO_TREE_RELOC_BLOCKS, NULL); + extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 669fa7133a2f..181469fc0bb3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -102,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(NULL, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 350da449db08..dfc5c7fa6038 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -132,7 +132,7 @@ static int test_find_delalloc(u32 sectorsize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin @@ -489,7 +489,7 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2e2dd2ea109b..b8c52e89688c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -378,9 +378,9 @@ loop: spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES, NULL); + IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS, NULL); + IO_TREE_FS_PINNED_EXTENTS); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7751ab620761..acb97494bb52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7048,8 +7048,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, - IO_TREE_DEVICE_ALLOC_STATE, NULL); + 
extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; -- cgit From 0988fc7bda79feff046056f032fd149ab08e03d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:55:51 +0200 Subject: btrfs: switch extent_io_tree::private_data to btrfs_inode and rename The extent_io_tree::private_data was meant to be a preparatory work for the metadata inode rework but that never materialized. Now it's used only for an inode so it's better to change the appropriate type and rename it. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 32 ++++++++++++++++---------------- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/inode.c | 2 +- 3 files changed, 19 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 81365870576f..bbcc65593d1d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -58,17 +58,17 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_inode *inode = tree->inode; u64 isize; if (!inode) return; - isize = i_size_read(inode); + isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + btrfs_debug_rl(inode->root->fs_info, "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + caller, btrfs_ino(inode), isize, start, end); } } #else @@ -99,7 +99,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->private_data = NULL; + tree->inode = NULL; tree->owner = owner; if (owner == IO_TREE_INODE_FILE_EXTENT) lockdep_set_class(&tree->lock, &file_extent_tree_class); @@ -346,9 +346,9 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) other = prev_state(state); if (other && other->end == state->start - 1 && other->state == state->state) { - if (tree->private_data) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); + if (tree->inode) + btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, + other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -357,8 +357,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) other = next_state(state); if (other && other->start == state->end + 1 && other->state == state->state) { - if (tree->private_data) - btrfs_merge_delalloc_extent(tree->private_data, state, + if (tree->inode) + btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); @@ -374,8 +374,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (tree->private_data) - btrfs_set_delalloc_extent(tree->private_data, state, bits); + if (tree->inode) + btrfs_set_delalloc_extent(&tree->inode->vfs_inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -462,8 +462,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (tree->private_data) - btrfs_split_delalloc_extent(tree->private_data, orig, split); + if (tree->inode) + 
btrfs_split_delalloc_extent(&tree->inode->vfs_inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -510,8 +510,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (tree->private_data) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); + if (tree->inode) + btrfs_clear_delalloc_extent(&tree->inode->vfs_inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 0e16642c28a3..cdee8c0854c8 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -80,7 +80,8 @@ enum { struct extent_io_tree { struct rb_root state; struct btrfs_fs_info *fs_info; - void *private_data; + /* Inode associated with this tree, or NULL. */ + struct btrfs_inode *inode; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29c6c0ec0016..23ec33ac86e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8872,7 +8872,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); - ei->io_tree.private_data = inode; + ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); ei->io_failure_tree = RB_ROOT; -- cgit From 2454151cdede9f6f3a2213ae84b07d9a9edb485e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_merge_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 6 ++---- fs/btrfs/inode.c | 16 ++++++++-------- 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 481c75c47fc4..130c95c6f7df 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -475,7 +475,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index bbcc65593d1d..942212e1dbaf 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -347,8 +347,7 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) if (other && other->end == state->start - 1 && other->state == state->state) { if (tree->inode) - btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, - other); + btrfs_merge_delalloc_extent(tree->inode, state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -358,8 +357,7 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) if (other && other->start == state->end + 1 && other->state == state->state) { if (tree->inode) - btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, - other); + btrfs_merge_delalloc_extent(tree->inode, state, 
other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 23ec33ac86e0..a2de63d38b1a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2314,10 +2314,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. */ -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 new_size, old_size; u32 num_extents; @@ -2332,9 +2332,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); return; } @@ -2363,9 +2363,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, if (count_max_extents(fs_info, new_size) >= num_extents) return; - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, -- cgit From 4c5d166f6b3674473905ff133daea4de33f00b91 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_set_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. 
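One recurring simplification in these conversions shows up in the first hunk of the diff below: with only a VFS inode, the filesystem-wide state must be reached via the superblock, while a btrfs_inode reaches it directly through its cached root. Both forms resolve to the same struct btrfs_fs_info; the fragment is illustrative only:

    /* From a struct inode: detour through the VFS superblock. */
    struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

    /* From a struct btrfs_inode: the root already caches fs_info. */
    struct btrfs_fs_info *fs_info = inode->root->fs_info;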
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 33 ++++++++++++++++----------------- 3 files changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 130c95c6f7df..8a8280233199 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -471,7 +471,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); - void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 942212e1dbaf..1b4c52d7201d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -373,7 +373,7 @@ static void set_state_bits(struct extent_io_tree *tree, int ret; if (tree->inode) - btrfs_set_delalloc_extent(&tree->inode->vfs_inode, state, bits); + btrfs_set_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2de63d38b1a..fae41b954373 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2421,10 +2421,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. */ -void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, +void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); @@ -2434,14 +2434,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); + bool do_list = !btrfs_is_free_space_inode(inode); - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, num_extents); + spin_unlock(&inode->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -2449,22 +2449,21 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->delalloc_bytes += len; + spin_lock(&inode->lock); + inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) - BTRFS_I(inode)->defrag_bytes += len; + inode->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) - btrfs_add_delalloc_inodes(root, BTRFS_I(inode)); - spin_unlock(&BTRFS_I(inode)->lock); + 
&inode->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); + spin_unlock(&inode->lock); } if (!(state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - - state->start; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->new_delalloc_bytes += state->end + 1 - state->start; + spin_unlock(&inode->lock); } } -- cgit From 62798a491561e1e1bbbd08873561532f19525fbf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_split_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8a8280233199..ddf1867ba6d5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -477,7 +477,7 @@ void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); -void btrfs_split_delalloc_extent(struct inode *inode, +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 1b4c52d7201d..124aede5e492 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -461,7 +461,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node **node; if (tree->inode) - btrfs_split_delalloc_extent(&tree->inode->vfs_inode, orig, split); + btrfs_split_delalloc_extent(tree->inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fae41b954373..a43d7a063807 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2277,10 +2277,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page return ret; } -void btrfs_split_delalloc_extent(struct inode *inode, +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; /* not delalloc, ignore it */ @@ -2304,9 +2304,9 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; } - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); } /* -- cgit From bd54766e40df1300564babbbc31866d36c0c4bb6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_clear_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. 
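The delalloc callbacks being converted in this run (set/clear/merge/split) all keep the inode's outstanding extent count in sync with how many maximum-sized extents a dirty range could become; count_max_extents(), visible in the diff below, is a ceiling division by fs_info->max_extent_size. A rough user-space model of that arithmetic, assuming the usual 128M maximum extent size:

    #include <stdint.h>
    #include <stdio.h>

    /* ceil(size / max): extents a contiguous delalloc range may need */
    static uint32_t count_max_extents(uint64_t size, uint64_t max)
    {
            return (size + max - 1) / max;
    }

    int main(void)
    {
            const uint64_t max = 128ULL << 20;      /* assumed 128M */

            /* One 128M range needs 1 reservation; split into two halves
             * it needs 1 + 1, hence the +1 in the split callback. */
            printf("%u\n", count_max_extents(128ULL << 20, max));  /* 1 */
            printf("%u\n", count_max_extents(64ULL << 20, max));   /* 1 */

            /* Merging two 96M ranges yields 192M, which still needs 2
             * extents, so the merge callback releases nothing there. */
            printf("%u\n", count_max_extents(192ULL << 20, max));  /* 2 */
            return 0;
    }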
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ddf1867ba6d5..9e31dc8b0285 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -473,7 +473,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); -void btrfs_clear_delalloc_extent(struct inode *inode, +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 124aede5e492..285b0ff6e953 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -509,7 +509,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, int ret; if (tree->inode) - btrfs_clear_delalloc_extent(&tree->inode->vfs_inode, state, bits); + btrfs_clear_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a43d7a063807..59ec9b2ef508 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2471,11 +2471,10 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * Once a range is no longer delalloc this function ensures that proper * accounting happens. */ -void btrfs_clear_delalloc_extent(struct inode *vfs_inode, +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_inode *inode = BTRFS_I(vfs_inode); - struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); -- cgit From 5b7544cb06fef21e4f8c189082a7bf02fd9832a1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_unlink_subvol The function is for internal interfaces so we should use the btrfs_inode. 
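A detail worth knowing before reading the diff below: a btrfs directory's i_size is not a byte count of its contents but the total length of the names stored in it, and each name is stored twice (once in the hash-keyed DIR_ITEM, once in the sequentially keyed DIR_INDEX). That is where the factor of two comes from when an entry is removed:

    /* Removing a directory entry shrinks i_size by twice the name length
     * (taken from the hunk below; dir is a struct btrfs_inode *). */
    btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);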
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 59ec9b2ef508..3577ee142888 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4470,9 +4470,9 @@ fscrypt_free: } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct inode *dir, struct dentry *dentry) + struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; @@ -4481,10 +4481,10 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, u64 index; int ret; u64 objectid; - u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + u64 dir_ino = btrfs_ino(dir); struct fscrypt_name fname; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret) return ret; @@ -4557,17 +4557,17 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } } - ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - fname.disk_name.len * 2); - inode_inc_iversion(dir); - dir->i_mtime = current_time(dir); - dir->i_ctime = dir->i_mtime; - ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); + inode_inc_iversion(&dir->vfs_inode); + dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); + dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; + ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -4758,7 +4758,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, dir); - ret = btrfs_unlink_subvol(trans, &dir->vfs_inode, dentry); + ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4862,7 +4862,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, dir, dentry); + err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } @@ -9208,7 +9208,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), @@ -9223,7 +9223,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), @@ -9470,7 +9470,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = 
btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), @@ -9488,7 +9488,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), -- cgit From d1de429bcedac78d047c545d748c7b45fe16d56d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_inode_by_name The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3577ee142888..75a0e2350cdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5565,12 +5565,12 @@ no_delete: * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ -static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, +static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname; @@ -5578,13 +5578,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, if (!path) return -ENOMEM; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret) goto out; /* This needs to handle no-key deletions later on */ - di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; @@ -5597,7 +5597,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, fname.disk_name.name, btrfs_ino(BTRFS_I(dir)), + __func__, fname.disk_name.name, btrfs_ino(dir), location->objectid, location->type, location->offset); } if (!ret) @@ -5881,7 +5881,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); + ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); -- cgit From 3c1b1c4c0e874dc9f2cf2faa231d39c4ca0d5888 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to fixup_tree_root_location The function is for internal interfaces so we should use the btrfs_inode. 
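For context on fixup_tree_root_location(): when a directory entry points at a subvolume, the lookup has to hop from the parent root into the child root, much like crossing a mount point. The check sketched here mirrors the diff that follows; the parent root must hold a ROOT_REF item recording the child subvolume, keyed against this directory and name:

    struct btrfs_key key;

    key.objectid = dir->root->root_key.objectid;  /* parent subvolume id */
    key.type = BTRFS_ROOT_REF_KEY;
    key.offset = location->objectid;              /* child subvolume id */
    /* ... search the root tree, then validate the back-reference: */
    if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
        btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
            goto out;   /* stale or mismatched reference */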
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 75a0e2350cdb..a14a2b3a928b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5614,7 +5614,7 @@ out: * is kind of like crossing a mount point. */ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, - struct inode *dir, + struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root) @@ -5628,7 +5628,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, int err = 0; struct fscrypt_name fname; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); if (ret) return ret; @@ -5639,7 +5639,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.objectid = dir->root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5652,7 +5652,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; @@ -5902,7 +5902,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - ret = fixup_tree_root_location(fs_info, dir, dentry, + ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, &location, &sub_root); if (ret < 0) { if (ret != -ENOENT) -- cgit From 4c45a4f4de1b7083800954afa4821c67df157967 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to inode_tree_add The function is for internal interfaces so we should use the btrfs_inode. 
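inode_tree_add(), converted next, maintains a per-subvolume cache of in-memory inodes: a red-black tree hanging off the root, keyed by inode number and protected by root->inode_lock. A simplified insertion walk (the real code also handles an already-cached inode and entries being evicted concurrently):

    struct rb_node **p = &root->inode_tree.rb_node;
    struct rb_node *parent = NULL;

    spin_lock(&root->inode_lock);
    while (*p) {
            struct btrfs_inode *entry;

            parent = *p;
            entry = rb_entry(parent, struct btrfs_inode, rb_node);
            if (ino < btrfs_ino(entry))
                    p = &parent->rb_left;
            else
                    p = &parent->rb_right;
    }
    rb_link_node(&inode->rb_node, parent, p);
    rb_insert_color(&inode->rb_node, &root->inode_tree);
    spin_unlock(&root->inode_lock);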
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a14a2b3a928b..b1a4f53cc2c4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5680,16 +5680,16 @@ out: return err; } -static void inode_tree_add(struct inode *inode) +static void inode_tree_add(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; - struct rb_node *new = &BTRFS_I(inode)->rb_node; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct rb_node *new = &inode->rb_node; + u64 ino = btrfs_ino(inode); - if (inode_unhashed(inode)) + if (inode_unhashed(&inode->vfs_inode)) return; parent = NULL; spin_lock(&root->inode_lock); @@ -5801,7 +5801,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, ret = btrfs_read_locked_inode(inode, path); if (!ret) { - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); unlock_new_inode(inode); } else { iget_failed(inode); @@ -6568,7 +6568,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); -- cgit From 7a0443f031a68188a38e0845686747d4ccd2c16d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_inherit_iflags The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1a4f53cc2c4..a242417a7e18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6345,27 +6345,27 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) * * Currently only the compression flags and the cow flags are inherited. */ -static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) { unsigned int flags; - flags = BTRFS_I(dir)->flags; + flags = dir->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags |= BTRFS_INODE_NOCOMPRESS; } else if (flags & BTRFS_INODE_COMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; } if (flags & BTRFS_INODE_NODATACOW) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + inode->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->vfs_inode.i_mode)) + inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(inode); + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6424,7 +6424,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * change it now without compatibility issues. 
*/ if (!args->subvol) - btrfs_inherit_iflags(inode, dir); + btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) -- cgit From 99a81a444448ef6cde906500dbdf26aa5961e291 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:37:50 +0200 Subject: btrfs: switch async_chunk::inode to btrfs_inode The async_chunk::inode structure is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a242417a7e18..d2409d8a147a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -520,7 +520,7 @@ struct async_extent { }; struct async_chunk { - struct inode *inode; + struct btrfs_inode *inode; struct page *locked_page; u64 start; u64 end; @@ -648,7 +648,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, */ static noinline int compress_file_range(struct async_chunk *async_chunk) { - struct inode *inode = async_chunk->inode; + struct inode *inode = &async_chunk->inode->vfs_inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; @@ -1113,7 +1113,7 @@ out_free: */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; @@ -1494,7 +1494,7 @@ static noinline void async_cow_start(struct btrfs_work *work) compressed_extents = compress_file_range(async_chunk); if (compressed_extents == 0) { - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); async_chunk->inode = NULL; } } @@ -1534,7 +1534,7 @@ static noinline void async_cow_free(struct btrfs_work *work) async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1602,7 +1602,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, */ ihold(&inode->vfs_inode); async_chunk[i].async_cow = ctx; - async_chunk[i].inode = &inode->vfs_inode; + async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; -- cgit From 99a01bd6388e52dabb56015b939c1e9b26b0196e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:47:16 +0200 Subject: btrfs: use btrfs_inode inside compress_file_range The function is mostly using internal interfaces so we should use the btrfs_inode. 
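Two details in compress_file_range() worth spelling out before the diff: the compression algorithm is chosen with a clear precedence (an explicit defrag request wins over the per-inode property, which wins over the mount-wide default), and the type and level travel packed into one integer. A sketch of both, with the default taken from fs_info as an assumption:

    int compress_type = fs_info->compress_type;     /* mount default, assumed */

    if (inode->defrag_compress)
            compress_type = inode->defrag_compress;
    else if (inode->prop_compress)
            compress_type = inode->prop_compress;

    /* Low 4 bits select the algorithm, the level is shifted above them,
     * matching the btrfs_compress_pages() call in the diff below: */
    int type_level = compress_type | (fs_info->compress_level << 4);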
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2409d8a147a..28725dd76db9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -648,8 +648,8 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, */ static noinline int compress_file_range(struct async_chunk *async_chunk) { - struct inode *inode = &async_chunk->inode->vfs_inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = async_chunk->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -666,8 +666,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) int compressed_extents = 0; int redirty = 0; - inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, - SZ_16K); + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* * We need to save i_size before now because it could change in between @@ -679,7 +678,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) * does that for us. */ barrier(); - i_size = i_size_read(inode); + i_size = i_size_read(&inode->vfs_inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: @@ -708,7 +707,7 @@ again: * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; /* @@ -732,7 +731,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (inode_need_compress(inode, start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -741,10 +740,10 @@ again: goto cont; } - if (BTRFS_I(inode)->defrag_compress) - compress_type = BTRFS_I(inode)->defrag_compress; - else if (BTRFS_I(inode)->prop_compress) - compress_type = BTRFS_I(inode)->prop_compress; + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; /* * we need to call clear_page_dirty_for_io on each @@ -759,14 +758,14 @@ again: * has moved, the end is the original one. */ if (!redirty) { - extent_range_clear_dirty_for_io(inode, start, end); + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); redirty = 1; } /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), - inode->i_mapping, start, + inode->vfs_inode.i_mapping, start, pages, &nr_pages, &total_in, @@ -795,12 +794,12 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, 0, BTRFS_COMPRESS_NONE, NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, false); @@ -823,7 +822,7 @@ cont: * our outstanding extent for clearing delalloc for this * range. 
*/ - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + extent_clear_unlock_delalloc(inode, start, end, NULL, clear_flags, PAGE_UNLOCK | @@ -898,8 +897,8 @@ cont: /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(BTRFS_I(inode)->prop_compress)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + !(inode->prop_compress)) { + inode->flags |= BTRFS_INODE_NOCOMPRESS; } } cleanup_and_bail_uncompressed: @@ -917,7 +916,7 @@ cleanup_and_bail_uncompressed: } if (redirty) - extent_range_redirty_for_io(inode, start, end); + extent_range_redirty_for_io(&inode->vfs_inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); compressed_extents++; -- cgit From 5fc24314c89491cb2f53583d5e513731d58c7699 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:47:16 +0200 Subject: btrfs: use btrfs_inode inside btrfs_verify_data_csum The function is mostly using internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 28725dd76db9..d3e5a5ff2d5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3529,10 +3529,10 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; @@ -3545,7 +3545,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, if (bbio->csum == NULL) return 0; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) return 0; if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) @@ -3569,7 +3569,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = btrfs_check_data_csum(BTRFS_I(inode), bbio, bio_offset, page, pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; -- cgit From e55cf7ca85e323028774feeb117ad94358a78070 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:53:04 +0200 Subject: btrfs: pass btrfs_inode to btrfs_add_delayed_iput The function is for internal interfaces so we should use the btrfs_inode. 
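Why delayed iputs exist at all: dropping the very last reference triggers eviction, which can flush delalloc data and wait on transactions; in contexts that already hold such resources (for example the writepage fixup worker mentioned in the diff below) that would deadlock. So the last reference is parked on a list for the cleaner kthread to drop later. The core of the helper, mirroring the rewritten function in the diff:

    if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
            return;         /* not the last reference: plain decrement */

    atomic_inc(&fs_info->nr_delayed_iputs);
    spin_lock(&fs_info->delayed_iput_lock);
    ASSERT(list_empty(&inode->delayed_iput));
    list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
    spin_unlock(&fs_info->delayed_iput_lock);
    if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
            wake_up_process(fs_info->cleaner_kthread);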
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/free-space-cache.c | 4 ++-- fs/btrfs/inode.c | 19 +++++++++---------- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/tree-log.c | 24 ++++++++++++------------ 7 files changed, 28 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9e31dc8b0285..195c09e20609 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -501,7 +501,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); -void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_add_delayed_iput(struct btrfs_inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05768f7f7872..859a41624c31 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3228,7 +3228,7 @@ retry: if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) mapping->writeback_index = done_index; - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 627bd6120368..0d250d052487 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -260,7 +260,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); goto out; } clear_nlink(inode); @@ -274,7 +274,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for the lookup ref */ - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3e5a5ff2d5c..aec1b232a71c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1493,7 +1493,7 @@ static noinline void async_cow_start(struct btrfs_work *work) compressed_extents = compress_file_range(async_chunk); if (compressed_extents == 0) { - btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); + btrfs_add_delayed_iput(async_chunk->inode); async_chunk->inode = NULL; } } @@ -1533,7 +1533,7 @@ static noinline void async_cow_free(struct btrfs_work *work) async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) - btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); + btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -3016,7 +3016,7 @@ out_page: * that could need flushing space. Recursing back to fixup worker would * deadlock. */ - btrfs_add_delayed_iput(&inode->vfs_inode); + btrfs_add_delayed_iput(inode); } /* @@ -3590,18 +3590,17 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. 
*/ -void btrfs_add_delayed_iput(struct inode *inode) +void btrfs_add_delayed_iput(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (atomic_add_unless(&inode->i_count, -1, 1)) + if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); - ASSERT(list_empty(&binode->delayed_iput)); - list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + ASSERT(list_empty(&inode->delayed_iput)); + list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); @@ -9661,7 +9660,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, &work->work); } else { ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret || wbc->nr_to_write <= 0) goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8fda1949b71b..4bed0839b640 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -504,7 +504,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) - btrfs_add_delayed_iput(entry->inode); + btrfs_add_delayed_iput(BTRFS_I(entry->inode)); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1440cb332a5a..8914aa920bb7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1117,7 +1117,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, inode = find_next_inode(root, key.objectid); first = 0; } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); inode = find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { @@ -1181,7 +1181,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (dirty) btrfs_mark_buffer_dirty(leaf); if (inode) - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b6e99ef99679..f7b1bb9c63e4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5473,7 +5473,7 @@ again: } if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); break; } @@ -5482,7 +5482,7 @@ again: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5676,11 +5676,11 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * so that the log ends up with the new name and without the old name. 
 */ if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return 0; } - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5755,7 +5755,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) break; continue; @@ -5772,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * that, we can avoid doing it again. */ if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); continue; } @@ -5784,7 +5784,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. */ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) break; } @@ -6294,7 +6294,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); continue; } @@ -6307,7 +6307,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); if (ret) break; @@ -6768,7 +6768,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); continue; } @@ -6778,7 +6778,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); if (ret) goto out; } @@ -6823,7 +6823,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) return ret; -- cgit
From 2942a50dea74126bf3395b3060a808fb046136fc Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 7 Nov 2022 15:32:29 +0800
Subject: btrfs: raid56: introduce btrfs_raid_bio::error_bitmap

Currently btrfs raid56 uses btrfs_raid_bio::faila and failb to indicate which stripe(s) had IO errors. But that has some problems:

- If one sector fails its csum check, the whole stripe containing the corruption will be marked as having an error. This can reduce the chance of a successful recovery, like this:

           0    4K   8K
    Data 1 |XX |    |
    Data 2 |   | XX |
    Parity |   |    |

  In the above case, 0~4K in data 1 should be recovered using data 2 and parity, while 4K~8K in data 2 should be recovered using data 1 and parity. Currently, if we trigger a read on 0~4K of data 1, we will also recover 4K~8K of data 1 using the corrupted data 2 and parity, causing a wrong result in the rbio cache.

- Harder to expand for a future M-N scheme, as we're limited to just faila/b, i.e. two corruptions.

- Harder to expand to handle extra csum errors. This can be problematic if we start to do csum verification.
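As a rough sketch of the difference between the two models (the faila/failb fields and their comments are quoted from raid56.h; the set_bit() call mirrors the new code in the diff below):

	/* Old model: at most two failed stripe numbers per rbio. */
	s8 faila;	/* first bad stripe, -1 means no corruption */
	s8 failb;	/* second bad stripe (for RAID6 use) */

	/* New model: one bit per sector, so a single corrupted sector
	 * no longer taints its whole stripe. */
	set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
		rbio->error_bitmap);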
This patch will introduce an extra @error_bitmap, where one bit represents an error that happened in the corresponding sector. The choice to introduce a new error bitmap, rather than reusing sector_ptr, is to avoid an extra search between the rbio::stripe_sectors[] and rbio::bio_sectors[] arrays. Since we can submit bios using sectors from both arrays, doing a proper search on both arrays would be more complex. Although the new bitmap will take extra memory, later we can remove things like @error and faila/b to save some memory.

Currently the new error bitmap and the faila/b mechanism coexist; the error bitmap is only updated at endio time and at the recovery entrance.

Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/raid56.h | 11 +++++++ 2 files changed, 103 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 629237102189..e14968a37c08 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -76,6 +76,7 @@ static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { + bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); kfree(rbio->bio_sectors); kfree(rbio->stripe_sectors); @@ -950,9 +951,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || - !rbio->finish_pointers) { + !rbio->finish_pointers || !rbio->error_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); @@ -1044,8 +1046,11 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ - if (!stripe->dev->bdev) + if (!stripe->dev->bdev) { + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); return fail_rbio_index(rbio, stripe_nr); + } /* see if we can add this page onto our existing bio */ if (last) { @@ -1209,6 +1214,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, * write. */ atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); rbio->faila = -1; rbio->failb = -1; @@ -1332,6 +1338,40 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, return -1; } +static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + int total_nr_sector = offset >> fs_info->sectorsize_bits; + + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); + + bitmap_set(rbio->error_bitmap, total_nr_sector, + bio->bi_iter.bi_size >> fs_info->sectorsize_bits); + + /* + * Special handling for raid56_alloc_missing_rbio() used by + * scrub/replace. Unlike call path in raid56_parity_recover(), they + * pass an empty bio here. Thus we have to find out the missing device + * and mark the stripe error instead. 
+ */ + if (bio->bi_iter.bi_size == 0) { + bool found_missing = false; + int stripe_nr; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { + found_missing = true; + bitmap_set(rbio->error_bitmap, + stripe_nr * rbio->stripe_nsectors, + rbio->stripe_nsectors); + } + } + ASSERT(found_missing); + } +} + /* * returns -EIO if we had too many failures */ @@ -1423,14 +1463,49 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } +static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + struct bio_vec *bv = bio_first_bvec_all(bio); + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector; + + sector = &rbio->stripe_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + sector = &rbio->bio_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + } + ASSERT(i < rbio->nr_sectors); + return i; +} + +static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) + bio_size += bvec->bv_len; + + bitmap_set(rbio->error_bitmap, total_sector_nr, + bio_size >> rbio->bioc->fs_info->sectorsize_bits); +} + static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_status) + if (bio->bi_status) { fail_bio_stripe(rbio, bio); - else + rbio_update_error_bitmap(rbio, bio); + } else { set_bio_pages_uptodate(rbio, bio); + } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -1863,10 +1938,10 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; if (rbio->faila == stripe || rbio->failb == stripe) { - atomic_inc(&rbio->error); /* Skip the current stripe. */ ASSERT(sectornr == 0); total_sector_nr += rbio->stripe_nsectors - 1; + atomic_inc(&rbio->error); continue; } sector = rbio_stripe_sector(rbio, stripe, sectornr); @@ -1891,9 +1966,10 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) /* * Either we're doing recover for a read failure or degraded write, - * caller should have set faila/b correctly. + * caller should have set faila/b and error bitmap correctly. 
 */ ASSERT(rbio->faila >= 0 || rbio->failb >= 0); + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); /* @@ -1978,6 +2054,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->operation = BTRFS_RBIO_READ_REBUILD; rbio_add_bio(rbio, bio); + set_rbio_range_error(rbio, bio); + rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn(fs_info, @@ -2038,8 +2116,10 @@ static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; blk_status_t err = bio->bi_status; - if (err) + if (err) { fail_bio_stripe(rbio, bio); + rbio_update_error_bitmap(rbio, bio); + } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); @@ -2117,6 +2197,7 @@ write: spin_unlock_irq(&rbio->bio_list_lock); atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); index_rbio_pages(rbio); @@ -2328,6 +2409,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) } atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); @@ -2533,6 +2615,8 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + ret = scrub_assemble_read_bios(rbio, &bio_list); if (ret < 0) goto cleanup; @@ -2612,6 +2696,7 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) */ ASSERT(!bio->bi_iter.bi_size); + set_rbio_range_error(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn_rl(fs_info, diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 9fae97b7a2a5..e38da4fa76d6 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -126,6 +126,17 @@ struct btrfs_raid_bio { /* Allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; + + /* + * The bitmap recording where IO errors happened. + * Each bit is corresponding to one sector in either bio_sectors[] or + * stripe_sectors[] array. + * + * The reason we don't use another bit in sector_ptr is, we have two + * arrays of sectors, and a lot of IO can use sectors in both arrays. + * Thus making it much harder to iterate. + */ + unsigned long *error_bitmap; }; /* -- cgit
From 75b47033296595efb208cc563cbb8cf4fb7c3ebc Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 7 Nov 2022 15:32:30 +0800
Subject: btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap

Since we have rbio::error_bitmap to indicate exactly where the errors are (including read errors and csum mismatch errors), we can make the recovery path more accurate. For example:

           0        32K       64K
    Data 1 |XXXXXXXX|         |
    Data 2 |        |XXXXXXXXX|
    Parity |        |         |

1) Get a csum mismatch when reading data 1 [0, 32K).

2) Mark the corresponding range as in error. The old code will mark the whole data 1 stripe as in error, while the new code will only mark data 1 [0, 32K) as in error.

3) Recovery path. The old code will recover data 1 [0, 64K), all using data 2 and parity. This means data 1 [32K, 64K) will be corrupted data, as data 2 [32K, 64K) is already corrupted. The new code will only recover data 1 [0, 32K), as only that range has an error so far.

This new behavior can avoid populating the rbio cache with incorrect data.
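As a rough sketch, the per-vertical-stripe decision enabled by this change looks like the following (helper and field names are taken from the diff below; the loop is condensed for illustration):

	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		if (!found_errors)
			continue;	/* this vertical stripe is clean */
		if (found_errors > rbio->bioc->max_errors)
			return -EIO;	/* beyond what the parity can rebuild */
		/* otherwise rebuild only this vertical stripe */
	}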
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 279 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 193 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e14968a37c08..fd67ba4648c9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1013,6 +1013,36 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) return 0; } +/* + * Return the total numer of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * number of the errors. + */ +static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) +{ + int stripe_nr; + int found_errors = 0; + + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; + + if (test_bit(total_sector_nr, rbio->error_bitmap)) { + found_errors++; + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } + } + return found_errors; +} + /* * Add a single sector @sector into our list of bios for IO. * @@ -1740,14 +1770,15 @@ fail: * @*pointers are the pre-allocated pointers by the caller, so we don't * need to allocate/free the pointers again and again. */ -static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - void **pointers, void **unmap_array) +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector; const u32 sectorsize = fs_info->sectorsize; - const int faila = rbio->faila; - const int failb = rbio->failb; + int found_errors; + int faila; + int failb; int stripe_nr; /* @@ -1756,7 +1787,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, */ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && !test_bit(sector_nr, &rbio->dbitmap)) - return; + return 0; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the veritical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. + */ + if (!found_errors) + return 0; + + if (found_errors > rbio->bioc->max_errors) + return -EIO; /* * Setup our array of pointers with sectors from each stripe @@ -1766,12 +1809,11 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, */ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* - * If we're rebuilding a read, we have to use - * pages from the bio list + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe_nr == faila || stripe_nr == failb)) { + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1859,18 +1901,19 @@ pstripe: * Especially if we determine to cache the rbio, we need to * have at least all data sectors uptodate. 
*/ - if (rbio->faila >= 0) { - sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr); + if (faila >= 0) { + sector = rbio_stripe_sector(rbio, faila, sector_nr); sector->uptodate = 1; } - if (rbio->failb >= 0) { - sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr); + if (failb >= 0) { + sector = rbio_stripe_sector(rbio, failb, sector_nr); sector->uptodate = 1; } cleanup: for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) kunmap_local(unmap_array[stripe_nr]); + return 0; } static int recover_sectors(struct btrfs_raid_bio *rbio) @@ -1893,10 +1936,6 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) goto out; } - /* Make sure faila and fail b are in order. */ - if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb) - swap(rbio->faila, rbio->failb); - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { spin_lock_irq(&rbio->bio_list_lock); @@ -1906,8 +1945,11 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) - recover_vertical(rbio, sectornr, pointers, unmap_array); + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + ret = recover_vertical(rbio, sectornr, pointers, unmap_array); + if (ret < 0) + break; + } out: kfree(pointers); @@ -1937,13 +1979,21 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, int sectornr = total_sector_nr % rbio->stripe_nsectors; struct sector_ptr *sector; - if (rbio->faila == stripe || rbio->failb == stripe) { - /* Skip the current stripe. */ - ASSERT(sectornr == 0); - total_sector_nr += rbio->stripe_nsectors - 1; - atomic_inc(&rbio->error); + /* + * Skip the range which has error. It can be a range which is + * marked error (for csum mismatch), or it can be a missing + * device. + */ + if (!rbio->bioc->stripes[stripe].dev->bdev || + test_bit(total_sector_nr, rbio->error_bitmap)) { + /* + * Also set the error bit for missing device, which + * may not yet have its error bit set. + */ + set_bit(total_sector_nr, rbio->error_bitmap); continue; } + sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); @@ -1966,9 +2016,8 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) /* * Either we're doing recover for a read failure or degraded write, - * caller should have set faila/b and error bitmap correctly. + * caller should have set error bitmap correctly. */ - ASSERT(rbio->faila >= 0 || rbio->failb >= 0); ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); @@ -1992,12 +2041,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) submit_read_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - /* We have more errors than our tolerance during the read. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - ret = -EIO; - goto out; - } - ret = recover_sectors(rbio); out: @@ -2032,6 +2075,51 @@ static void recover_rbio_work_locked(struct work_struct *work) rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } +static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +{ + bool found = false; + int sector_nr; + + /* + * This is for RAID6 extra recovery tries, thus mirror number should + * be large than 2. + * Mirror 1 means read from data stripes. Mirror 2 means rebuild using + * RAID5 methods. 
+ */ + ASSERT(mirror_num > 2); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + int faila; + int failb; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + /* This vertical stripe doesn't have errors. */ + if (!found_errors) + continue; + + /* + * If we found errors, there should be only one error marked + * by previous set_rbio_range_error(). + */ + ASSERT(found_errors == 1); + found = true; + + /* Now select another stripe to mark as error. */ + failb = rbio->real_stripes - (mirror_num - 1); + if (failb <= faila) + failb--; + + /* Set the extra bit in error bitmap. */ + if (failb >= 0) + set_bit(failb * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + } + + /* We should found at least one vertical stripe with error.*/ + ASSERT(found); +} + /* * the main entry point for reads from the higher layers. This * is really only called when the normal read path had a failure, @@ -2074,11 +2162,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, * for 'mirror_num > 2', select a stripe to fail on every retry. */ if (mirror_num > 2) { - /* - * 'mirror == 3' is to fail the p stripe and - * reconstruct from the q stripe. 'mirror > 3' is to - * fail a data stripe and reconstruct from p+q stripe. - */ + set_rbio_raid6_extra_error(rbio, mirror_num); rbio->failb = rbio->real_stripes - (mirror_num - 1); ASSERT(rbio->failb > 0); if (rbio->failb <= rbio->faila) @@ -2507,48 +2591,85 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - int dfail = 0, failp = -1; + void **pointers = NULL; + void **unmap_array = NULL; + int sector_nr; int ret; - /* No error case should be already handled by the caller. */ - ASSERT(rbio->faila >= 0 || rbio->failb >= 0); + /* + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. + */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int dfail = 0, failp = -1; + int faila; + int failb; + int found_errors; - if (is_data_stripe(rbio, rbio->failb)) - dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + if (found_errors == 0) + continue; - /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. - * (In the case of RAID5, we can not repair anything) - */ - if (dfail > rbio->bioc->max_errors - 1) - return -EIO; + /* We should have at least one error here. */ + ASSERT(faila >= 0 || failb >= 0); - /* - * If all data is good, only parity is correctly, just - * repair the parity. 
- */ - if (dfail == 0) - return 0; + if (is_data_stripe(rbio, faila)) + dfail++; + else if (is_parity_stripe(faila)) + failp = faila; - /* - * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. - */ - if (failp != rbio->scrubp) - return -EIO; + if (is_data_stripe(rbio, failb)) + dfail++; + else if (is_parity_stripe(failb)) + failp = failb; + /* + * Because we can not use a scrubbing parity to repair the + * data, so the capability of the repair is declined. (In the + * case of RAID5, we can not repair anything.) + */ + if (dfail > rbio->bioc->max_errors - 1) { + ret = -EIO; + goto out; + } + /* + * If all data is good, only parity is correctly, just repair + * the parity, no need to recover data stripes. + */ + if (dfail == 0) + continue; - /* We have some corrupted sectors, need to repair them. */ - ret = recover_sectors(rbio); + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity is + * scrubbing parity, luckily, use the other one to repair the + * data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) { + ret = -EIO; + goto out; + } + + ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); + if (ret < 0) + goto out; + } +out: + kfree(pointers); + kfree(unmap_array); return ret; } @@ -2624,25 +2745,11 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) submit_read_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - ret = -EIO; - goto cleanup; - } - /* - * No error during read, can finish the scrub and need to verify the - * P/Q sectors; - */ - if (atomic_read(&rbio->error) == 0) { - need_check = true; - goto finish; - } - - /* We have some failures, need to recover the failed sectors first. */ + /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) goto cleanup; -finish: /* * We have every sector properly prepared. Can finish the scrub * and writeback the good content. -- cgit From ad3daf1c3f5bcbba49a302d8d0e46467556bf6f3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 7 Nov 2022 15:32:31 +0800 Subject: btrfs: raid56: remove the old error tracking system Since all the recovery paths have been migrated to the new error bitmap based system, we can remove the old stripe number based system. This cleanup involves one behavior change: - Rebuild rbio can no longer be merged Previously a rebuild rbio (caused by retry after data csum mismatch) can be merged, if the error happens in the same stripe. But with the new error bitmap based solution, it's much harder to compare error bitmaps. So here we just don't merge rebuild rbio at all. This may introduce some performance impact at extreme corner cases, but we're willing to take it. Other than that, this patch will cleanup the following members: - rbio::faila - rbio::failb They will be replaced by per-vertical stripe check, which is more accurate. - rbio::error It will be replace by per-vertical stripe error bitmap check. - Allow get_rbio_vertical_errors() to accept NULL pointers for @faila and @failb Some call sites only want to check if we have errors beyond the tolerance. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 234 +++++++++++++----------------------------------------- fs/btrfs/raid56.h | 8 -- 2 files changed, 53 insertions(+), 189 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fd67ba4648c9..11be5d0a7eab 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -66,8 +66,6 @@ struct sector_ptr { static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); @@ -588,28 +586,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + last->operation == BTRFS_RBIO_READ_REBUILD) return 0; - if (last->operation == BTRFS_RBIO_READ_REBUILD) { - int fa = last->faila; - int fb = last->failb; - int cur_fa = cur->faila; - int cur_fb = cur->failb; - - if (last->faila >= last->failb) { - fa = last->failb; - fb = last->faila; - } - - if (cur->faila >= cur->failb) { - cur_fa = cur->failb; - cur_fb = cur->faila; - } - - if (fa != cur_fa || fb != cur_fb) - return 0; - } return 1; } @@ -973,10 +953,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; - rbio->faila = -1; - rbio->failb = -1; refcount_set(&rbio->refs, 1); - atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); @@ -1025,19 +1002,28 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, int stripe_nr; int found_errors = 0; - ASSERT(faila && failb); - *faila = -1; - *failb = -1; + if (faila || failb) { + /* + * Both @faila and @failb should be valid pointers if any of + * them is specified. + */ + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + } for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; if (test_bit(total_sector_nr, rbio->error_bitmap)) { found_errors++; - if (*faila < 0) - *faila = stripe_nr; - else if (*failb < 0) - *failb = stripe_nr; + if (faila) { + /* Update faila and failb. */ + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } } } return found_errors; @@ -1077,9 +1063,17 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) { + int found_errors; + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, rbio->error_bitmap); - return fail_rbio_index(rbio, stripe_nr); + + /* Check if we have reached tolerance early. */ + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + NULL, NULL); + if (found_errors > rbio->bioc->max_errors) + return -EIO; + return 0; } /* see if we can add this page onto our existing bio */ @@ -1243,10 +1237,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, * Reset errors, as we may have errors inherited from from degraded * write. 
*/ - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - rbio->faila = -1; - rbio->failb = -1; /* * Start assembly. Make bios for everything from the higher layers (the @@ -1324,50 +1315,6 @@ error: return -EIO; } -/* - * helper to find the stripe number for a given bio. Used to figure out which - * stripe has failed. This expects the bio to correspond to a physical disk, - * so it looks up based on physical sector numbers. - */ -static int find_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 physical = bio->bi_iter.bi_sector; - int i; - struct btrfs_io_stripe *stripe; - - physical <<= 9; - - for (i = 0; i < rbio->bioc->num_stripes; i++) { - stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && - stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { - return i; - } - } - return -1; -} - -/* - * helper to find the stripe number for a given - * bio (before mapping). Used to figure out which stripe has - * failed. This looks up based on logical block numbers. - */ -static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - int i; - - for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bioc->raid_map[i]; - - if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) - return i; - } - return -1; -} - static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; @@ -1402,52 +1349,6 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } } -/* - * returns -EIO if we had too many failures - */ -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&rbio->bio_list_lock, flags); - - /* we already know this stripe is bad, move on */ - if (rbio->faila == failed || rbio->failb == failed) - goto out; - - if (rbio->faila == -1) { - /* first failure on this rbio */ - rbio->faila = failed; - atomic_inc(&rbio->error); - } else if (rbio->failb == -1) { - /* second failure on this rbio */ - rbio->failb = failed; - atomic_inc(&rbio->error); - } else { - ret = -EIO; - } -out: - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); - - return ret; -} - -/* - * helper to fail a stripe based on a physical disk - * bio. - */ -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - int failed = find_bio_stripe(rbio, bio); - - if (failed < 0) - return -EIO; - - return fail_rbio_index(rbio, failed); -} - /* * For subpage case, we can no longer set page Uptodate directly for * stripe_pages[], thus we need to locate the sector. @@ -1530,12 +1431,10 @@ static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_status) { - fail_bio_stripe(rbio, bio); + if (bio->bi_status) rbio_update_error_bitmap(rbio, bio); - } else { + else set_bio_pages_uptodate(rbio, bio); - } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -2021,12 +1920,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); - /* - * Reset error to 0, as we will later increase error for missing - * devices. - */ - atomic_set(&rbio->error, 0); - /* For recovery, we need to read all sectors including P/Q. 
*/ ret = alloc_rbio_pages(rbio); if (ret < 0) @@ -2144,30 +2037,13 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, set_rbio_range_error(rbio, bio); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn(fs_info, -"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", - __func__, bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bioc->map_type); - free_raid_bio(rbio); - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return; - } - /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num > 2) { + if (mirror_num > 2) set_rbio_raid6_extra_error(rbio, mirror_num); - rbio->failb = rbio->real_stripes - (mirror_num - 1); - ASSERT(rbio->failb > 0); - if (rbio->failb <= rbio->faila) - rbio->failb--; - } start_async_work(rbio, recover_rbio_work); } @@ -2179,7 +2055,6 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) int ret; bio_list_init(&bio_list); - atomic_set(&rbio->error, 0); ret = rmw_assemble_read_bios(rbio, &bio_list); if (ret < 0) @@ -2200,10 +2075,8 @@ static void raid_wait_write_end_io(struct bio *bio) struct btrfs_raid_bio *rbio = bio->bi_private; blk_status_t err = bio->bi_status; - if (err) { - fail_bio_stripe(rbio, bio); + if (err) rbio_update_error_bitmap(rbio, bio); - } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); @@ -2253,19 +2126,14 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) if (ret < 0) return ret; - atomic_set(&rbio->error, 0); index_rbio_pages(rbio); ret = rmw_read_and_wait(rbio); if (ret < 0) return ret; - /* Too many read errors, beyond our tolerance. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - return ret; - - /* Have read failures but under tolerance, needs recovery. */ - if (rbio->faila >= 0 || rbio->failb >= 0) { + /* We have read errors, try recovery path. */ + if (!bitmap_empty(rbio->error_bitmap, rbio->nr_sectors)) { ret = recover_rbio(rbio); if (ret < 0) return ret; @@ -2280,7 +2148,6 @@ write: set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); index_rbio_pages(rbio); @@ -2309,9 +2176,16 @@ write: submit_write_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - /* We have more errors than our tolerance during the read. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - ret = -EIO; + /* We may have more errors than our tolerance during the read. 
 */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } return ret; } @@ -2492,7 +2366,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ @@ -2726,6 +2599,7 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) { bool need_check = false; struct bio_list bio_list; + int sector_nr; int ret; struct bio *bio; @@ -2735,7 +2609,6 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) if (ret) goto cleanup; - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); ret = scrub_assemble_read_bios(rbio, &bio_list); @@ -2756,8 +2629,15 @@ */ ret = finish_parity_scrub(rbio, need_check); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - ret = -EIO; + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } return ret; cleanup: @@ -2804,14 +2684,6 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) ASSERT(!bio->bi_iter.bi_size); set_rbio_range_error(rbio, bio); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn_rl(fs_info, - "can not determine the failed stripe number for full stripe %llu", - bioc->raid_map[0]); - free_raid_bio(rbio); - return NULL; - } return rbio; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index e38da4fa76d6..a2e653e93fd8 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -74,12 +74,6 @@ struct btrfs_raid_bio { /* How many sectors there are for each stripe */ u8 stripe_nsectors; - /* First bad stripe, -1 means no corruption */ - s8 faila; - - /* Second bad stripe (for RAID6 use) */ - s8 failb; - /* Stripe number that we're scrubbing */ u8 scrubp; @@ -93,8 +87,6 @@ struct btrfs_raid_bio { atomic_t stripes_pending; - atomic_t error; - wait_queue_head_t io_wait; /* Bitmap to record which horizontal stripe has data */ -- cgit
From 3e09b5b2293f21e6e28929b6bbb73833678bfdd1 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Mon, 7 Nov 2022 17:30:21 +0100
Subject: btrfs: constify input buffer parameter in compression code

The input buffers passed down to compression must never be changed, so switch the type to u8, as it's a raw byte buffer, and make it const.
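As a hypothetical illustration (the snippet below is not part of the patch), the const qualifier turns an accidental modification of the compressed input into a compile error:

	int zlib_decompress(struct list_head *ws, const u8 *data_in,
			    struct page *dest_page, unsigned long start_byte,
			    size_t srclen, size_t destlen);

	/* Inside the decompressor, a stray write such as
	 *	data_in[0] = 0;
	 * now fails to compile instead of silently corrupting the
	 * compressed input. */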
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/compression.h | 8 ++++---- fs/btrfs/lzo.c | 2 +- fs/btrfs/zlib.c | 2 +- fs/btrfs/zstd.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 42e6dde2ad59..30adf430241e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -119,7 +119,7 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - unsigned char *data_in, struct page *dest_page, + const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { switch (type) { @@ -1232,7 +1232,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. * start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { struct list_head *workspace; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b961462399ae..6209d40a1e08 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -86,7 +86,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); @@ -150,7 +150,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); @@ -161,7 +161,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); @@ -171,7 +171,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 
e7b1ceffcd33..d5e78cbc8fbc 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -427,7 +427,7 @@ out: return ret; } -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c5275cb23837..01a13de11832 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -355,7 +355,7 @@ done: return ret; } -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 4575b3703e74..e34f1ab99d56 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -616,7 +616,7 @@ done: return ret; } -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { -- cgit
From bb21e30260a672172a26ee1626dc1463215cf18c Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Mon, 7 Nov 2022 23:07:17 +0800
Subject: btrfs: move device->name RCU allocation and assign to btrfs_alloc_device()

There is a repeating code section in the parent functions after calling btrfs_alloc_device(), as below:

	name = rcu_string_strdup(path, GFP_...);
	if (!name) {
		btrfs_free_device(device);
		return ERR_PTR(-ENOMEM);
	}
	rcu_assign_pointer(device->name, name);

Except in add_missing_dev(), for obvious reasons. This patch consolidates that repeating code into btrfs_alloc_device() itself so that the parent functions don't have to duplicate code. This consolidation also helps with reviewing issues regarding RCU lock violations with device->name.

The parent functions device_list_add() and add_missing_dev() use GFP_NOFS for the allocation, whereas the rest of the parent functions use GFP_KERNEL, so establish the NOFS allocation context using memalloc_nofs_save() in device_list_add(); add_missing_dev() is already doing it.
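After the change, the repeating section collapses into the single btrfs_alloc_device() call; callers that must avoid recursing into filesystem reclaim wrap it in a NOFS context, as device_list_add() does in the diff below (condensed sketch, surrounding declarations omitted):

	nofs_flag = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid,
				    disk_super->dev_item.uuid, path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(device))
		return device;	/* the name strdup failure is handled inside */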
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 10 +------- fs/btrfs/volumes.c | 68 +++++++++++++++++++++++--------------------------- fs/btrfs/volumes.h | 4 +-- 3 files changed, 34 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 84af2010fae2..9c4a8649a0f4 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -249,7 +249,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; - struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -293,19 +292,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - device = btrfs_alloc_device(NULL, &devid, NULL); + device = btrfs_alloc_device(NULL, &devid, NULL, device_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index acb97494bb52..49d6e79d4343 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -845,26 +845,23 @@ static noinline struct btrfs_device *device_list_add(const char *path, } if (!device) { + unsigned int nofs_flag; + if (fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, - disk_super->dev_item.uuid); + disk_super->dev_item.uuid, path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) { mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } - name = rcu_string_strdup(path, GFP_NOFS); - if (!name) { - btrfs_free_device(device); - mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(-ENOMEM); - } - rcu_assign_pointer(device->name, name); device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); @@ -997,30 +994,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { - struct rcu_string *name; + const char *dev_path = NULL; + + /* + * This is ok to do without RCU read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + if (orig_dev->name) + dev_path = orig_dev->name->str; device = btrfs_alloc_device(NULL, &orig_dev->devid, - orig_dev->uuid); + orig_dev->uuid, dev_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - /* - * This is ok to do without rcu read locked because we hold the - * uuid mutex so nothing we touch in here is going to disappear. 
- */ - if (orig_dev->name) { - name = rcu_string_strdup(orig_dev->name->str, - GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); - } - if (orig_dev->zone_info) { struct btrfs_zoned_device_info *zone_info; @@ -2604,7 +2593,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_device *device; struct block_device *bdev; struct super_block *sb = fs_info->sb; - struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices; u64 orig_super_total_bytes; @@ -2645,20 +2633,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } rcu_read_unlock(); - device = btrfs_alloc_device(fs_info, NULL, NULL); + device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - ret = -ENOMEM; - goto error_free_device; - } - rcu_assign_pointer(device->name, name); - device->fs_info = fs_info; device->bdev = bdev; ret = lookup_bdev(device_path, &device->devt); @@ -6998,8 +6979,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * always do NOFS because we use it in a lot of other GFP_KERNEL safe * places. */ + nofs_flag = memalloc_nofs_save(); - device = btrfs_alloc_device(NULL, &devid, dev_uuid); + device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return device; @@ -7023,14 +7005,15 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * is generated. * @uuid: a pointer to UUID for this device. If NULL a new UUID * is generated. + * @path: a pointer to device path if available, NULL otherwise. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be * destroyed with btrfs_free_device. 
 */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid) + const u64 *devid, const u8 *uuid, + const char *path) { struct btrfs_device *dev; u64 tmp; @@ -7068,6 +7051,17 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); + if (path) { + struct rcu_string *name; + + name = rcu_string_strdup(path, GFP_KERNEL); + if (!name) { + btrfs_free_device(dev); + return ERR_PTR(-ENOMEM); + } + rcu_assign_pointer(dev->name, name); + } + return dev; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b05124e62412..f2a152937cd4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -649,8 +649,8 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, const char *path); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid); + const u64 *devid, const u8 *uuid, + const char *path); void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, -- cgit
From 789d6a3a876e32c23fc9633d5b372d02a5188f0e Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 14 Sep 2022 13:32:50 +0800
Subject: btrfs: concentrate all tree block parentness check parameters into one structure

There are several different tree block parentness check parameters used across several helpers:

- level Mandatory.

- transid In most cases it's mandatory, but there are several backref cases which skip this check.

- owner_root - first_key Utilized by most top-down tree search routines; otherwise they can be skipped.

Those four members are not always mandatory checks, and some of them share the same u64 type, which means that if some arguments get swapped the compiler will not catch it. Furthermore, if we're going to further expand the parentness check, we would need to modify quite a few helpers just to add one more parameter.
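To make the hazard concrete, here is a hypothetical mix-up (not from the patch) that the old read_tree_block() prototype accepts without any compiler diagnostic:

	/* Old prototype: owner_root and parent_transid are both u64. */
	eb = read_tree_block(fs_info, bytenr,
			     gen,	/* oops: transid passed as owner_root */
			     btrfs_header_owner(parent), /* and vice versa */
			     level - 1, &first_key);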
This patch will concentrate all these members into a structure called btrfs_tree_parent_check, and pass that structure for the following helpers: - btrfs_read_extent_buffer() - read_tree_block() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/backref.c | 15 ++++++++---- fs/btrfs/ctree.c | 28 +++++++++++++--------- fs/btrfs/disk-io.c | 63 ++++++++++++++++++++++++++++--------------------- fs/btrfs/disk-io.h | 36 ++++++++++++++++++++++++---- fs/btrfs/extent-tree.c | 12 ++++++---- fs/btrfs/print-tree.c | 14 ++++++----- fs/btrfs/qgroup.c | 18 ++++++++++---- fs/btrfs/relocation.c | 11 ++++++--- fs/btrfs/tree-log.c | 25 ++++++++++++++------ fs/btrfs/tree-mod-log.c | 9 +++++-- 10 files changed, 159 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 430974cf3b96..55c072ba6747 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -840,6 +840,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct rb_node *node; while ((node = rb_first_cached(&tree->root))) { + struct btrfs_tree_parent_check check = { 0 }; + ref = rb_entry(node, struct prelim_ref, rbnode); rb_erase_cached(node, &tree->root); @@ -847,8 +849,10 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, - ref->root_id, 0, ref->level - 1, NULL); + check.level = ref->level - 1; + check.owner_root = ref->root_id; + + eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); @@ -1591,10 +1595,13 @@ again: if (ref->count && ref->parent) { if (!ctx->ignore_extent_item_pos && !ref->inode_list && ref->level == 0) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(ctx->fs_info, ref->parent, 0, - 0, ref->level, NULL); + check.level = ref->level; + + eb = read_tree_block(ctx->fs_info, ref->parent, + &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0acd85111cdf..f75e398d7b71 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -857,19 +857,22 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - struct btrfs_key first_key; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); BUG_ON(level == 0); - btrfs_node_key_to_cpu(parent, &first_key, slot); + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); + check.owner_root = btrfs_header_owner(parent); + check.has_first_key = true; + btrfs_node_key_to_cpu(parent, &check.first_key, slot); + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), - btrfs_header_owner(parent), - btrfs_node_ptr_generation(parent, slot), - level - 1, &first_key); + &check); if (IS_ERR(eb)) return eb; if (!extent_buffer_uptodate(eb)) { @@ -1428,10 +1431,10 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; u64 gen; struct extent_buffer *tmp; - struct btrfs_key first_key; int ret; int parent_level; bool unlock_up; @@ -1440,7 +1443,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = 
btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); - btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); + check.has_first_key = true; + check.level = parent_level - 1; + check.transid = gen; + check.owner_root = root->root_key.objectid; /* * If we need to read an extent buffer from disk and we are holding locks @@ -1462,7 +1469,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * parents (shared tree blocks). */ if (btrfs_verify_level_key(tmp, - parent_level - 1, &first_key, gen)) { + parent_level - 1, &check.first_key, gen)) { free_extent_buffer(tmp); return -EUCLEAN; } @@ -1479,7 +1486,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_unlock_up_safe(p, level + 1); /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); + ret = btrfs_read_extent_buffer(tmp, &check); if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); @@ -1509,8 +1516,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, - gen, parent_level - 1, &first_key); + tmp = read_tree_block(fs_info, blocknr, &check); if (IS_ERR(tmp)) { btrfs_release_path(p); return PTR_ERR(tmp); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e602e0b4c25d..2f944a7c70d5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -259,13 +259,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. * - * @parent_transid: expected transid, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key of first slot, skip check if NULL + * @check: expected tree parentness check, see the comments of the + * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -275,16 +273,19 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, int mirror_num = 0; int failed_mirror = 0; + ASSERT(check); + io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); if (!ret) { - if (verify_parent_transid(io_tree, eb, - parent_transid, 0)) + if (verify_parent_transid(io_tree, eb, check->transid, 0)) ret = -EIO; - else if (btrfs_verify_level_key(eb, level, - first_key, parent_transid)) + else if (btrfs_verify_level_key(eb, check->level, + check->has_first_key ? + &check->first_key : NULL, + check->transid)) ret = -EUCLEAN; else break; @@ -936,28 +937,28 @@ struct extent_buffer *btrfs_find_create_tree_block( * Read tree block at logical address @bytenr and do variant basic but critical * verification. * - * @owner_root: the objectid of the root owner for this block. - * @parent_transid: expected transid of this tree block, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key in slot 0, skip check if NULL + * @check: expected tree parentness check, see comments of the + * structure for details. 
*/ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); + ASSERT(check); + + buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root, + check->level); if (IS_ERR(buf)) return buf; - ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); + ret = btrfs_read_extent_buffer(buf, check); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, owner_root)) { + if (btrfs_check_eb_owner(buf, check->owner_root)) { free_extent_buffer_stale(buf); return ERR_PTR(-EUCLEAN); } @@ -1373,6 +1374,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_key *key) { struct btrfs_root *root; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_fs_info *fs_info = tree_root->fs_info; u64 generation; int ret; @@ -1392,9 +1394,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); - root->node = read_tree_block(fs_info, - btrfs_root_bytenr(&root->root_item), - key->objectid, generation, level, NULL); + check.level = level; + check.transid = generation; + check.owner_root = key->objectid; + root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), + &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -2367,6 +2371,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { int ret; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); @@ -2382,10 +2387,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (!log_tree_root) return -ENOMEM; - log_tree_root->node = read_tree_block(fs_info, bytenr, - BTRFS_TREE_LOG_OBJECTID, - fs_info->generation + 1, level, - NULL); + check.level = level; + check.transid = fs_info->generation + 1; + check.owner_root = BTRFS_TREE_LOG_OBJECTID; + log_tree_root->node = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2863,10 +2868,14 @@ out: static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen, + .owner_root = root->root_key.objectid + }; int ret = 0; - root->node = read_tree_block(root->fs_info, bytenr, - root->root_key.objectid, gen, level, NULL); + root->node = read_tree_block(root->fs_info, bytenr, &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f2c507fd0e04..03fe4154ffb8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,35 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } +/* All the extra info needed to verify the parentness of a tree block. */ +struct btrfs_tree_parent_check { + /* + * The owner check against the tree block. + * + * Can be 0 to skip the owner check. 
+ */ + u64 owner_root; + + /* + * Expected transid, can be 0 to skip the check, but such skip + * should only be utilized for backref walk related code. + */ + u64 transid; + + /* + * The expected first key. + * + * This check can be skipped if @has_first_key is false, such skip + * can happen for cases where we don't have the parent node key, + * e.g. reading the tree root, doing backref walk. + */ + struct btrfs_key first_key; + bool has_first_key; + + /* The expected level. Should always be set. */ + u8 level; +}; + struct btrfs_device; struct btrfs_fs_devices; @@ -33,8 +62,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key); + struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, @@ -111,8 +139,8 @@ void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, - int level, struct btrfs_key *first_key); +int btrfs_read_extent_buffer(struct extent_buffer *buf, + struct btrfs_tree_parent_check *check); enum btrfs_wq_submit_cmd { WQ_SUBMIT_METADATA, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b037107678c8..10cc757a602b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5261,8 +5261,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; - struct btrfs_key first_key; struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; @@ -5284,7 +5284,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - btrfs_node_key_to_cpu(path->nodes[level], &first_key, + + check.level = level - 1; + check.transid = generation; + check.owner_root = root->root_key.objectid; + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); next = find_extent_buffer(fs_info, bytenr); @@ -5346,8 +5351,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, root->root_key.objectid, - generation, level - 1, &first_key); + next = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1a2350fd68be..1469aa55ad48 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -386,14 +386,16 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) if (!follow) return; for (i = 0; i < nr; i++) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { + .level = level - 1, + .transid = btrfs_node_ptr_generation(c, i), + .owner_root = btrfs_header_owner(c), + .has_first_key = true + }; struct extent_buffer *next; - btrfs_node_key_to_cpu(c, &first_key, i); - next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), -
btrfs_header_owner(c), - btrfs_node_ptr_generation(c, i), - level - 1, &first_key); + btrfs_node_key_to_cpu(c, &check.first_key, i); + next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check); if (IS_ERR(next)) continue; if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 24c013c61a94..e0522c6c0d67 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2339,7 +2339,13 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, } if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); + struct btrfs_tree_parent_check check = { + .has_first_key = false, + .transid = root_gen, + .level = root_level + }; + + ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) goto out; } @@ -4303,6 +4309,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct extent_buffer *subvol_eb) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; @@ -4351,10 +4358,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, blocks->swapped = swapped; spin_unlock(&blocks->lock); + check.level = block->level; + check.transid = block->reloc_generation; + check.has_first_key = true; + memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); + /* Read out reloc subtree root */ - reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0, - block->reloc_generation, block->level, - &block->first_key); + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); if (IS_ERR(reloc_eb)) { ret = PTR_ERR(reloc_eb); reloc_eb = NULL; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8914aa920bb7..56c8afa6f6d2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2610,10 +2610,14 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct tree_block *block) { + struct btrfs_tree_parent_check check = { + .level = block->level, + .owner_root = block->owner, + .transid = block->key.offset + }; struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, block->owner, - block->key.offset, block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { @@ -3426,9 +3430,10 @@ int add_data_references(struct reloc_control *rc, ULIST_ITER_INIT(&leaf_uiter); while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(ctx.fs_info, ref_node->val, 0, 0, 0, NULL); + eb = read_tree_block(ctx.fs_info, ref_node->val, &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7b1bb9c63e4..4d9f6803bfbe 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -341,7 +341,12 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. 
*/ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen + }; + + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; } @@ -2411,13 +2416,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; + struct btrfs_tree_parent_check check = { + .transid = gen, + .level = level + }; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; int i; int ret; - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; @@ -2597,7 +2606,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, int ret = 0; while (*level > 0) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { 0 }; cur = path->nodes[*level]; @@ -2609,7 +2618,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); + check.transid = ptr_gen; + check.level = *level - 1; + check.has_first_key = true; + btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, @@ -2628,8 +2640,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_extent_buffer(next, ptr_gen, - *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; } @@ -2657,7 +2668,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 3bea19698a10..779ad44d285f 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -821,10 +821,15 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) tm = tree_mod_log_search(fs_info, logical, time_seq); if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + struct btrfs_tree_parent_check check = { 0 }; + btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, root->root_key.objectid, - 0, level, NULL); + + check.level = level; + check.owner_root = root->root_key.objectid; + + old = read_tree_block(fs_info, logical, &check); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); -- cgit From 947a629988f191807d2d22ba63ae18259bb645c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 14 Sep 2022 13:32:51 +0800 Subject: btrfs: move tree block parentness check into validate_extent_buffer() [BACKGROUND] Although both btrfs metadata and data have their read time verification done at endio time (btrfs_validate_metadata_buffer() and btrfs_verify_data_csum()), metadata has extra verification, mostly parentness checks including first key/transid/owner_root/level, done at read_tree_block() and btrfs_read_extent_buffer(). On the other hand, all the data verification is done at endio context.
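As a rough sketch of that read-side flow, here is a condensation of the pre-patch btrfs_read_extent_buffer() (the code this patch removes, see the hunk below; locals, the mirror-retry loop and error paths are omitted, so this is not a literal kernel excerpt):

    /* Pre-patch: endio only verifies checksums, the parentness
     * checks run afterwards in btrfs_read_extent_buffer(). */
    ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
    if (!ret) {
            if (verify_parent_transid(io_tree, eb, check->transid, 0))
                    ret = -EIO;      /* transid mismatch */
            else if (btrfs_verify_level_key(eb, check->level,
                                            check->has_first_key ?
                                            &check->first_key : NULL,
                                            check->transid))
                    ret = -EUCLEAN;  /* level or first key mismatch */
    }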
[ENHANCEMENT] This patch will make a new union in btrfs_bio, taking the space of the old data checksums, thus it will not increase the memory usage. With that extra btrfs_tree_parent_check inside btrfs_bio, we can just pass the check parameter into read_extent_buffer_pages(), and before submitting the bio, we can copy the check structure into btrfs_bio. And finally at endio time, we can grab btrfs_bio::parent_check and pass it to validate_extent_buffer(), to move the remaining checks into it. This brings the following benefits: - Much simpler btrfs_read_extent_buffer() Now it only needs to iterate through all mirrors. - Simpler read-time transid check Previously we called verify_parent_transid() after reading out the extent buffer. Now the transid check is done inside the endio function, where no other code can modify the content. Thus no need to use the extent lock anymore. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 73 ++++++++++++++++++++++++++++++++++++++-------------- fs/btrfs/extent_io.c | 18 ++++++++++--- fs/btrfs/extent_io.h | 5 ++-- fs/btrfs/volumes.h | 25 +++++++++++++++--- 4 files changed, 93 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f944a7c70d5..559e7f3d727e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -266,7 +266,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *io_tree; int failed = 0; int ret; int num_copies = 0; @@ -275,21 +274,11 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, ASSERT(check); - io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); - if (!ret) { - if (verify_parent_transid(io_tree, eb, check->transid, 0))
- &check->first_key : NULL, - check->transid)) - ret = -EUCLEAN; - else - break; - } + ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + if (!ret) + break; num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); @@ -465,7 +454,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb) +static int validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -475,6 +465,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) const u8 *header_csum; int ret = 0; + ASSERT(check); + found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, @@ -513,6 +505,45 @@ static int validate_extent_buffer(struct extent_buffer *eb) goto out; } + if (found_level != check->level) { + ret = -EIO; + goto out; + } + if (unlikely(check->transid && + btrfs_header_generation(eb) != check->transid)) { + btrfs_err_rl(eb->fs_info, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, check->transid, + btrfs_header_generation(eb)); + ret = -EIO; + goto out; + } + if (check->has_first_key) { + struct btrfs_key *expect_key = &check->first_key; + struct btrfs_key found_key; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) { + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, check->transid, + expect_key->objectid, + expect_key->type, expect_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + ret = -EUCLEAN; + goto out; + } + } + if (check->owner_root) { + ret = btrfs_check_eb_owner(eb, check->owner_root); + if (ret < 0) + goto out; + } + /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just @@ -537,13 +568,15 @@ out: } static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror) + int mirror, struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct extent_buffer *eb; bool reads_done; int ret = 0; + ASSERT(check); + /* * We don't allow bio merge for subpage metadata read, so we should * only get one eb for each endio hook. 
@@ -567,7 +600,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, check); if (ret < 0) goto err; @@ -597,7 +630,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror); + return validate_subpage_buffer(page, start, end, mirror, + &bbio->parent_check); eb = (struct extent_buffer *)page->private; @@ -616,7 +650,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, &bbio->parent_check); err: if (ret) { /* @@ -774,6 +808,7 @@ void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int m blk_status_t ret; bio->bi_opf |= REQ_META; + bbio->is_metadata = 1; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { btrfs_submit_bio(fs_info, bio, mirror_num); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 859a41624c31..d257879edaba 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4934,7 +4934,8 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, - int mirror_num) + int mirror_num, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -4947,6 +4948,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); ASSERT(PagePrivate(page)); + ASSERT(check); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { @@ -4990,6 +4992,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) { free_extent_state(cached_state); @@ -5003,7 +5006,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, return ret; } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *check) { int i; struct page *page; @@ -5029,7 +5033,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) return -EIO; if (eb->fs_info->nodesize < PAGE_SIZE) - return read_extent_buffer_subpage(eb, wait, mirror_num); + return read_extent_buffer_subpage(eb, wait, mirror_num, check); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -5106,6 +5110,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) @@ -5841,6 +5846,11 @@ int try_release_extent_buffer(struct page *page) void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .has_first_key = 0, + .level = level, + .transid = gen + }; struct extent_buffer *eb; int ret; @@ -5853,7 +5863,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); if (ret < 0) 
free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 805e262811b4..a0bafc7f6c07 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -66,6 +66,7 @@ struct btrfs_inode; struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; +struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); @@ -170,8 +171,8 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, - int mirror_num); +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *parent_check); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f2a152937cd4..efa6a3d48cd8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -11,6 +11,7 @@ #include #include "async-thread.h" #include "messages.h" +#include "disk-io.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -397,7 +398,15 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); * Mostly for btrfs specific features like csum and mirror_num. */ struct btrfs_bio { - unsigned int mirror_num; + unsigned int mirror_num:7; + + /* + * Extra indicator for metadata bios. + * Some btrfs bios use pages without a mapping, thus + * we cannot rely on page->mapping->host to determine if + * it's a metadata bio. + */ + unsigned int is_metadata:1; struct bvec_iter iter; /* for direct I/O */ @@ -405,8 +414,16 @@ struct btrfs_bio { /* @device is for stripe IO submission. */ struct btrfs_device *device; - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + union { + /* For data checksum verification. */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + }; + + /* For metadata parentness verification. */ + struct btrfs_tree_parent_check parent_check; + }; /* End I/O information supplied to btrfs_bio_alloc */ btrfs_bio_end_io_t end_io; @@ -443,6 +460,8 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) { + if (bbio->is_metadata) + return; if (bbio->csum != bbio->csum_inline) { kfree(bbio->csum); bbio->csum = NULL; -- cgit From 2c8f5e8cdf0f77670b1a9f72156ad4e82ed323d1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:27 +0000 Subject: btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree We don't need to set the EXTENT_UPTODATE bit in an inode's io_tree to mark a range as uptodate, we rely on the pages themselves being uptodate - page reading is not triggered for already uptodate pages. Recently we removed most use of the EXTENT_UPTODATE for buffered IO with commit 52b029f42751 ("btrfs: remove unnecessary EXTENT_UPTODATE state in buffered I/O path"), but there were a few leftovers, namely when reading from holes and successfully finishing read repair. These leftovers are unnecessarily making an inode's tree larger and deeper, slowing down searches on it. So remove all the leftovers. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file.
Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 7 ------- fs/btrfs/extent_io.c | 19 +++---------------- 2 files changed, 3 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index cdee8c0854c8..18ab82f62611 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -216,13 +216,6 @@ static inline int set_extent_new(struct extent_io_tree *tree, u64 start, return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); } -static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, mask); -} - int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d257879edaba..33cc3a9a2b68 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -901,14 +901,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); const u32 sectorsize = inode->root->fs_info->sectorsize; - struct extent_state *cached = NULL; end_page_read(page, uptodate, offset, sectorsize); - if (uptodate) - set_extent_uptodate(&inode->io_tree, offset, - offset + sectorsize - 1, &cached, GFP_NOFS); - unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, - &cached); + unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); } static void submit_data_read_repair(struct inode *inode, @@ -1781,13 +1776,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - struct extent_state *cached = NULL; - iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); break; } @@ -1863,13 +1854,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - struct extent_state *cached = NULL; - memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 
1, - &cached, GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; -- cgit From 40daf3e095dbd1059d7e600d27e5bb987f563561 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:28 +0000 Subject: btrfs: add an early exit when searching for delalloc range for lseek/fiemap During fiemap and lseek (SEEK_HOLE/DATA), when looking for delalloc in a range corresponding to a hole or a prealloc extent, if we found the whole range marked as delalloc in the inode's io_tree, then we can terminate immediately and avoid searching the extent map tree. If not, and if the found delalloc starts at the same offset of our search start but ends before our search range's end, then we can adjust the search range for the search in the extent map tree. So implement those changes. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e6c93be91a06..9b1f76109682 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3216,7 +3216,7 @@ out: static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { - const u64 len = end + 1 - start; + u64 len = end + 1 - start; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 em_end; @@ -3242,13 +3242,23 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end delalloc_len = 0; } - /* - * If delalloc was found then *delalloc_start_ret has a sector size - * aligned value (rounded down). - */ - if (delalloc_len > 0) + if (delalloc_len > 0) { + /* + * If delalloc was found then *delalloc_start_ret has a sector size + * aligned value (rounded down). + */ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + if (*delalloc_start_ret == start) { + /* Delalloc for the whole range, nothing more to do. */ + if (*delalloc_end_ret == end) + return true; + /* Else trim our search range for extent maps. 
*/ + start = *delalloc_end_ret + 1; + len = end + 1 - start; + } + } + /* * No outstanding extents means we don't have any delalloc that is * flushing, so return the unflushed range found in the io tree (if any). -- cgit From af979fd618a40009c5cae03de21403848b13a931 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:29 +0000 Subject: btrfs: skip unnecessary delalloc searches during lseek/fiemap During lseek (SEEK_HOLE/DATA) and fiemap, when processing a file range that corresponds to a hole or a prealloc extent, if we find that there is no delalloc marked in the inode's io_tree but there is delalloc due to an extent map in the extent map tree, then on the next iteration that calls find_delalloc_subrange() we can skip searching the io tree again, since on the first call we had no delalloc in the io tree for the whole range. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9b1f76109682..99cc95487d42 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3214,6 +3214,7 @@ out: * looping while it gets adjacent subranges, and merging them together. */ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + bool *search_io_tree, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 len = end + 1 - start; @@ -3231,7 +3232,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end spin_lock(&inode->lock); outstanding_extents = inode->outstanding_extents; - if (inode->delalloc_bytes > 0) { + if (*search_io_tree && inode->delalloc_bytes > 0) { spin_unlock(&inode->lock); *delalloc_start_ret = start; delalloc_len = count_range_bits(&inode->io_tree, @@ -3257,6 +3258,9 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end start = *delalloc_end_ret + 1; len = end + 1 - start; } + } else { + /* No delalloc, future calls don't need to search again.
*/ + *search_io_tree = false; } /* @@ -3390,6 +3394,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); u64 prev_delalloc_end = 0; + bool search_io_tree = true; bool ret = false; while (cur_offset < end) { u64 delalloc_start; u64 delalloc_end; bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, + &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) -- cgit From 8ddc8274e4be9a35eafc5bef9ff64d94f452d193 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:30 +0000 Subject: btrfs: search for delalloc more efficiently during lseek/fiemap During lseek (SEEK_HOLE/DATA) and fiemap, when processing a file range that corresponds to a hole or a prealloc extent, we have to check if there's any delalloc in the range. We do it by searching for delalloc ranges in the inode's io_tree (for unflushed delalloc) and in the inode's extent map tree (for delalloc that is flushing). We avoid searching the extent map tree if the number of outstanding extents is 0, as in that case we can't have extent maps for our search range in the tree that correspond to delalloc that is flushing. However if we have any unflushed delalloc, due to buffered writes or mmap writes, then the outstanding extents counter is not 0 and we'll search the extent map tree. The tree may be large because it can have lots of extent maps that were loaded by reads or created by previous writes, therefore taking a significant time to search the tree, especially if we have a file with a lot of holes and/or prealloc extents. We can improve on this by searching the inode's ordered extents tree instead of the extent map tree, since when delalloc is flushing we create an ordered extent along with the new extent map, while holding the respective file range locked in the inode's io_tree. The ordered extents tree is typically much smaller, since ordered extents have a short life and get removed from the tree once they are completed, while extent maps can stay for a very long time in the extent map tree, either created by previous writes or loaded by read operations. So use the ordered extents tree instead of the extent maps tree. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option).
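For context, the access pattern that motivates this series looks roughly like the following userspace sketch, which walks a sparse file the way cp and tar do; this is illustrative only and not part of the patch:

    #define _GNU_SOURCE          /* for SEEK_DATA / SEEK_HOLE */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int fd;
            off_t end, data = 0, hole;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            end = lseek(fd, 0, SEEK_END);

            while (fd >= 0 && data < end) {
                    data = lseek(fd, data, SEEK_DATA);
                    if (data < 0)
                            break;  /* only a trailing hole remains */
                    hole = lseek(fd, data, SEEK_HOLE);
                    printf("data: [%lld, %lld)\n", (long long)data,
                           (long long)hole);
                    data = hole;    /* next iteration starts after the hole */
            }
            close(fd);
            return 0;
    }

Every SEEK_DATA/SEEK_HOLE call that lands on a hole or prealloc range ends up in find_delalloc_subrange(), which is what the changes below make cheaper.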
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 152 ++++++++++++++++++-------------------------------------- 1 file changed, 48 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 99cc95487d42..6bc2397e324c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3218,29 +3218,27 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 len = end + 1 - start; - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - u64 em_end; - u64 delalloc_len; - unsigned int outstanding_extents; + u64 delalloc_len = 0; + struct btrfs_ordered_extent *oe; + u64 oe_start; + u64 oe_end; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet. */ - spin_lock(&inode->lock); - outstanding_extents = inode->outstanding_extents; - - if (*search_io_tree && inode->delalloc_bytes > 0) { - spin_unlock(&inode->lock); - *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, - delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); - } else { - spin_unlock(&inode->lock); - delalloc_len = 0; + if (*search_io_tree) { + spin_lock(&inode->lock); + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1); + } else { + spin_unlock(&inode->lock); + } } if (delalloc_len > 0) { @@ -3254,7 +3252,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end /* Delalloc for the whole range, nothing more to do. */ if (*delalloc_end_ret == end) return true; - /* Else trim our search range for extent maps. */ + /* Else trim our search range for ordered extents. */ start = *delalloc_end_ret + 1; len = end + 1 - start; } @@ -3264,110 +3262,56 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end } /* - * No outstanding extents means we don't have any delalloc that is - * flushing, so return the unflushed range found in the io tree (if any). - */ - if (outstanding_extents == 0) - return (delalloc_len > 0); - - /* - * Now also check if there's any extent map in the range that does not - * map to a hole or prealloc extent. We do this because: + * Now also check if there's any ordered extent in the range. 
+ * We do this because: * * 1) When delalloc is flushed, the file range is locked, we clear the - * EXTENT_DELALLOC bit from the io tree and create an extent map for - * an allocated extent. So we might just have been called after - * delalloc is flushed and before the ordered extent completes and - * inserts the new file extent item in the subvolume's btree; + * EXTENT_DELALLOC bit from the io tree and create an extent map and + * an ordered extent for the write. So we might just have been called + * after delalloc is flushed and before the ordered extent completes + * and inserts the new file extent item in the subvolume's btree; * - * 2) We may have an extent map created by flushing delalloc for a + * 2) We may have an ordered extent created by flushing delalloc for a * subrange that starts before the subrange we found marked with * EXTENT_DELALLOC in the io tree. + * + * We could also use the extent map tree to find such delalloc that is + * being flushed, but using the ordered extents tree is more efficient + * because it's usually much smaller as ordered extents are removed from + * the tree once they complete. With the extent maps, we may have them + * in the extent map tree for a very long time, and they were either + * created by previous writes or loaded by read operations. */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - read_unlock(&em_tree->lock); + oe = btrfs_lookup_first_ordered_range(inode, start, len); + if (!oe) return (delalloc_len > 0); - } - - /* extent_map_end() returns a non-inclusive end offset. */ - em_end = extent_map_end(em); - - /* - * If we have a hole/prealloc extent map, check the next one if this one - * ends before our range's end. - */ - if ((em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { - struct extent_map *next_em; - - next_em = btrfs_next_extent_map(em_tree, em); - free_extent_map(em); - - /* - * There's no next extent map or the next one starts beyond our - * range, return the range found in the io tree (if any). - */ - if (!next_em || next_em->start > end) { - read_unlock(&em_tree->lock); - free_extent_map(next_em); - return (delalloc_len > 0); - } - em_end = extent_map_end(next_em); - em = next_em; - } - - read_unlock(&em_tree->lock); + /* The ordered extent may span beyond our search range. */ + oe_start = max(oe->file_offset, start); + oe_end = min(oe->file_offset + oe->num_bytes - 1, end); - /* - * We have a hole or prealloc extent that ends at or beyond our range's - * end, return the range found in the io tree (if any). - */ - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - free_extent_map(em); - return (delalloc_len > 0); - } + btrfs_put_ordered_extent(oe); - /* - * We don't have any range as EXTENT_DELALLOC in the io tree, so the - * extent map is the only subrange representing delalloc. - */ + /* Don't have unflushed delalloc, return the ordered extent range. */ if (delalloc_len == 0) { - *delalloc_start_ret = em->start; - *delalloc_end_ret = min(end, em_end - 1); - free_extent_map(em); + *delalloc_start_ret = oe_start; + *delalloc_end_ret = oe_end; return true; } /* - * The extent map represents a delalloc range that starts before the - * delalloc range we found in the io tree. + * We have both unflushed delalloc (io_tree) and an ordered extent. + * If the ranges are adjacent, return a combined range, otherwise + * return the leftmost range.
*/ - if (em->start < *delalloc_start_ret) { - *delalloc_start_ret = em->start; - /* - * If the ranges are adjacent, return a combined range. - * Otherwise return the extent map's range. - */ - if (em_end < *delalloc_start_ret) - *delalloc_end_ret = min(end, em_end - 1); - - free_extent_map(em); - return true; + if (oe_start < *delalloc_start_ret) { + if (oe_end < *delalloc_start_ret) + *delalloc_end_ret = oe_end; + *delalloc_start_ret = oe_start; + } else if (*delalloc_end_ret + 1 == oe_start) { + *delalloc_end_ret = oe_end; } - /* - * The extent map starts after the delalloc range we found in the io - * tree. If it's adjacent, return a combined range, otherwise return - * the range found in the io tree. - */ - if (*delalloc_end_ret + 1 == em->start) - *delalloc_end_ret = min(end, em_end - 1); - - free_extent_map(em); return true; } -- cgit From cfd7a17d9b4588dd7a29e1a131257bee3e72b766 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:31 +0000 Subject: btrfs: remove no longer used btrfs_next_extent_map() There are no more users of btrfs_next_extent_map(), the previous patch in the series ("btrfs: search for delalloc more efficiently during lseek/fiemap") removed the last usage of the function, so delete it. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 29 ----------------------------- fs/btrfs/extent_map.h | 2 -- 2 files changed, 31 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4a4362f5cc52..be94030e1dfb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -529,35 +529,6 @@ static struct extent_map *next_extent_map(const struct extent_map *em) return container_of(next, struct extent_map, rb_node); } -/* - * Get the extent map that immediately follows another one. - * - * @tree: The extent map tree that the extent map belong to. - * Holding read or write access on the tree's lock is required. - * @em: An extent map from the given tree. The caller must ensure that - * between getting @em and between calling this function, the - * extent map @em is not removed from the tree - for example, by - * holding the tree's lock for the duration of those 2 operations. - * - * Returns the extent map that immediately follows @em, or NULL if @em is the - * last extent map in the tree. 
- */ -struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, - const struct extent_map *em) -{ - struct extent_map *next; - - /* The lock must be acquired either in read mode or write mode. */ - lockdep_assert_held(&tree->lock); - ASSERT(extent_map_in_tree(em)); - - next = next_extent_map(em); - if (next) - refcount_inc(&next->refs); - - return next; -} - static struct extent_map *prev_extent_map(struct extent_map *em) { struct rb_node *prev; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 68d3f2c9ea1d..ad311864272a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -87,8 +87,6 @@ static inline u64 extent_map_block_end(struct extent_map *em) void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, - const struct extent_map *em); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); -- cgit From 8c6e53a79d16b3651ad3abeb415e1c637da75082 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:32 +0000 Subject: btrfs: allow passing a cached state record to count_range_bits() An inode's io_tree can be quite large and there are cases where due to delalloc it can have thousands of extent state records, which makes the red black tree have a depth of 10 or more, making the operation of count_range_bits() slow if we repeatedly call it for a range that starts where, or after, the previous one we called it for ended. Such use cases are when searching for delalloc in a file range that corresponds to a hole or a prealloc extent, which is done during lseek SEEK_HOLE/DATA and fiemap. So introduce a cached state parameter to count_range_bits() which we use to store the last extent state record we visited, and then allow the caller to pass it again on its next call to count_range_bits(). The next patches in the series will make fiemap and lseek use the new parameter. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option).
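A condensed caller-side sketch of the new parameter (hypothetical usage, mirroring what the fiemap/lseek patches later in the series do; start/end/next_start/next_end are made-up offsets):

    struct extent_state *cached = NULL;
    u64 found_start = start;
    u64 bytes;

    /* First call: on return, @cached points to the last record visited. */
    bytes = count_range_bits(&inode->io_tree, &found_start, end,
                             end + 1 - start, EXTENT_DELALLOC, 1, &cached);

    /* A follow-up call starting at or after the previous one can resume
     * from @cached instead of descending the rbtree from its root. */
    found_start = next_start;
    bytes = count_range_bits(&inode->io_tree, &found_start, next_end,
                             next_end + 1 - next_start, EXTENT_DELALLOC, 1,
                             &cached);

    /* Drop the reference taken on the cached record. */
    free_extent_state(cached);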
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 2 +- 4 files changed, 49 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 285b0ff6e953..6b0d78df7eee 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1521,9 +1521,11 @@ out: */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) + u32 bits, int contig, + struct extent_state **cached_state) { - struct extent_state *state; + struct extent_state *state = NULL; + struct extent_state *cached; u64 cur_start = *start; u64 total_bytes = 0; u64 last = 0; @@ -1534,11 +1536,41 @@ u64 count_range_bits(struct extent_io_tree *tree, spin_lock(&tree->lock); + if (!cached_state || !*cached_state) + goto search; + + cached = *cached_state; + + if (!extent_state_in_tree(cached)) + goto search; + + if (cached->start <= cur_start && cur_start <= cached->end) { + state = cached; + } else if (cached->start > cur_start) { + struct extent_state *prev; + + /* + * The cached state starts after our search range's start. Check + * if the previous state record starts at or before the range we + * are looking for, and if so, use it - this is a common case + * when there are holes between records in the tree. If there is + * no previous state record, we can start from our cached state. + */ + prev = prev_state(cached); + if (!prev) + state = cached; + else if (prev->start <= cur_start && cur_start <= prev->end) + state = prev; + } + /* * This search will find all the extents that end after our range * starts. 
*/ - state = tree_search(tree, cur_start); +search: + if (!state) + state = tree_search(tree, cur_start); + while (state) { if (state->start > search_end) break; @@ -1559,7 +1591,16 @@ u64 count_range_bits(struct extent_io_tree *tree, } state = next_state(state); } + + if (cached_state) { + free_extent_state(*cached_state); + *cached_state = state; + if (state) + refcount_inc(&state->refs); + } + spin_unlock(&tree->lock); + return total_bytes; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 18ab82f62611..e3eeec380844 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -119,7 +119,8 @@ void __cold extent_state_free_cachep(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig); + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6bc2397e324c..dc8399610ca3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3235,7 +3235,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end *delalloc_start_ret = start; delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); + len, EXTENT_DELALLOC, 1, + NULL); } else { spin_unlock(&inode->lock); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aec1b232a71c..83898bca39d5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1769,7 +1769,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0); + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; -- cgit From 1ee51a06255d425c1b7effff095f0bf9c7240078 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:33 +0000 Subject: btrfs: update stale comment for count_range_bits() The comment for count_range_bits() mentions that the search is fast if we are asking for a range with the EXTENT_DIRTY bit set. However that is no longer true since we don't use that bit and the optimization for that was removed in: commit 71528e9e16c7 ("btrfs: get rid of extent_io_tree::dirty_bytes") So remove that part of the comment mentioning the no longer existing optimized case, and, while at it, add proper documentation describing the purpose, arguments and return value of the function. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6b0d78df7eee..21fa15123af8 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1515,9 +1515,29 @@ out: } /* - * Count the number of bytes in the tree that have a given bit(s) set. This - * can be fairly slow, except for EXTENT_DIRTY which is cached. The total - * number found is returned. + * Count the number of bytes in the tree that have a given bit(s) set for a + * given range. + * + * @tree: The io tree to search. + * @start: The start offset of the range. This value is updated to the + * offset of the first byte found with the given bit(s), so it + * can end up being bigger than the initial value. 
+ * @search_end: The end offset (inclusive value) of the search range. + * @max_bytes: The maximum byte count we are interested in. The search stops + * once it reaches this count. + * @bits: The bits the range must have in order to be accounted for. + * If multiple bits are set, then only subranges that have all + * the bits set are accounted for. + * @contig: Indicate if we should ignore holes in the range or not. If + * this is true, then stop once we find a hole. + * @cached_state: A cached state to be used across multiple calls to this + * function in order to speed up searches. Use NULL if this is + * called only once or if each call does not start where the + * previous one ended. + * + * Returns the total number of bytes found within the given range that have + * all given bits set. If the returned number of bytes is greater than zero + * then @start is updated with the offset of the first byte with the bits set. */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, -- cgit From b3e744fe6d289bec77b138b6201201e4148dcb0e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:34 +0000 Subject: btrfs: use cached state when looking for delalloc ranges with fiemap During fiemap, whenever we find a hole or prealloc extent, we will look for delalloc in that range, and one of the things we do for that is to find out ranges in the inode's io_tree marked with EXTENT_DELALLOC, using calls to count_range_bits(). Since we process file extents from left to right, if we have a file with several holes or prealloc extents, we benefit from keeping a cached extent state record for calls to count_range_bits(). Most of the time the last extent state record we visited in one call to count_range_bits() matches the first extent state record we will use in the next call to count_range_bits(), so there's a benefit here. So use an extent state record to cache results from count_range_bits() calls during fiemap. This change is part of a patchset whose goal is to improve performance for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option).
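To make the @cached_state contract concrete, here is a minimal caller-side sketch (illustrative only, not code from the patch; the btrfs_inode pointer and the start/end offsets are assumed to be in scope) of a left-to-right walk that reuses the cached state across calls:

	struct extent_state *cached = NULL;
	u64 cur = start;

	while (cur <= end) {
		u64 found = cur;
		u64 bytes;

		/* Each call resumes near where the previous one stopped. */
		bytes = count_range_bits(&inode->io_tree, &found, end,
					 end - cur + 1, EXTENT_DELALLOC, 1,
					 &cached);
		if (bytes == 0)
			break;
		/* Process the subrange [found, found + bytes - 1] here. */
		cur = found + bytes;
	}
	/* Drop the reference held by the cached state (NULL is safe). */
	free_extent_state(cached);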
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 ++++++++++- fs/btrfs/file.c | 10 +++++++--- fs/btrfs/file.h | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33cc3a9a2b68..704ae7c08867 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3698,6 +3698,7 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path static int fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, + struct extent_state **delalloc_cached_state, struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, @@ -3722,6 +3723,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, bool delalloc; delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) @@ -3892,6 +3894,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, { const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; + struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; @@ -3966,6 +3969,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 range_end = min(key.offset, lockend) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, range_end); if (ret < 0) { @@ -4006,6 +4010,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, disk_bytenr, extent_offset, extent_gen, key.offset, @@ -4013,6 +4018,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } else if (disk_bytenr == 0) { /* We have an explicit hole. 
*/ ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, key.offset, extent_end - 1); } else { @@ -4070,7 +4076,8 @@ check_eof_delalloc: path = NULL; if (!stopped && prev_extent_end < lockend) { - ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, lockend - 1); if (ret < 0) goto out_unlock; @@ -4088,6 +4095,7 @@ check_eof_delalloc: delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, + &delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) @@ -4102,6 +4110,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); out: + free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); return ret; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dc8399610ca3..da390f891220 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3214,6 +3214,7 @@ out: * looping while it gets adjacent subranges, and merging them together. */ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, bool *search_io_tree, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { @@ -3236,7 +3237,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, len, EXTENT_DELALLOC, 1, - NULL); + cached_state); } else { spin_unlock(&inode->lock); } @@ -3324,6 +3325,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * sector size aligned. * @end: The end offset (inclusive value) of the search range. * It does not need to be sector size aligned. + * @cached_state: Extent state record used for speeding up delalloc + * searches in the inode's io_tree. Can be NULL. * @delalloc_start_ret: Output argument, set to the start offset of the * subrange found with delalloc (may not be sector size * aligned). @@ -3335,6 +3338,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * end offsets of the subrange. 
*/ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); @@ -3348,7 +3352,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, - &search_io_tree, + cached_state, &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) @@ -3400,7 +3404,7 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, u64 delalloc_end; bool delalloc; - delalloc = btrfs_find_delalloc_in_range(inode, start, end, + delalloc = btrfs_find_delalloc_in_range(inode, start, end, NULL, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index f3d794e33d46..82b34fbb295f 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -27,6 +27,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); #endif -- cgit From 3c32c7212f1639471ec0197ff1179b8ef2e0f3d3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:35 +0000 Subject: btrfs: use cached state when looking for delalloc ranges with lseek During lseek (SEEK_HOLE/DATA), whenever we find a hole or prealloc extent, we will look for delalloc in that range, and one of the things we do for that is to find out ranges in the inode's io_tree marked with EXTENT_DELALLOC, using calls to count_range_bits(). Typically there's a single, or few, searches in the io_tree for delalloc per lseek call. However it's common for applications to keep calling lseek with SEEK_HOLE and SEEK_DATA to find where extents and holes are in a file, read the extents and skip holes in order to avoid unnecessary IO and save disk space by preserving holes. One popular user is the cp utility from coreutils. Starting with coreutils 9.0, cp uses SEEK_HOLE and SEEK_DATA to iterate over the extents of a file. Before 9.0, it used fiemap to figure out where holes and extents are in the source file. Another popular user is the tar utility when used with the --sparse / -S option to detect and preserve holes. Given that the pattern is to keep calling lseek with a start offset that matches the returned offset from the previous lseek call, we can benefit from caching the last extent state visited in count_range_bits() and use it for the next count_range_bits() from the next lseek call. Example, the following strace excerpt from running tar: $ strace tar cJSvf foo.tar.xz qemu_disk_file.raw (...) lseek(5, 125019574272, SEEK_HOLE) = 125024989184 lseek(5, 125024989184, SEEK_DATA) = 125024993280 lseek(5, 125024993280, SEEK_HOLE) = 125025239040 lseek(5, 125025239040, SEEK_DATA) = 125025255424 lseek(5, 125025255424, SEEK_HOLE) = 125025353728 lseek(5, 125025353728, SEEK_DATA) = 125025357824 lseek(5, 125025357824, SEEK_HOLE) = 125026766848 lseek(5, 125026766848, SEEK_DATA) = 125026770944 lseek(5, 125026770944, SEEK_HOLE) = 125027053568 (...) Shows that pattern, which is the same as with cp from coreutils 9.0+. 
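For reference, the userspace side of that access pattern is essentially the following loop (a hedged sketch, not code taken from coreutils or tar; fd is an assumed open file descriptor):

#define _GNU_SOURCE
#include <unistd.h>

static void walk_extents(int fd)
{
	off_t data = 0, hole;

	/* Each lseek() starts exactly where the previous one returned,
	 * which is what makes the kernel-side cached state effective. */
	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		/* Copy or report the data in [data, hole) here. */
		data = hole;
	}
	/* SEEK_DATA fails with ENXIO once data reaches end of file. */
}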
So start using a cached state for the delalloc searches in lseek, and store it in struct file's private data so that it can be reused across lseek calls. This change is part of a patchset that is comprised of the following patches: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek The following test was run before and after applying the whole patchset: $ cat test-cp.sh #!/bin/bash DEV=/dev/sdh MNT=/mnt/sdh # coreutils 8.32, cp uses fiemap to detect holes and extents #CP_PROG=/usr/bin/cp # coreutils 9.1, cp uses SEEK_HOLE/DATA to detect holes and extents CP_PROG=/home/fdmanana/git/hub/coreutils/src/cp umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT FILE_SIZE=$((1024 * 1024 * 1024)) echo "Creating file with a size of $((FILE_SIZE / 1024 / 1024))M" # Create a very sparse file, where each extent has a length of 4K and # is preceded by a 4K hole and followed by another 4K hole. start=$(date +%s%N) echo -n > $MNT/foobar for ((off = 0; off < $FILE_SIZE; off += 8192)); do xfs_io -c "pwrite -S 0xab $off 4K" $MNT/foobar > /dev/null echo -ne "\r$off / $FILE_SIZE ..." done end=$(date +%s%N) echo -e "\nFile created ($(( (end - start) / 1000000 )) milliseconds)" start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds with data/metadata cached and delalloc" # Flush all delalloc. sync start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds with data/metadata cached and no delalloc" # Unmount and mount again to test the case without any metadata # loaded in memory. 
umount $MNT mount $DEV $MNT start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds without data/metadata cached and no delalloc" umount $MNT The results, running on a box with a non-debug kernel (Debian's default kernel config), were the following: 128M file, before patchset: cp took 16574 milliseconds with data/metadata cached and delalloc cp took 122 milliseconds with data/metadata cached and no delalloc cp took 20144 milliseconds without data/metadata cached and no delalloc 128M file, after patchset: cp took 6277 milliseconds with data/metadata cached and delalloc cp took 109 milliseconds with data/metadata cached and no delalloc cp took 210 milliseconds without data/metadata cached and no delalloc 512M file, before patchset: cp took 14369 milliseconds with data/metadata cached and delalloc cp took 429 milliseconds with data/metadata cached and no delalloc cp took 88034 milliseconds without data/metadata cached and no delalloc 512M file, after patchset: cp took 12106 milliseconds with data/metadata cached and delalloc cp took 427 milliseconds with data/metadata cached and no delalloc cp took 824 milliseconds without data/metadata cached and no delalloc 1G file, before patchset: cp took 10074 milliseconds with data/metadata cached and delalloc cp took 886 milliseconds with data/metadata cached and no delalloc cp took 181261 milliseconds without data/metadata cached and no delalloc 1G file, after patchset: cp took 3320 milliseconds with data/metadata cached and delalloc cp took 880 milliseconds with data/metadata cached and no delalloc cp took 1801 milliseconds without data/metadata cached and no delalloc Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5649f8907984..99defab7e1f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -426,6 +426,7 @@ struct btrfs_drop_extents_args { struct btrfs_file_private { void *filldir_buf; + struct extent_state *llseek_cached_state; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da390f891220..448b143a5cb2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1696,10 +1696,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; - if (private && private->filldir_buf) + if (private) { kfree(private->filldir_buf); - kfree(private); - filp->private_data = NULL; + free_extent_state(private->llseek_cached_state); + kfree(private); + filp->private_data = NULL; + } /* * Set by setattr when we are about to truncate a file from a non-zero @@ -3398,13 +3400,14 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, * is found, it updates @start_ret with the start of the subrange. 
*/ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, + struct extent_state **cached_state, u64 start, u64 end, u64 *start_ret) { u64 delalloc_start; u64 delalloc_end; bool delalloc; - delalloc = btrfs_find_delalloc_in_range(inode, start, end, NULL, + delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; @@ -3447,11 +3450,13 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, return false; } -static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, - int whence) +static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) { + struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); + struct btrfs_file_private *private = file->private_data; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; + struct extent_state **delalloc_cached_state; const loff_t i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; @@ -3476,6 +3481,22 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, inode_get_bytes(&inode->vfs_inode) == i_size) return i_size; + if (!private) { + private = kzalloc(sizeof(*private), GFP_KERNEL); + /* + * No worries if memory allocation failed. + * The private structure is used only for speeding up multiple + * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, + * so everything will still be correct. + */ + file->private_data = private; + } + + if (private) + delalloc_cached_state = &private->llseek_cached_state; + else + delalloc_cached_state = NULL; + /* * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. @@ -3553,6 +3574,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, search_start = offset; found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, search_start, key.offset - 1, &found_start); @@ -3587,6 +3609,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, search_start = offset; found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, search_start, extent_end - 1, &found_start); @@ -3628,7 +3651,8 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, /* We have an implicit hole from the last extent found up to i_size. 
*/ if (!found && start < i_size) { - found = find_desired_extent_in_hole(inode, whence, start, + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, start, i_size - 1, &start); if (!found) start = i_size; @@ -3657,7 +3681,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); - offset = find_desired_extent(BTRFS_I(inode), offset, whence); + offset = find_desired_extent(file, offset, whence); btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } -- cgit From cb3e217bdb39e390f8e64af519acb02af336b53d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 13 Nov 2022 09:32:07 +0800 Subject: btrfs: use btrfs_dev_name() helper to handle missing devices better [BUG] If dev-replace failed to re-construct its data/metadata, the kernel message would be incorrect for the missing device: BTRFS info (device dm-1): dev_replace from <missing disk> (devid 2) to /dev/mapper/test-scratch2 started BTRFS error (device dm-1): failed to rebuild valid logical 38862848 for dev (efault) Note the above "dev (efault)" of the second line. While the first line is properly reporting "<missing disk>". [CAUSE] Although dev-replace is using btrfs_dev_name(), the heavy lifting work is still done by scrub (scrub is reused by both dev-replace and regular scrub). Unfortunately scrub code never uses the btrfs_dev_name() helper, as it's only declared locally inside dev-replace.c. [FIX] Fix the output by: - Move the btrfs_dev_name() helper to volumes.h - Use btrfs_dev_name() to replace open-coded rcu_str_deref() calls Only zoned code is not touched, as I'm not familiar with degraded zoned code. - Constify return value and parameter Now the output looks pretty sane: BTRFS info (device dm-1): dev_replace from <missing disk> (devid 2) to /dev/mapper/test-scratch2 started BTRFS error (device dm-1): failed to rebuild valid logical 38862848 for dev <missing disk> Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/dev-replace.c | 15 +++------------ fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 3 +-- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/scrub.c | 20 +++++++++----------- fs/btrfs/super.c | 2 +- fs/btrfs/volumes.c | 16 ++++++++-------- fs/btrfs/volumes.h | 9 +++++++++ 10 files changed, 36 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7ff0703ef3e4..82e49d985019 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -757,7 +757,7 @@ static int btrfsic_process_superblock_dev_mirror( btrfs_info_in_rcu(fs_info, "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", superblock_bdev, - rcu_str_deref(device->name), dev_bytenr, + btrfs_dev_name(device), dev_bytenr, dev_state->bdev, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9c4a8649a0f4..78696d331639 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -18,7 +18,6 @@ #include "volumes.h" #include "async-thread.h" #include "check-integrity.h" -#include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" @@ -451,14 +450,6 @@ out: return ret; } -static char* btrfs_dev_name(struct btrfs_device *device) -{ - if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - return "<missing disk>"; - else - return rcu_str_deref(device->name); -} - static int
mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_device *src_dev) { @@ -674,7 +665,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, "dev_replace from %s (devid %llu) to %s started", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); /* * from now on, the writes to the srcdev are all duplicated to @@ -933,7 +924,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_name(tgt_device), scrub_ret); error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); @@ -951,7 +942,7 @@ error: "dev_replace from %s (devid %llu) to %s finished", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 559e7f3d727e..91a088210e5a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3944,7 +3944,7 @@ static void btrfs_end_super_write(struct bio *bio) if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, "lost page write due to IO error on %s (%d)", - rcu_str_deref(device->name), + btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); ClearPageUptodate(page); SetPageError(page); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 10cc757a602b..17f599027c3d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6048,7 +6048,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, - rcu_str_deref(device->name), + btrfs_dev_name(device), device->total_bytes); mutex_unlock(&fs_info->chunk_mutex); ret = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 704ae7c08867..65ba5c3658cf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -611,8 +611,7 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, - rcu_str_deref(dev->name), sector); + ino, start, btrfs_dev_name(dev), sector); ret = 0; out_bio_uninit: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7a9e697d9931..bed74a3ff574 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1228,7 +1228,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret == 0 && new_size != old_size) btrfs_info_in_rcu(fs_info, "resize device %s (devid %llu) from %llu to %llu", - rcu_str_deref(device->name), device->devid, + btrfs_dev_name(device), device->devid, old_size, new_size); out_finish: btrfs_exclop_finish(fs_info); @@ -2860,7 +2860,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { - strncpy(di_args->path, rcu_str_deref(dev->name), + strncpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path) - 1); di_args->path[sizeof(di_args->path) - 1] = 0; } else { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 288a97992aa4..0ce5b773867f 100644 --- a/fs/btrfs/scrub.c +++ 
b/fs/btrfs/scrub.c @@ -17,7 +17,6 @@ #include "extent_io.h" #include "dev-replace.h" #include "check-integrity.h" -#include "rcu-string.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" @@ -877,7 +876,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, fs_info->sectorsize, nlink, @@ -891,7 +890,7 @@ err: btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, ret); @@ -922,8 +921,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) /* Super block error, no need to search extent tree. */ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", - errstr, rcu_str_deref(dev->name), - sblock->physical); + errstr, btrfs_dev_name(dev), sblock->physical); return; } path = btrfs_alloc_path(); @@ -954,7 +952,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, - rcu_str_deref(dev->name), + btrfs_dev_name(dev), swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, @@ -1377,7 +1375,7 @@ corrected_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } } else { did_not_correct_error: @@ -1386,7 +1384,7 @@ did_not_correct_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } out: @@ -2332,14 +2330,14 @@ static void scrub_missing_raid56_worker(struct work_struct *work) spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "IO error rebuilding logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "failed to rebuild valid logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else { scrub_write_block_to_dev_replace(sblock); } @@ -4303,7 +4301,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub on devid %llu: filesystem on %s is not writable", - devid, rcu_str_deref(dev->name)); + devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d54bfec8e506..ea83dd9f735a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2336,7 +2336,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * the end of RCU grace period. 
*/ rcu_read_lock(); - seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\"); + seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\"); rcu_read_unlock(); return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49d6e79d4343..2d0950a17f48 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -941,7 +941,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", - devid, rcu_str_deref(device->name), + devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } @@ -2101,7 +2101,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", - rcu_str_deref(device->name), device->devid); + btrfs_dev_name(device), device->devid); return -ETXTBSY; } @@ -6827,7 +6827,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, - (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); btrfsic_check_bio(bio); @@ -7908,7 +7908,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "error %d while searching for dev_stats item for device %s", - ret, rcu_str_deref(device->name)); + ret, btrfs_dev_name(device)); goto out; } @@ -7919,7 +7919,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret != 0) { btrfs_warn_in_rcu(fs_info, "delete too small dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } ret = 1; @@ -7933,7 +7933,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "insert dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } } @@ -7998,7 +7998,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) return; btrfs_err_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -8018,7 +8018,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) btrfs_info_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index efa6a3d48cd8..2c90e50c460a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -12,6 +12,7 @@ #include "async-thread.h" #include "messages.h" #include "disk-io.h" +#include "rcu-string.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -770,6 +771,14 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } +static inline const char *btrfs_dev_name(const struct btrfs_device *device) +{ + if (!device || 
test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + return "<missing disk>"; + else + return rcu_str_deref(device->name); +} + void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); -- cgit From 9f0eac070d23405f18e7a84820bc3d59b1415bec Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 26 Oct 2022 09:36:11 +0800 Subject: btrfs: allocate btrfs_io_context without GFP_NOFAIL The __GFP_NOFAIL flag could loop indefinitely when allocating memory in alloc_btrfs_io_context(). The callers starting from __btrfs_map_block already handle errors so it's safe to drop the flag. Signed-off-by: Li zeming Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2d0950a17f48..e51fd5f1042a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5899,7 +5899,10 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ * and the stripes. */ sizeof(u64) * (total_stripes), - GFP_NOFS|__GFP_NOFAIL); + GFP_NOFS); + + if (!bioc) + return NULL; refcount_set(&bioc->refs, 1); -- cgit From cb649e81dad429ffbd97a689ac0011599952a668 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:31 +0800 Subject: btrfs: refactor checksum calculations in btrfs_lookup_csums_range() The refactoring involves the following parts: - Introduce bytes_to_csum_size() and csum_size_to_bytes() helpers As we have quite some open-coded calculations, some of them are even split into two assignments just to fit the 80 chars limit. - Remove the @csum_size parameter from max_ordered_sum_bytes() Csum size can be fetched from @fs_info. And we will use the csum_size_to_bytes() helper anyway. - Add a comment explaining how we handle the first search result - Use the newly introduced helpers to clean up btrfs_lookup_csums_range() - Move variable declarations to the minimal scope - Never mix number of sectors with bytes There are several locations doing things like: size = min_t(size_t, csum_end - start, max_ordered_sum_bytes(fs_info)); ... size >>= fs_info->sectorsize_bits Or offset = (start - key.offset) >> fs_info->sectorsize_bits; offset *= csum_size; Make sure these variables can only represent BYTES inside the function, by using the above bytes_to_csum_size() helpers.
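As a worked example of the two new helpers (their definitions appear in the diff that follows), assuming the common case of 4 KiB sectors (sectorsize_bits == 12) and CRC32C checksums (csum_size == 4):

	/* 64 KiB of data -> 16 sectors -> 64 bytes of csum data. */
	bytes_to_csum_size(fs_info, SZ_64K);	/* (65536 >> 12) * 4 == 64 */

	/* 64 bytes of csum data -> 16 csums -> 64 KiB of data covered. */
	csum_size_to_bytes(fs_info, 64);	/* (64 / 4) << 12 == 65536 */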
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 68 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 456f71b42a9c..654ad2494bce 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -126,12 +126,26 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, start + len - 1, EXTENT_DIRTY, NULL); } -static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, - u16 csum_size) +static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) { - u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize)); - return ncsums * fs_info->sectorsize; + return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size; +} + +static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size) +{ + ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size)); + + return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits; +} + +static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) +{ + u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum), + fs_info->csum_size); + + return csum_size_to_bytes(fs_info, max_csum_size); } /* @@ -140,9 +154,7 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, */ static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { - int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - - return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; + return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, @@ -526,11 +538,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; LIST_HEAD(tmplist); - unsigned long offset; int ret; - size_t size; - u64 csum_end; - const u32 csum_size = fs_info->csum_size; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -556,16 +564,33 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
+ */ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { - offset = (start - key.offset) >> fs_info->sectorsize_bits; - if (offset * csum_size < + if (bytes_to_csum_size(fs_info, start - key.offset) < btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } while (start <= end) { + u64 csum_end; + leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -585,8 +610,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.offset > start) start = key.offset; - size = btrfs_item_size(leaf, path->slots[0]); - csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); if (csum_end <= start) { path->slots[0]++; continue; @@ -596,8 +621,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); while (start < csum_end) { + unsigned long offset; + size_t size; + size = min_t(size_t, csum_end - start, - max_ordered_sum_bytes(fs_info, csum_size)); + max_ordered_sum_bytes(fs_info)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -608,16 +636,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, sums->bytenr = start; sums->len = (int)size; - offset = (start - key.offset) >> fs_info->sectorsize_bits; - offset *= csum_size; - size >>= fs_info->sectorsize_bits; + offset = bytes_to_csum_size(fs_info, start - key.offset); read_extent_buffer(path->nodes[0], sums->sums, ((unsigned long)item) + offset, - csum_size * size); + bytes_to_csum_size(fs_info, size)); - start += fs_info->sectorsize * size; + start += size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; -- cgit From 97e3823933108cfc648bb08d5ad36251b6588164 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:32 +0800 Subject: btrfs: introduce a bitmap based csum range search function Although we have an existing function, btrfs_lookup_csums_range(), to find all data checksums for a range, it's based on a btrfs_ordered_sum list. For the incoming RAID56 data checksum verification at RMW time, we don't want to waste time by allocating temporary memory. So this patch will introduce a new helper, btrfs_lookup_csums_bitmap(). It will use bitmap based result, which will be a perfect fit for later RAID56 usage. 
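To illustrate the new helper's contract, a caller-side sketch (assumptions: csum_root, fs_info, start and end are in scope and the range is sector aligned; the real RAID56 caller arrives in a later patch of this series):

	const u32 nsectors = (end + 1 - start) >> fs_info->sectorsize_bits;
	u8 *csum_buf = kzalloc(nsectors * fs_info->csum_size, GFP_NOFS);
	unsigned long *csum_bitmap = bitmap_zalloc(nsectors, GFP_NOFS);

	if (csum_buf && csum_bitmap) {
		int ret;

		ret = btrfs_lookup_csums_bitmap(csum_root, start, end,
						csum_buf, csum_bitmap);
		/*
		 * On success, bit i set in csum_bitmap means sector i of
		 * the range has its csum stored at
		 * csum_buf + i * fs_info->csum_size.
		 */
	}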
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/file-item.h | 8 ++-- fs/btrfs/inode.c | 5 +- fs/btrfs/relocation.c | 4 +- fs/btrfs/scrub.c | 8 ++-- fs/btrfs/tree-log.c | 15 +++--- 6 files changed, 144 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 654ad2494bce..352bbb33b76f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -527,9 +527,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return ret; } -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait) +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -661,6 +661,127 @@ fail: return ret; } +/* + * Do the same work as btrfs_lookup_csums_list(), the difference is in how + * we return the result. + * + * This version will set the corresponding bits in @csum_bitmap to represent + * that there is a csum found. + * Each bit represents a sector. Thus caller should ensure @csum_buf passed + * in is large enough to contain all csums. + */ +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_csum_item *item; + const u64 orig_start = start; + int ret; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
+ */ + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + if (bytes_to_csum_size(fs_info, start - key.offset) < + btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + u64 csum_end; + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + unsigned long offset; + size_t size; + u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info, + start - orig_start); + + size = min_t(size_t, csum_end - start, end + 1 - start); + + offset = bytes_to_csum_size(fs_info, start - key.offset); + + read_extent_buffer(path->nodes[0], csum_dest, + ((unsigned long)item) + offset, + bytes_to_csum_size(fs_info, size)); + + bitmap_set(csum_bitmap, + (start - orig_start) >> fs_info->sectorsize_bits, + size >> fs_info->sectorsize_bits); + + start += size; + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + /* * Calculate checksums of the data contained inside a bio. * diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index ba12711cf933..95b371208b30 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -18,9 +18,11 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, u64 offset, bool one_ordered); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83898bca39d5..4248e6cabbdc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1709,9 +1709,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, int ret; LIST_HEAD(list); - ret = btrfs_lookup_csums_range(csum_root, bytenr, - bytenr + num_bytes - 1, &list, 0, - nowait); + ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, + &list, 0, nowait); if (ret == 0 && list_empty(&list)) return 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 56c8afa6f6d2..aa80e51bc8ca 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4357,8 +4357,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + inode->index_cnt; csum_root = btrfs_csum_root(fs_info, disk_bytenr); - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0, false); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + 
disk_bytenr + len - 1, &list, 0, false); if (ret) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0ce5b773867f..52b346795f66 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3238,9 +3238,9 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, extent_dev = bioc->stripes[0].dev; btrfs_put_bioc(bioc); - ret = btrfs_lookup_csums_range(csum_root, extent_start, - extent_start + extent_size - 1, - &sctx->csum_list, 1, false); + ret = btrfs_lookup_csums_list(csum_root, extent_start, + extent_start + extent_size - 1, + &sctx->csum_list, 1, false); if (ret) { scrub_parity_mark_sectors_error(sparity, extent_start, extent_size); @@ -3464,7 +3464,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_logical; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, cur_logical, + ret = btrfs_lookup_csums_list(csum_root, cur_logical, cur_logical + scrub_len - 1, &sctx->csum_list, 1, false); if (ret) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4d9f6803bfbe..8f8d7e7dc3a8 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -826,7 +826,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_file_extent_num_bytes(eb, item); } - ret = btrfs_lookup_csums_range(root->log_root, + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, &ordered_sums, 0, false); if (ret) @@ -4443,9 +4443,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); disk_bytenr += extent_offset; - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0, false); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0, false); if (ret) goto out; @@ -4638,10 +4638,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, /* block start is already adjusted for the file extent offset. */ csum_root = btrfs_csum_root(trans->fs_info, em->block_start); - ret = btrfs_lookup_csums_range(csum_root, - em->block_start + csum_offset, - em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0, false); + ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0, false); if (ret) return ret; -- cgit From c5a415627be4758ebdd274de7148706d6713c7ec Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:33 +0800 Subject: btrfs: raid56: prepare data checksums for later RMW verification This is for later data checksum verification at RMW time. This patch will try to allocate the needed memory for a locked rbio if the rbio is for data exclusively (we don't want to handle mixed bg yet). The memory will be released when the rbio is finished. 
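For a sense of scale, a back-of-the-envelope example (assumed geometry, not taken from the patch): a 3-disk RAID5 full stripe with a 64 KiB stripe length, 4 KiB sectors and CRC32C checksums gives nr_data = 2 and stripe_nsectors = 16, so the two allocations amount to:

	csum_buf:    2 * 16 * 4 bytes = 128 bytes
	csum_bitmap: 2 * 16 = 32 bits (a single unsigned long)

In other words, the per-rbio cost of carrying the checksums until the rbio finishes is negligible.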
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/raid56.h | 12 +++++++++ 2 files changed, 86 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 11be5d0a7eab..5ef4fbb49df2 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -20,6 +20,7 @@ #include "volumes.h" #include "raid56.h" #include "async-thread.h" +#include "file-item.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -835,6 +836,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; + /* * Clear the data bitmap, as the rbio may be cached for later usage. * do this before unlock_stripe() so there will be no new bio * for this bio @@ -2048,6 +2054,67 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, start_async_work(rbio, recover_rbio_work); } +static void fill_data_csums(struct btrfs_raid_bio *rbio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, + rbio->bioc->raid_map[0]); + const u64 start = rbio->bioc->raid_map[0]; + const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << + fs_info->sectorsize_bits; + int ret; + + /* The rbio should not have its csum buffer initialized. */ + ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); + + /* + * Skip the csum search if: + * + * - The rbio doesn't belong to data block groups + * Then we are doing IO for tree blocks, no need to search csums. + * + * - The rbio belongs to mixed block groups + * This is to avoid deadlock, as we're already holding the full + * stripe lock, if we trigger a metadata read, and it needs to do + * raid56 recovery, we will deadlock. + */ + if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || + rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) + return; + + rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * + fs_info->csum_size, GFP_NOFS); + rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, + GFP_NOFS); + if (!rbio->csum_buf || !rbio->csum_bitmap) { + ret = -ENOMEM; + goto error; + } + + ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); + if (ret < 0) + goto error; + if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) + goto no_csum; + return; + +error: + /* + * We failed to allocate memory or grab the csum, but it's not fatal, + * we can still continue. But better to warn users that RMW is no + * longer safe for this particular sub-stripe write. + */ + btrfs_warn_rl(fs_info, +"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", + rbio->bioc->raid_map[0], ret); +no_csum: + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; +} + static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; @@ -2056,6 +2123,13 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) bio_list_init(&bio_list); + /* + * Fill the data csums we need for data verification. We need to fill + * the csum_bitmap/csum_buf first, as our endio function will try to + * verify the data sectors.
+ */ + fill_data_csums(rbio); + ret = rmw_assemble_read_bios(rbio, &bio_list); if (ret < 0) goto out; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index a2e653e93fd8..7c73a443939e 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -129,6 +129,18 @@ struct btrfs_raid_bio { * Thus making it much harder to iterate. */ unsigned long *error_bitmap; + + /* + * Checksum buffer if the rbio is for data. The buffer should cover + * all data sectors (excluding P/Q sectors). + */ + u8 *csum_buf; + + /* + * Each bit represents if the corresponding sector has data csum found. + * Should only cover data sectors (excluding P/Q sectors). + */ + unsigned long *csum_bitmap; }; /* -- cgit From 7a3150723061ba1ac2ba10b1392b1cd75234172a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:34 +0800 Subject: btrfs: raid56: do data csum verification during RMW cycle [BUG] For the following small script, btrfs will be unable to recover the content of file1: mkfs.btrfs -f -m raid1 -d raid5 -b 1G $dev1 $dev2 $dev3 mount $dev1 $mnt xfs_io -f -c "pwrite -S 0xff 0 64k" -c sync $mnt/file1 md5sum $mnt/file1 umount $mnt # Corrupt the above 64K data stripe. xfs_io -f -c "pwrite -S 0x00 323026944 64K" -c sync $dev3 mount $dev1 $mnt # Write a new 64K, which should be in the other data stripe # And this is a sub-stripe write, which will cause RMW xfs_io -f -c "pwrite 0 64k" -c sync $mnt/file2 md5sum $mnt/file1 umount $mnt The above md5sum would fail. [CAUSE] There is a long-existing problem for raid56 (not limited to btrfs raid56) that, if we already have some corrupted on-disk data, and then trigger a sub-stripe write (which needs an RMW cycle), it can cause further damage to the P/Q stripe. Disk 1: data 1 |0x000000000000| <- Corrupted Disk 2: data 2 |0x000000000000| Disk 3: parity |0xffffffffffff| In the above case, data 1 is already corrupted, the original data should be 64KiB of 0xff. At this stage, if we read data 1, and it has a data checksum, we can still recover via the regular RAID56 recovery path. But if now we decide to write some data into data 2, then we need to go RMW. Let's say we want to write 64KiB of '0x00' into data 2, then we read the on-disk data of data 1, calculate the new parity, resulting in the following layout: Disk 1: data 1 |0x000000000000| <- Corrupted Disk 2: data 2 |0x000000000000| <- New '0x00' writes Disk 3: parity |0x000000000000| <- New Parity. But the new parity is calculated using the *corrupted* data 1, so we can no longer recover the correct data of data 1. Thus the corruption is there forever. [FIX] To solve the above problem, this patch will do a full stripe data checksum verification at RMW time. This involves the following changes: - Always read the full stripe (including data/P/Q) when doing RMW Before we only read the missing data sectors, but since we may do a data csum verification and recovery, we need to read everything out. Please note that, if we have a cached rbio, we don't need to read anything, and can treat it the same as a full stripe write. As only a stripe with all its csums matching can be cached. - Verify the data csum during read. The goal is only the rbio stripe sectors, and only if the rbio already has csum_buf/csum_bitmap filled. And sectors which cannot pass csum verification will have their bit set in error_bitmap. - Always call recover_sectors() after we read out all the sectors Since error_bitmap will be updated during read, recover_sectors() can easily find out all the bad sectors and try to recover (if still under tolerance).
And since recover_sectors() is already migrated to use error_bitmap, it can skip vertical stripes which don't have any error. - Verify the repaired sectors against their csums in recover_vertical() - Rename rmw_read_and_wait() to rmw_read_wait_recover() Since we will always recover the sectors, the old name is no longer accurate. Furthermore, since recovery is already done in rmw_read_wait_recover(), we no longer need to call recover_sectors() inside rmw_rbio(). Obviously this will have a performance impact, as we are doing more work during the RMW cycle: - Fetch the data checksums - Do checksum verification for all data stripes - Do checksum verification again after repair But for a full stripe write or a cached rbio we won't have the overhead at all, thus for a fully optimized RAID56 workload (always full stripe write), there should be no extra overhead. To me, the extra overhead looks reasonable, as data consistency is way more important than performance. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 169 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 137 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5ef4fbb49df2..2d90a6b5eb00 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -21,6 +21,7 @@ #include "raid56.h" #include "async-thread.h" #include "file-item.h" +#include "btrfs_inode.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -1433,14 +1434,56 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi bio_size >> rbio->bioc->fs_info->sectorsize_bits); } +/* Verify the data sectors at read time. */ +static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + int total_sector_nr = get_bio_sector_nr(rbio, bio); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + /* No data csum for the whole stripe, no need to verify. */ + if (!rbio->csum_bitmap || !rbio->csum_buf) + return; + + /* P/Q stripes, they have no data csum to verify against. */ + if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) + return; + + bio_for_each_segment_all(bvec, bio, iter_all) { + int bv_offset; + + for (bv_offset = bvec->bv_offset; + bv_offset < bvec->bv_offset + bvec->bv_len; + bv_offset += fs_info->sectorsize, total_sector_nr++) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + + total_sector_nr * fs_info->csum_size; + int ret; + + /* No csum for this sector, skip to the next sector.
 */ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; + + ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, + bv_offset, csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + } + } +} + static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_status) + if (bio->bi_status) { rbio_update_error_bitmap(rbio, bio); - else + } else { set_bio_pages_uptodate(rbio, bio); + verify_bio_data_sectors(rbio, bio); + } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -1469,37 +1512,25 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { - const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; struct bio *bio; int total_sector_nr; int ret = 0; ASSERT(bio_list_size(bio_list) == 0); - /* Build a list of bios to read all the missing data sectors. */ - for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate for the later csum verification and + * recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; - /* - * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a page - * in the bio list we don't need to read it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. If so, - * use it. - */ - if (sector->uptodate) - continue; - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) @@ -1670,6 +1701,42 @@ fail: bio_endio(bio); } +static int verify_one_sector(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *csum_expected; + int ret; + + if (!rbio->csum_bitmap || !rbio->csum_buf) + return 0; + + /* No way to verify P/Q as they are not covered by data csum. */ + if (stripe_nr >= rbio->nr_data) + return 0; + /* + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. + */ + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + } + + ASSERT(sector->page); + + csum_expected = rbio->csum_buf + + (stripe_nr * rbio->stripe_nsectors + sector_nr) * + fs_info->csum_size; + ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, + csum_buf, csum_expected); + return ret; +} + /* * Recover a vertical stripe specified by @sector_nr. * @*pointers are the pre-allocated pointers by the caller, so we don't @@ -1685,6 +1752,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, int faila; int failb; int stripe_nr; + int ret = 0; /* * Now we just use bitmap to mark the horizontal stripes in @@ -1805,12 +1873,23 @@ pstripe: * uptodate. * Especially if we determine to cache the rbio, we need to * have at least all data sectors uptodate.
+ * + * If possible, also check if the repaired sector matches its data + * checksum. */ if (faila >= 0) { + ret = verify_one_sector(rbio, faila, sector_nr); + if (ret < 0) + goto cleanup; + sector = rbio_stripe_sector(rbio, faila, sector_nr); sector->uptodate = 1; } if (failb >= 0) { + ret = verify_one_sector(rbio, failb, sector_nr); + if (ret < 0) + goto cleanup; + sector = rbio_stripe_sector(rbio, failb, sector_nr); sector->uptodate = 1; } @@ -1818,7 +1897,7 @@ pstripe: cleanup: for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) kunmap_local(unmap_array[stripe_nr]); - return 0; + return ret; } static int recover_sectors(struct btrfs_raid_bio *rbio) @@ -2115,7 +2194,7 @@ no_csum: rbio->csum_bitmap = NULL; } -static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) +static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; struct bio *bio; @@ -2136,6 +2215,12 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) submit_read_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* + * We may or may not have any corrupted sectors (including missing dev + * and csum mismatch); just let recover_sectors() handle them all. + */ + ret = recover_sectors(rbio); return ret; out: while ((bio = bio_list_pop(&bio_list))) @@ -2175,6 +2260,28 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, } } +/* + * Determine if we need to read any sector from the disk. + * Should only be utilized in the RMW path, to skip a cached rbio. + */ +static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + int i; + + for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + + /* + * We have a sector which has no page or is not uptodate; + * thus this rbio cannot be a cached one, as a cached rbio must + * have all its data sectors present and uptodate. + */ + if (!sector->page || !sector->uptodate) + return true; + } + return false; +} + static int rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; @@ -2189,9 +2296,13 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) if (ret < 0) return ret; - /* Full stripe write, can write the full stripe right now. */ - if (rbio_is_full(rbio)) + /* + * Either this is a full stripe write, or we have every data sector already + * cached, so we can go to the write path immediately. + */ + if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) goto write; + /* * Now we're doing sub-stripe write, also need all data stripes to do * the full RMW. @@ -2202,16 +2313,10 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - ret = rmw_read_and_wait(rbio); + ret = rmw_read_wait_recover(rbio); if (ret < 0) return ret; - /* We have read errors, try recovery path. */ - if (!bitmap_empty(rbio->error_bitmap, rbio->nr_sectors)) { - ret = recover_rbio(rbio); - if (ret < 0) - return ret; - } write: /* * At this stage we're not allowed to add any new bios to the
-- cgit

From 27137fac4c0628fc8320bb7f1ce3bb9f84b76a9b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 15 Nov 2022 10:44:04 +0100
Subject: btrfs: move struct btrfs_tree_parent_check out of disk-io.h

Move struct btrfs_tree_parent_check out of disk-io.h so that volumes.h and various .c files don't have to include disk-io.h just for it.
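[ editor's note: to make the write hole described in the RMW commit above concrete, here is a stand-alone user-space sketch. It is an illustration only, not kernel code; all names in it are hypothetical, and it models a 2-data + 1-parity RAID5 stripe with plain XOR. The first printf prints 0xff (still recoverable), the second 0x00 (corruption baked into parity) -- verifying data checksums before computing the new parity, as the patch does, catches the bad sector at the first point, where it can still be repaired. ]

  /* Hypothetical model of the RAID5 RMW write hole (cc -o rmw_demo rmw_demo.c). */
  #include <stdio.h>
  #include <string.h>

  #define STRIPE 4 /* tiny stripe size, for illustration only */

  /* XOR two stripes into @out; with two data stripes this is the parity. */
  static void xor_stripes(const unsigned char *a, const unsigned char *b,
                          unsigned char *out)
  {
          for (int i = 0; i < STRIPE; i++)
                  out[i] = a[i] ^ b[i];
  }

  int main(void)
  {
          unsigned char data1[STRIPE], data2[STRIPE], parity[STRIPE], rec[STRIPE];

          memset(data1, 0xff, STRIPE);            /* original data 1 */
          memset(data2, 0x00, STRIPE);            /* original data 2 */
          xor_stripes(data1, data2, parity);      /* parity = 0xff.. */

          memset(data1, 0x00, STRIPE);            /* data 1 silently corrupted */

          xor_stripes(data2, parity, rec);        /* recovery still works here */
          printf("before RMW: data 1 recovers to 0x%02x\n", rec[0]);

          /*
           * Sub-stripe write to data 2: RMW recomputes parity from the
           * corrupted data 1, baking the corruption into the parity.
           */
          memset(data2, 0x00, STRIPE);
          xor_stripes(data1, data2, parity);

          xor_stripes(data2, parity, rec);        /* corruption is now permanent */
          printf("after RMW:  data 1 recovers to 0x%02x\n", rec[0]);
          return 0;
  }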
Reviewed-by: Josef Bacik
Reviewed-by: Johannes Thumshirn
Reviewed-by: Qu Wenruo
Signed-off-by: Christoph Hellwig
Reviewed-by: David Sterba
[ use tree-checker.h for the structure ]
Signed-off-by: David Sterba
--- fs/btrfs/backref.c | 1 + fs/btrfs/disk-io.h | 30 +----------------------------- fs/btrfs/print-tree.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/tree-checker.h | 35 +++++++++++++++++++++++++++++++++-- fs/btrfs/tree-mod-log.c | 1 + fs/btrfs/volumes.h | 2 +- 7 files changed, 39 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 55c072ba6747..21c92c74bf71 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -19,6 +19,7 @@ #include "accessors.h" #include "extent-tree.h" #include "relocation.h" +#include "tree-checker.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 03fe4154ffb8..363935cfc084 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,37 +25,9 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } -/* All the extra info needed to verify the parentness of a tree block. */ -struct btrfs_tree_parent_check { - /* - * The owner check against the tree block. - * - * Can be 0 to skip the owner check. - */ - u64 owner_root; - - /* - * Expected transid, can be 0 to skip the check, but such skip - * should only be utlized for backref walk related code. - */ - u64 transid; - - /* - * The expected first key. - * - * This check can be skipped if @has_first_key is false, such skip - * can happen for case where we don't have the parent node key, - * e.g. reading the tree root, doing backref walk. - */ - struct btrfs_key first_key; - bool has_first_key; - - /* The expected level. Should always be set. */ - u8 level; -}; - struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_tree_parent_check; void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1469aa55ad48..b93c96213304 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "print-tree.h" #include "accessors.h" +#include "tree-checker.h" struct root_name_map { u64 id; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e0522c6c0d67..5c636e00d77d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -28,6 +28,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "tree-checker.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index ece497e26558..bfb5efa4e01f 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,8 +6,39 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H -#include "ctree.h" -#include "extent_io.h" +#include + +struct extent_buffer; +struct btrfs_chunk; + +/* All the extra info needed to verify the parentness of a tree block. */ +struct btrfs_tree_parent_check { + /* + * The owner check against the tree block. + * + * Can be 0 to skip the owner check. + */ + u64 owner_root; + + /* + * Expected transid, can be 0 to skip the check, but such skip + * should only be utilized for backref walk related code. + */ + u64 transid; + + /* + * The expected first key. + * + * This check can be skipped if @has_first_key is false, such skip + * can happen for cases where we don't have the parent node key, + * e.g.
reading the tree root, doing backref walk. + */ + struct btrfs_key first_key; + bool has_first_key; + + /* The expected level. Should always be set. */ + u8 level; +}; /* * Comprehensive leaf checker. diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 779ad44d285f..146a6b198933 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -5,6 +5,7 @@ #include "disk-io.h" #include "fs.h" #include "accessors.h" +#include "tree-checker.h" struct tree_mod_root { u64 logical; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2c90e50c460a..ab551471c1f8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -11,7 +11,7 @@ #include #include "async-thread.h" #include "messages.h" -#include "disk-io.h" +#include "tree-checker.h" #include "rcu-string.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) -- cgit From 103c19723c80bf74e4e70cd6cde3b8783a27aceb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 10:44:05 +0100 Subject: btrfs: split the bio submission path into a separate file The code used by btrfs_submit_bio only interacts with the rest of volumes.c through __btrfs_map_block (which itself is a more generic version of two exported helpers) and does not really have anything to do with volumes.c. Create a new bio.c file and a bio.h header going along with it for the btrfs_bio-based storage layer, which will grow even more going forward. Also update the file with my copyright notice given that a large part of the moved code was written or rewritten by me. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/bio.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/bio.h | 127 +++++++++++++++++++++ fs/btrfs/compression.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/relocation.c | 1 + fs/btrfs/super.c | 2 +- fs/btrfs/tree-log.c | 1 + fs/btrfs/volumes.c | 296 +------------------------------------------------ fs/btrfs/volumes.h | 110 +----------------- 14 files changed, 438 insertions(+), 403 deletions(-) create mode 100644 fs/btrfs/bio.c create mode 100644 fs/btrfs/bio.h (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 84fb3b4c35b0..555c962fdad6 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c new file mode 100644 index 000000000000..9e881dc91bae --- /dev/null +++ b/fs/btrfs/bio.c @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. 
+ */ + +#include +#include "bio.h" +#include "ctree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "dev-replace.h" +#include "rcu-string.h" +#include "zoned.h" + +static struct bio_set btrfs_bioset; + +/* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. + */ +static inline void btrfs_bio_init(struct btrfs_bio *bbio, + btrfs_bio_end_io_t end_io, void *private) +{ + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->end_io = end_io; + bbio->private = private; +} + +/* + * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for + * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio), end_io, private); + return bio; +} + +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + struct btrfs_bio *bbio; + + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, end_io, private); + + bio_trim(bio, offset >> 9, size >> 9); + bbio->iter = bio->bi_iter; + return bio; +} + +static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +{ + if (!dev || !dev->bdev) + return; + if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + return; + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); +} + +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + if (bio->bi_opf & REQ_META) + return fs_info->endio_meta_workers; + return fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + + bbio->end_io(bbio); +} + +static void btrfs_simple_end_io(struct bio *bio) +{ + struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) + btrfs_log_dev_io_error(bio, bbio->device); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { + bbio->end_io(bbio); + } +} + +static void btrfs_raid56_end_io(struct bio *bio) +{ + struct btrfs_io_context *bioc = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; + bbio->end_io(bbio); + + btrfs_put_bioc(bioc); +} + +static void btrfs_orig_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + + if (bio->bi_status) { + atomic_inc(&bioc->error); + btrfs_log_dev_io_error(bio, 
stripe->dev); + } + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. + */ + if (atomic_read(&bioc->error) > bioc->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; + + bbio->end_io(bbio); + btrfs_put_bioc(bioc); +} + +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + + if (bio->bi_status) { + atomic_inc(&stripe->bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* Pass on control to the original bio this one was cloned from */ + bio_endio(stripe->bioc->orig_bio); + bio_put(bio); +} + +static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) +{ + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + bio_io_error(bio); + return; + } + + bio_set_dev(bio, dev->bdev); + + /* + * For zone append writing, bi_sector must point to the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, + dev->fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } + btrfs_debug_in_rcu(dev->fs_info, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), + dev->devid, bio->bi_iter.bi_size); + + btrfsic_check_bio(bio); + submit_bio(bio); +} + +static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) +{ + struct bio *orig_bio = bioc->orig_bio, *bio; + + ASSERT(bio_op(orig_bio) != REQ_OP_READ); + + /* Reuse the bio embedded into the btrfs_bio for the last mirror */ + if (dev_nr == bioc->num_stripes - 1) { + bio = orig_bio; + bio->bi_end_io = btrfs_orig_write_end_io; + } else { + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + bio_inc_remaining(orig_bio); + bio->bi_end_io = btrfs_clone_write_end_io; + } + + bio->bi_private = &bioc->stripes[dev_nr]; + bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; + bioc->stripes[dev_nr].bioc = bioc; + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); +} + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +{ + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + int ret; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (ret) { + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + return; + } + + if (map_length < length) { + btrfs_crit(fs_info, + "mapping failed logical %llu bio len %llu len %llu", + logical, length, map_length); + BUG(); + } + + if (!bioc) { + /* Single mirror read/write fast path */ + btrfs_bio(bio)->mirror_num = mirror_num; + btrfs_bio(bio)->device = smap.dev; + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + bio->bi_private = fs_info; + bio->bi_end_io = btrfs_simple_end_io; + btrfs_submit_dev_bio(smap.dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* Parity
RAID write or read recovery */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) + raid56_parity_recover(bio, bioc, mirror_num); + else + raid56_parity_write(bio, bioc); + } else { + /* Write to multiple mirrors */ + int total_devs = bioc->num_stripes; + int dev_nr; + + bioc->orig_bio = bio; + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); + } +} + +int __init btrfs_bioset_init(void) +{ + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void __cold btrfs_bioset_exit(void) +{ + bioset_exit(&btrfs_bioset); +} diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h new file mode 100644 index 000000000000..b12f84b3b341 --- /dev/null +++ b/fs/btrfs/bio.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#ifndef BTRFS_BIO_H +#define BTRFS_BIO_H + +#include +#include +#include "tree-checker.h" + +struct btrfs_bio; +struct btrfs_fs_info; + +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + +/* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + +/* + * Additional info to pass along bio. + * + * Mostly for btrfs specific features like csum and mirror_num. + */ +struct btrfs_bio { + unsigned int mirror_num:7; + + /* + * Extra indicator for metadata bios. + * For some btrfs bios they use pages without a mapping, thus + * we can not rely on page->mapping->host to determine if + * it's a metadata bio. + */ + unsigned int is_metadata:1; + struct bvec_iter iter; + + /* for direct I/O */ + u64 file_offset; + + /* @device is for stripe IO submission. */ + struct btrfs_device *device; + union { + /* For data checksum verification. */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + }; + + /* For metadata parentness verification. */ + struct btrfs_tree_parent_check parent_check; + }; + + /* End I/O information supplied to btrfs_bio_alloc */ + btrfs_bio_end_io_t end_io; + void *private; + + /* For read end I/O handling */ + struct work_struct end_io_work; + + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_bio but relies on bio being last. + */ + struct bio bio; +}; + +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_bio, bio); +} + +int __init btrfs_bioset_init(void); +void __cold btrfs_bioset_exit(void); + +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private); + + +static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + bbio->end_io(bbio); +} + +static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) +{ + if (bbio->is_metadata) + return; + if (bbio->csum != bbio->csum_inline) { + kfree(bbio->csum); + bbio->csum = NULL; + } +} + +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. 
+ * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iter - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); + +#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 30adf430241e..5122ca79f7ea 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -27,7 +27,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "ordered-data.h" #include "compression.h" #include "extent_io.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91a088210e5a..d5be259845d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -23,7 +23,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "locking.h" #include "tree-log.h" diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 17f599027c3d..892d78c1853c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -42,6 +42,7 @@ #include "root-tree.h" #include "file-item.h" #include "orphan.h" +#include "tree-checker.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 65ba5c3658cf..95d54b5834c0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,7 +20,7 @@ #include "extent_map.h" #include "ctree.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "check-integrity.h" #include "locking.h" #include "rcu-string.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 352bbb33b76f..5de73466b2ca 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -14,7 +14,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "compression.h" #include "fs.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4248e6cabbdc..905ea19df125 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,7 +43,7 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" -#include "volumes.h" +#include "bio.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index aa80e51bc8ca..31ec4a7658ce 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -35,6 +35,7 @@ #include "file-item.h" #include "relocation.h" #include "super.h" +#include "tree-checker.h" /* * Relocation overview diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ea83dd9f735a..93f52ee85f6f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -35,7 +35,7 @@ #include "print-tree.h" #include "props.h" #include "xattr.h" -#include "volumes.h" +#include "bio.h" #include "export.h" #include "compression.h" #include "rcu-string.h" diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8f8d7e7dc3a8..4387cd2ade97 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -29,6 +29,7 @@ #include "file-item.h" #include
"file.h" #include "orphan.h" +#include "tree-checker.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e51fd5f1042a..acab20f2863d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5,12 +5,9 @@ #include #include -#include #include -#include #include #include -#include #include #include #include @@ -23,8 +20,6 @@ #include "print-tree.h" #include "volumes.h" #include "raid56.h" -#include "async-thread.h" -#include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" @@ -41,8 +36,6 @@ #include "scrub.h" #include "super.h" -static struct bio_set btrfs_bioset; - #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -255,11 +248,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, - int *mirror_num_ret, int need_raid_map); /* * Device locking @@ -6364,11 +6352,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * stripe_offset + stripe_nr * map->stripe_len; } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, - int *mirror_num_ret, int need_raid_map) +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6651,266 +6639,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, NULL, NULL, 1); } -/* - * Initialize a btrfs_bio structure. This skips the embedded bio itself as it - * is already initialized by the block layer. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->end_io = end_io; - bbio->private = private; -} - -/* - * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. - * - * Just like the underlying bio_alloc_bioset it will not fail as it is backed by - * a mempool. 
- */ -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - btrfs_bio_end_io_t end_io, void *private) -{ - struct bio *bio; - - bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); - return bio; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) -{ - struct bio *bio; - struct btrfs_bio *bbio; - - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); - - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); - - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; -} - -static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) -{ - if (!dev || !dev->bdev) - return; - if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) - return; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); -} - -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, - struct bio *bio) -{ - if (bio->bi_opf & REQ_META) - return fs_info->endio_meta_workers; - return fs_info->endio_workers; -} - -static void btrfs_end_bio_work(struct work_struct *work) -{ - struct btrfs_bio *bbio = - container_of(work, struct btrfs_bio, end_io_work); - - bbio->end_io(bbio); -} - -static void btrfs_simple_end_io(struct bio *bio) -{ - struct btrfs_fs_info *fs_info = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(fs_info); - - if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); - - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { - bbio->end_io(bbio); - } -} - -static void btrfs_raid56_end_io(struct bio *bio) -{ - struct btrfs_io_context *bioc = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); - - btrfs_put_bioc(bioc); -} - -static void btrfs_orig_write_end_io(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - struct btrfs_io_context *bioc = stripe->bioc; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - - if (bio->bi_status) { - atomic_inc(&bioc->error); - btrfs_log_dev_io_error(bio, stripe->dev); - } - - /* - * Only send an error to the higher layers if it is beyond the tolerance - * threshold. 
- */ - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - - bbio->end_io(bbio); - btrfs_put_bioc(bioc); -} - -static void btrfs_clone_write_end_io(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - - if (bio->bi_status) { - atomic_inc(&stripe->bioc->error); - btrfs_log_dev_io_error(bio, stripe->dev); - } - - /* Pass on control to the original bio this one was cloned from */ - bio_endio(stripe->bioc->orig_bio); - bio_put(bio); -} - -static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) -{ - if (!dev || !dev->bdev || - test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (btrfs_op(bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bio_io_error(bio); - return; - } - - bio_set_dev(bio, dev->bdev); - - /* - * For zone append writing, bi_sector must point the beginning of the - * zone - */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } - } - btrfs_debug_in_rcu(dev->fs_info, - "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, - (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), - dev->devid, bio->bi_iter.bi_size); - - btrfsic_check_bio(bio); - submit_bio(bio); -} - -static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) -{ - struct bio *orig_bio = bioc->orig_bio, *bio; - - ASSERT(bio_op(orig_bio) != REQ_OP_READ); - - /* Reuse the bio embedded into the btrfs_bio for the last mirror */ - if (dev_nr == bioc->num_stripes - 1) { - bio = orig_bio; - bio->bi_end_io = btrfs_orig_write_end_io; - } else { - bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); - bio_inc_remaining(orig_bio); - bio->bi_end_io = btrfs_clone_write_end_io; - } - - bio->bi_private = &bioc->stripes[dev_nr]; - bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; - bioc->stripes[dev_nr].bioc = bioc; - btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); -} - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } - - if (!bioc) { - /* Single mirror read/write fast path */ - btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; - bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); - } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ - bio->bi_private = bioc; - bio->bi_end_io = btrfs_raid56_end_io; - if (bio_op(bio) == 
REQ_OP_READ) - raid56_parity_recover(bio, bioc, mirror_num); - else - raid56_parity_write(bio, bioc); - } else { - /* Write to multiple mirrors */ - int total_devs = bioc->num_stripes; - int dev_nr; - - bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) - btrfs_submit_mirrored_bio(bioc, dev_nr); - } -} - static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { @@ -8440,17 +8168,3 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) return true; } - -int __init btrfs_bioset_init(void) -{ - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; -} - -void __cold btrfs_bioset_exit(void) -{ - bioset_exit(&btrfs_bioset); -} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab551471c1f8..6b7a05f6cf82 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,7 +6,6 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H -#include #include #include #include "async-thread.h" @@ -373,8 +372,6 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; }; -#define BTRFS_BIO_INLINE_CSUM_SIZE 64 - #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -384,107 +381,6 @@ struct btrfs_fs_devices { - 2 * sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) -/* - * Maximum number of sectors for a single bio to limit the size of the - * checksum array. This matches the number of bio_vecs per bio and thus the - * I/O size for buffered I/O. - */ -#define BTRFS_MAX_BIO_SECTORS (256) - -typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); - -/* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. - */ -struct btrfs_bio { - unsigned int mirror_num:7; - - /* - * Extra indicator for metadata bios. - * For some btrfs bios they use pages without a mapping, thus - * we can not rely on page->mapping->host to determine if - * it's a metadata bio. - */ - unsigned int is_metadata:1; - struct bvec_iter iter; - - /* for direct I/O */ - u64 file_offset; - - /* @device is for stripe IO submission. */ - struct btrfs_device *device; - union { - /* For data checksum verification. */ - struct { - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - }; - - /* For metadata parentness verification. */ - struct btrfs_tree_parent_check parent_check; - }; - - /* End I/O information supplied to btrfs_bio_alloc */ - btrfs_bio_end_io_t end_io; - void *private; - - /* For read end I/O handling */ - struct work_struct end_io_work; - - /* - * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire btrfs_bio but relies on bio being last. 
 - */ - struct bio bio; -}; - -static inline struct btrfs_bio *btrfs_bio(struct bio *bio) -{ - return container_of(bio, struct btrfs_bio, bio); -} - -int __init btrfs_bioset_init(void); -void __cold btrfs_bioset_exit(void); - -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - -static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - bbio->end_io(bbio); -} - -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->is_metadata) - return; - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} - -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. - * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - struct btrfs_io_stripe { struct btrfs_device *dev; union { @@ -641,6 +537,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); @@ -652,7 +553,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path,
-- cgit

From bacf60e515862904dd91a332f3a54f092416eaa6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 15 Nov 2022 10:44:06 +0100
Subject: btrfs: move repair_io_failure to bio.c

repair_io_failure ties directly into all the gory low-level details of mapping a bio with a logical address to the actual physical location. Move it right below btrfs_submit_bio to keep all the related logic together. Also move btrfs_repair_eb_io_failure to its caller in disk-io.c now that repair_io_failure is available in a header.
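[ editor's note: the repair write pattern described above can be illustrated with a small user-space analogy. This is a hypothetical sketch, not the kernel implementation: it rewrites only the one bad replica, synchronously, the way btrfs_repair_io_failure() writes only the bad mirror instead of all copies. The file arguments and the fixed 4K sector size are assumptions made for the example. ]

  /* Hypothetical one-copy repair write (cc -o repair_demo repair_demo.c). */
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  #define SECTOR 4096

  /* Copy one known-good sector over the bad replica and flush it. */
  static int repair_one_sector(int good_fd, int bad_fd, off_t offset)
  {
          char buf[SECTOR];

          if (pread(good_fd, buf, SECTOR, offset) != SECTOR)
                  return -1;
          /* Write only the bad copy; the good mirror is left untouched. */
          if (pwrite(bad_fd, buf, SECTOR, offset) != SECTOR)
                  return -1;
          /* Wait for the write, like the submit_bio_wait() in the patch. */
          return fsync(bad_fd);
  }

  int main(int argc, char **argv)
  {
          int good_fd, bad_fd;

          if (argc != 4) {
                  fprintf(stderr, "usage: %s <good> <bad> <offset>\n", argv[0]);
                  return 1;
          }
          good_fd = open(argv[1], O_RDONLY);
          bad_fd = open(argv[2], O_WRONLY);
          if (good_fd < 0 || bad_fd < 0)
                  return 1;
          return repair_one_sector(good_fd, bad_fd, atoll(argv[3])) ? 1 : 0;
  }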
Reviewed-by: Josef Bacik
Reviewed-by: Johannes Thumshirn
Signed-off-by: Christoph Hellwig
Reviewed-by: David Sterba
Signed-off-by: David Sterba
--- fs/btrfs/bio.c | 90 +++++++++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.c | 24 +++++++++++ fs/btrfs/extent_io.c | 117 +-------------------------------------------------- fs/btrfs/extent_io.h | 1 - 4 files changed, 116 insertions(+), 116 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 9e881dc91bae..b8fb7ef6b520 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -276,6 +276,96 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror } } +/* + * Submit a repair write. + * + * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * RAID setup. Here we only want to write the one bad copy, so we do the + * mapping ourselves and submit the bio directly. + * + * The I/O is issued synchronously to block the repair read completion from + * freeing the bio. + */ +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) +{ + struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; + u64 map_length = 0; + u64 sector; + struct btrfs_io_context *bioc = NULL; + int ret = 0; + + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); + BUG_ON(!mirror_num); + + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; + + map_length = length; + + /* + * Avoid races with device replace and make sure our bioc has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); + if (btrfs_is_parity_mirror(fs_info, logical, length)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. + */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bioc, 0); + if (ret) + goto out_counter_dec; + ASSERT(bioc->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bioc, mirror_num); + if (ret) + goto out_counter_dec; + BUG_ON(mirror_num != bioc->mirror_num); + } + + sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; + dev = bioc->stripes[bioc->mirror_num - 1].dev; + btrfs_put_bioc(bioc); + + if (!dev || !dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { + ret = -EIO; + goto out_counter_dec; + } + + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { + /* try to remap that extent elsewhere?
*/ + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + goto out_bio_uninit; + } + + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", + ino, start, btrfs_dev_name(dev), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: + btrfs_bio_counter_dec(fs_info); + return ret; +} + int __init btrfs_bioset_init(void) { if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d5be259845d1..0888d484df80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -255,6 +255,30 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, return ret; } +static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + u64 start = eb->start; + int i, num_pages = num_extent_pages(eb); + int ret = 0; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, + start, p, start - page_offset(p), mirror_num); + if (ret) + break; + start += PAGE_SIZE; + } + + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 95d54b5834c0..8528e7d3f38f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -531,119 +531,6 @@ static void free_io_failure(struct btrfs_inode *inode, kfree(rec); } -/* - * this bypasses the standard btrfs submit functions deliberately, as - * the standard behavior is to write all copies in a raid setup. here we only - * want to write the one bad copy. so we do the mapping for ourselves and issue - * submit_bio directly. - * to avoid any synchronization issues, wait for the data after writing, which - * actually prevents the read that triggered the error from finishing. - * currently, there can be no more than two copies of every data bit. thus, - * exactly one rewrite is required. - */ -static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) -{ - struct btrfs_device *dev; - struct bio_vec bvec; - struct bio bio; - u64 map_length = 0; - u64 sector; - struct btrfs_io_context *bioc = NULL; - int ret = 0; - - ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); - BUG_ON(!mirror_num); - - if (btrfs_repair_one_zone(fs_info, logical)) - return 0; - - map_length = length; - - /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are doing the - * read repair operation. - */ - btrfs_bio_counter_inc_blocked(fs_info); - if (btrfs_is_parity_mirror(fs_info, logical, length)) { - /* - * Note that we don't use BTRFS_MAP_WRITE because it's supposed - * to update all raid stripes, but here we just want to correct - * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad - * stripe's dev and sector. 
- */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bioc, 0); - if (ret) - goto out_counter_dec; - ASSERT(bioc->mirror_num == 1); - } else { - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bioc, mirror_num); - if (ret) - goto out_counter_dec; - BUG_ON(mirror_num != bioc->mirror_num); - } - - sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - dev = bioc->stripes[bioc->mirror_num - 1].dev; - btrfs_put_bioc(bioc); - - if (!dev || !dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - ret = -EIO; - goto out_counter_dec; - } - - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = sector; - __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); - ret = submit_bio_wait(&bio); - if (ret) { - /* try to remap that extent elsewhere? */ - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - goto out_bio_uninit; - } - - btrfs_info_rl_in_rcu(fs_info, - "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, btrfs_dev_name(dev), sector); - ret = 0; - -out_bio_uninit: - bio_uninit(&bio); -out_counter_dec: - btrfs_bio_counter_dec(fs_info); - return ret; -} - -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; - int i, num_pages = num_extent_pages(eb); - int ret = 0; - - if (sb_rdonly(fs_info->sb)) - return -EROFS; - - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, - start - page_offset(p), mirror_num); - if (ret) - break; - start += PAGE_SIZE; - } - - return ret; -} - static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) { if (cur_mirror == failrec->num_copies) @@ -691,7 +578,7 @@ int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, mirror = failrec->this_mirror; do { mirror = prev_mirror(failrec, mirror); - repair_io_failure(fs_info, ino, start, failrec->len, + btrfs_repair_io_failure(fs_info, ino, start, failrec->len, failrec->logical, page, pg_offset, mirror); } while (mirror != failrec->failed_mirror); @@ -822,7 +709,7 @@ int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_ * * Since we're only doing repair for one sector, we only need to get * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. + * everything for btrfs_repair_io_failure to do the rest for us. */ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); if (failrec->this_mirror == failrec->failed_mirror) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a0bafc7f6c07..9b486419854e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -245,7 +245,6 @@ int extent_invalidate_folio(struct extent_io_tree *tree, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we -- cgit From 1fe5ebc4e17dfbffeefe52abab75b02ad0f3a97e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:10 -0500 Subject: btrfs: move root helpers back into ctree.h These accidentally got brought into accessors.h, but belong with the btrfs_root definitions which are currently in ctree.h. 
Move these to make it easier to sync accessors.[ch] into btrfs-progs.

Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
--- fs/btrfs/accessors.h | 17 ----------------- fs/btrfs/ctree.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index cb59b69d2af1..57ba6894a5f4 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -712,23 +712,6 @@ BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); -static inline bool btrfs_root_readonly(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -static inline bool btrfs_root_dead(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; -} - -static inline u64 btrfs_root_id(const struct btrfs_root *root) -{ - return root->root_key.objectid; -} - /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 99defab7e1f6..4e58b0532b36 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -335,6 +335,23 @@ struct btrfs_root { #endif }; +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range.
-- cgit

From 3a3178c7f7675661d7bd0c67f57420e20982bd34 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 15 Nov 2022 11:16:11 -0500
Subject: btrfs: move leaf_data_end into ctree.c

This is only used in ctree.c, with the exception of zeroing out extent buffers we're getting ready to write out. In theory we shouldn't have an extent buffer with 0 items that we're writing out; however, I'd rather be safe than sorry, so open code it in extent_io.c and then copy the helper into ctree.c. This will make it easier to sync accessors.[ch] into btrfs-progs, as this requires a helper that isn't defined in accessors.h.

Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
--- fs/btrfs/accessors.h | 13 ------------- fs/btrfs/ctree.c | 13 +++++++++++++ fs/btrfs/extent_io.c | 6 +++++- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 57ba6894a5f4..b9d9a69685df 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -897,19 +897,6 @@ const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); -/* - * The leaf data grows from end-to-front in the node.
this returns the address - * of the start of the last item, which is the stop of the leaf data stack. - */ -static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset(leaf, nr - 1); -} - /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f75e398d7b71..dc38c24a0ffa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -51,6 +51,19 @@ static const struct btrfs_csums { .driver = "blake2b-256" }, }; +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. + */ +static unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8528e7d3f38f..9fc9f8068069 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2537,7 +2537,11 @@ static void prepare_eb_write(struct extent_buffer *eb) * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); + end = BTRFS_LEAF_DATA_OFFSET; + if (nritems == 0) + end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); + else + end += btrfs_item_offset(eb, nritems - 1); memzero_extent_buffer(eb, start, end - start); } } -- cgit From 6bfd0ffa6f2ae0ead92af7c4521626cd456115c5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:12 -0500 Subject: btrfs: move file_extent_item helpers into file-item.h These helpers use functions that are in multiple places, which makes it tricky to sync them into btrfs-progs. Move them to file-item.h and then include file-item.h in places that use these helpers. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 22 ---------------------- fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 8 -------- fs/btrfs/file-item.h | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/tree-checker.c | 1 + 5 files changed, 35 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index b9d9a69685df..f0d017f9407f 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -915,16 +915,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); -static inline unsigned long btrfs_file_extent_inline_start( - const struct btrfs_file_extent_item *e) -{ - return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; -} BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, @@ -946,18 +936,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); -/* - * Returns the number of bytes used by the item on disk, minus the size of any - * extent headers. 
If a file is compressed on disk, this is the compressed - * size. - */ -static inline u32 btrfs_file_extent_inline_item_len( - const struct extent_buffer *eb, - int nr) -{ - return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dc38c24a0ffa..e85b243be5a2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -22,6 +22,7 @@ #include "accessors.h" #include "extent-tree.h" #include "relocation.h" +#include "file-item.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4e58b0532b36..428f51efd455 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -461,14 +461,6 @@ static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) -{ - return BTRFS_MAX_ITEM_SIZE(info) - - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 95b371208b30..031225668434 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,6 +3,39 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include "accessors.h" + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) + +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) +{ + return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* + * Return the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. + */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 1c2d418dda6a..32e051101a27 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -28,6 +28,7 @@ #include "btrfs_inode.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" /* * Error message should follow the following format: -- cgit From 9b48addac406bd10afe447c2182b241dbde1d1a6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:13 -0500 Subject: btrfs: move eb offset helpers into extent_io.h These are very specific to how the extent buffer is defined, so this differs between btrfs-progs and the kernel. Make things easier by moving these helpers into extent_io.h so we don't have to worry about this when syncing ctree.h. 
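[Editor's note] To illustrate how the two helpers moved here cooperate, below is a hypothetical usage sketch (not code from this patch) of copying bytes out of an extent buffer one page at a time; the eb->pages layout and the helper names are taken from the diff that follows, everything else is an assumption for the example.

	/*
	 * Hypothetical sketch: copy @len bytes starting at @start out of an
	 * extent buffer, walking page by page. Illustrative only.
	 */
	static void example_read_eb(const struct extent_buffer *eb,
				    unsigned long start, unsigned long len,
				    char *dst)
	{
		while (len > 0) {
			unsigned long index = get_eb_page_index(start);
			size_t offset = get_eb_offset_in_page(eb, start);
			unsigned long cur = min_t(unsigned long, len,
						  PAGE_SIZE - offset);
			char *kaddr = page_address(eb->pages[index]);

			memcpy(dst, kaddr + offset, cur);
			dst += cur;
			start += cur;
			len -= cur;
		}
	}
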
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 33 --------------------------------- fs/btrfs/extent_io.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 428f51efd455..6169d3b28475 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -686,39 +686,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(struct extent_buffer *leaf); -/* - * Get the correct offset inside the page of extent buffer. - * - * @eb: target extent buffer - * @start: offset inside the extent buffer - * - * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. - */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. - * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. - */ - return offset_in_page(offset + eb->start); -} - -static inline unsigned long get_eb_page_index(unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. - * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. - */ - return offset >> PAGE_SHIFT; -} - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9b486419854e..a2c82448b2e0 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -95,6 +95,39 @@ struct extent_buffer { #endif }; +/* + * Get the correct offset inside the page of extent buffer. + * + * @eb: target extent buffer + * @start: offset inside the extent buffer + * + * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. + */ +static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, + unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, eb->start will always be aligned + * to PAGE_SIZE, thus adding it won't cause any difference. + * + * For sectorsize < PAGE_SIZE, we must only read the data that belongs + * to the eb, thus we have to take the eb->start into consideration. + */ + return offset_in_page(offset + eb->start); +} + +static inline unsigned long get_eb_page_index(unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * + * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, + * and have ensured that all tree blocks are contained in one page, + * thus we always get index == 0. + */ + return offset >> PAGE_SHIFT; +} + /* * Structure to record how many bytes and which ranges are set/cleared */ -- cgit From 0e6c40ebbb18b32cf4b5fcb3173eaede89b9f440 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:14 -0500 Subject: btrfs: move the csum helpers into ctree.h These got moved because of copy+paste, but this code exists in ctree.c, so move the declarations back into ctree.h. 
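[Editor's note] As a quick reminder of what these declarations cover, a caller reporting the checksum configuration of a superblock might look like the following; this is an illustrative sketch, the example function and the pr_info() output format are assumptions, not kernel code.

	/* Illustrative only: report a super block's checksum configuration. */
	static void example_report_csum(const struct btrfs_super_block *sb)
	{
		u16 csum_type = btrfs_super_csum_type(sb);

		pr_info("csum %s (type %u, %d bytes)\n",
			btrfs_super_csum_name(csum_type), csum_type,
			btrfs_super_csum_size(sb));
	}
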
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 5 ----- fs/btrfs/ctree.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index f0d017f9407f..066a662f38c3 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -892,11 +892,6 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6169d3b28475..6965703a81b6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -700,6 +700,11 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. -- cgit From 42c9419a4c01910e9c46b0c2bb9090f76295bf01 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:15 -0500 Subject: btrfs: pass the extent buffer for the btrfs_item_nr helpers This is actually a change for extent tree v2, but it exists in btrfs-progs but not in the kernel. This makes it annoying to sync accessors.h with btrfs-progs, and since this is the way I need it for extent-tree v2 simply update these helpers to take the extent buffer in order to make syncing possible now, and make the extent tree v2 stuff easier moving forward. 
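[Editor's note] The practical effect on callers is small but mechanical; a hypothetical call site under the new convention (the example function itself is an assumption) looks like this:

	/* Hypothetical sketch of the new calling convention. */
	static unsigned long example_item_start(const struct extent_buffer *leaf,
						int slot)
	{
		/* Before this patch the eb was implicit: btrfs_item_nr_offset(slot). */
		return btrfs_item_nr_offset(leaf, slot);
	}
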
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 18 +++++++++--------- fs/btrfs/ctree.c | 35 ++++++++++++++++++----------------- fs/btrfs/extent_io.c | 2 +- fs/btrfs/tree-checker.c | 4 ++-- 4 files changed, 30 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 066a662f38c3..2b4fb696142b 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -417,37 +417,37 @@ BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); -static inline unsigned long btrfs_item_nr_offset(int nr) +static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_leaf, items) + sizeof(struct btrfs_item) * nr; } -static inline struct btrfs_item *btrfs_item_nr(int nr) +static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr) { - return (struct btrfs_item *)btrfs_item_nr_offset(nr); + return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr); } #define BTRFS_ITEM_SETGET_FUNCS(member) \ static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ { \ - return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \ } \ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ int slot, u32 val) \ { \ - btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ } \ static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ int slot) \ { \ - struct btrfs_item *item = btrfs_item_nr(slot); \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ return btrfs_token_raw_item_##member(token, item); \ } \ static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ int slot, u32 val) \ { \ - struct btrfs_item *item = btrfs_item_nr(slot); \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ btrfs_set_token_raw_item_##member(token, item, val); \ } @@ -462,7 +462,7 @@ static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(nr); + struct btrfs_item *item = btrfs_item_nr(eb, nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } @@ -470,7 +470,7 @@ static inline void btrfs_item_key(const struct extent_buffer *eb, static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(nr); + struct btrfs_item *item = btrfs_item_nr(eb, nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e85b243be5a2..cdc112d3bab2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3033,13 +3033,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), - btrfs_item_nr_offset(0), + memmove_extent_buffer(right, btrfs_item_nr_offset(right, push_items), + btrfs_item_nr_offset(right, 0), right_nritems * sizeof(struct btrfs_item)); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(0), - 
btrfs_item_nr_offset(left_nritems - push_items), + copy_extent_buffer(right, left, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(left, left_nritems - push_items), push_items * sizeof(struct btrfs_item)); /* update the item pointers */ @@ -3233,8 +3233,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, /* push data from right to left */ copy_extent_buffer(left, right, - btrfs_item_nr_offset(btrfs_header_nritems(left)), - btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left, btrfs_header_nritems(left)), + btrfs_item_nr_offset(right, 0), push_items * sizeof(struct btrfs_item)); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - @@ -3272,8 +3272,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(right), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(push_items), + memmove_extent_buffer(right, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(left, push_items), (btrfs_header_nritems(right) - push_items) * sizeof(struct btrfs_item)); } @@ -3407,8 +3407,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), + copy_extent_buffer(right, l, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(l, mid), nritems * sizeof(struct btrfs_item)); copy_extent_buffer(right, l, @@ -3784,8 +3784,8 @@ static noinline int split_item(struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot + 1), + btrfs_item_nr_offset(leaf, slot), (nritems - slot) * sizeof(struct btrfs_item)); } @@ -4077,9 +4077,10 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_extent_buffer(leaf, + btrfs_item_nr_offset(leaf, slot + batch->nr), + btrfs_item_nr_offset(leaf, slot), + (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + @@ -4333,8 +4334,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_token_item_offset(&token, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), - btrfs_item_nr_offset(slot + nr), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot), + btrfs_item_nr_offset(leaf, slot + nr), sizeof(struct btrfs_item) * (nritems - slot - nr)); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9fc9f8068069..f1df13c5fc6f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2536,7 +2536,7 @@ static void prepare_eb_write(struct extent_buffer *eb) * Leaf: * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 */ - start = btrfs_item_nr_offset(nritems); + start = btrfs_item_nr_offset(eb, nritems); end = BTRFS_LEAF_DATA_OFFSET; if (nritems == 0) end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 32e051101a27..baad1ed7e111 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1784,10 +1784,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* Also check if the item pointer overlaps with btrfs item. */ if (unlikely(btrfs_item_ptr_offset(leaf, slot) < - btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) { + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) { generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", - btrfs_item_nr_offset(slot) + + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); return -EUCLEAN; -- cgit From e23efd8e8767165a6103cf0a4fe273f6b9f182f2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:16 -0500 Subject: btrfs: add eb to btrfs_node_key_ptr_offset This is a change needed for extent tree v2, as we will be growing the header size. This exists in btrfs-progs currently, and not having it makes syncing accessors.[ch] more problematic. So make this change to set us up for extent tree v2 and match what btrfs-progs does to make syncing easier. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 2 +- fs/btrfs/accessors.h | 4 ++-- fs/btrfs/ctree.c | 28 ++++++++++++++-------------- fs/btrfs/extent_io.c | 2 +- fs/btrfs/tree-mod-log.c | 4 ++-- 5 files changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 7a7b7d263102..206cf1612c1d 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -168,7 +168,7 @@ DEFINE_BTRFS_SETGET_BITS(64) void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - unsigned long ptr = btrfs_node_key_ptr_offset(nr); + unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr); read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 2b4fb696142b..88eea44fdd7f 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -392,7 +392,7 @@ static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); } -static inline unsigned long btrfs_node_key_ptr_offset(int nr) +static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; @@ -406,7 +406,7 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb, { unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); + ptr = btrfs_node_key_ptr_offset(eb, nr); write_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdc112d3bab2..32facf8d7319 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2612,8 +2612,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(dst_nritems), - btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(dst, dst_nritems), + btrfs_node_key_ptr_offset(src, 0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { @@ -2621,8 +2621,8 
@@ static int push_node_left(struct btrfs_trans_handle *trans, * Don't call btrfs_tree_mod_log_insert_move() here, key removal * was already fully logged by btrfs_tree_mod_log_eb_copy() above. */ - memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(push_items), + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), + btrfs_node_key_ptr_offset(src, push_items), (src_nritems - push_items) * sizeof(struct btrfs_key_ptr)); } @@ -2682,8 +2682,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); - memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), - btrfs_node_key_ptr_offset(0), + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), + btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); @@ -2694,8 +2694,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(src_nritems - push_items), + btrfs_node_key_ptr_offset(dst, 0), + btrfs_node_key_ptr_offset(src, src_nritems - push_items), push_items * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(src, src_nritems - push_items); @@ -2798,8 +2798,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); } memmove_extent_buffer(lower, - btrfs_node_key_ptr_offset(slot + 1), - btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(lower, slot + 1), + btrfs_node_key_ptr_offset(lower, slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { @@ -2881,8 +2881,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(split, c, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(mid), + btrfs_node_key_ptr_offset(split, 0), + btrfs_node_key_ptr_offset(c, mid), (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); @@ -4240,8 +4240,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(ret < 0); } memmove_extent_buffer(parent, - btrfs_node_key_ptr_offset(slot), - btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(parent, slot), + btrfs_node_key_ptr_offset(parent, slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f1df13c5fc6f..f1d5a851968f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2529,7 +2529,7 @@ static void prepare_eb_write(struct extent_buffer *eb) /* Set btree blocks beyond nritems with 0 to avoid stale content */ nritems = btrfs_header_nritems(eb); if (btrfs_header_level(eb) > 0) { - end = btrfs_node_key_ptr_offset(nritems); + end = btrfs_node_key_ptr_offset(eb, nritems); memzero_extent_buffer(eb, end, eb->len - end); } else { /* diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 146a6b198933..a555baa0143a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -697,8 +697,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, n--; break; case BTRFS_MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); + o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); break; -- 
cgit From 637e3b48c22e52daa645b49b73630af3f09328ae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:17 -0500 Subject: btrfs: add helpers for manipulating leaf items and data We have some gnarly memmove and copy_extent_buffer calls for leaf manipulation. This is because our item offsets aren't absolute, they're based on 0 being where the items start in the leaf, which is after the btrfs_header. This means any manipulation of the data requires adding sizeof(struct btrfs_header) to the offsets we pull from the items. Moving the items themselves is easier as the helpers are absolute offsets, however we of course have to call the helpers to get the offsets for the item numbers. This makes for copy_extent_buffer/memmove_extent_buffer calls that are kind of hard to reason about what's happening. Fix this by pushing this logic into helpers. For data we'll only use the item provided offsets, and the helpers will use the BTRFS_LEAF_DATA_OFFSET addition for the offsets. Additionally for the item manipulation simply pass in the item numbers, and then the helpers will call the offset helper to get the actual offset into the leaf. The diffstat makes this look like more code, but that's simply because I added comments for the helpers, it's net negative for the amount of code, and is easier to reason. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 181 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 115 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 32facf8d7319..b5d203045dc7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -65,6 +65,91 @@ static unsigned int leaf_data_end(const struct extent_buffer *leaf) return btrfs_item_offset(leaf, nr - 1); } +/* + * Move data in a @leaf (using memmove, safe for overlapping ranges). + * + * @leaf: leaf that we're doing a memmove on + * @dst_offset: item data offset we're moving to + * @src_offset: item data offset were' moving from + * @len: length of the data we're moving + * + * Wrapper around memmove_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void memmove_leaf_data(const struct extent_buffer *leaf, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long len) +{ + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + dst_offset, + BTRFS_LEAF_DATA_OFFSET + src_offset, len); +} + +/* + * Copy item data from @src into @dst at the given @offset. + * + * @dst: destination leaf that we're copying into + * @src: source leaf that we're copying from + * @dst_offset: item data offset we're copying to + * @src_offset: item data offset were' copying from + * @len: length of the data we're copying + * + * Wrapper around copy_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. 
+ */ +static inline void copy_leaf_data(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + copy_extent_buffer(dst, src, BTRFS_LEAF_DATA_OFFSET + dst_offset, + BTRFS_LEAF_DATA_OFFSET + src_offset, len); +} + +/* + * Move items in a @leaf (using memmove). + * + * @dst: destination leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around memmove_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void memmove_leaf_items(const struct extent_buffer *leaf, + int dst_item, int src_item, int nr_items) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), + btrfs_item_nr_offset(leaf, src_item), + nr_items * sizeof(struct btrfs_item)); +} + +/* + * Copy items from @src into @dst at the given @offset. + * + * @dst: destination leaf for the items + * @src: source leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around copy_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void copy_leaf_items(const struct extent_buffer *dst, + const struct extent_buffer *src, + int dst_item, int src_item, int nr_items) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), + btrfs_item_nr_offset(src, src_item), + nr_items * sizeof(struct btrfs_item)); +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); @@ -3022,25 +3107,17 @@ static noinline int __push_leaf_right(struct btrfs_path *path, /* make room in the right data area */ data_end = leaf_data_end(right); - memmove_extent_buffer(right, - BTRFS_LEAF_DATA_OFFSET + data_end - push_space, - BTRFS_LEAF_DATA_OFFSET + data_end, - BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); + memmove_leaf_data(right, data_end - push_space, data_end, + BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), - push_space); + copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(right, push_items), - btrfs_item_nr_offset(right, 0), - right_nritems * sizeof(struct btrfs_item)); + memmove_leaf_items(right, push_items, 0, right_nritems); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(left, left_nritems - push_items), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ btrfs_init_map_token(&token, right); @@ -3232,19 +3309,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ - copy_extent_buffer(left, right, - btrfs_item_nr_offset(left, btrfs_header_nritems(left)), - btrfs_item_nr_offset(right, 0), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 
btrfs_item_offset(right, push_items - 1); - copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(left) - push_space, - BTRFS_LEAF_DATA_OFFSET + - btrfs_item_offset(right, push_items - 1), - push_space); + copy_leaf_data(left, right, leaf_data_end(left) - push_space, + btrfs_item_offset(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); @@ -3267,15 +3338,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (push_items < right_nritems) { push_space = btrfs_item_offset(right, push_items - 1) - leaf_data_end(right); - memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(right), push_space); + memmove_leaf_data(right, + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(right), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(left, push_items), - (btrfs_header_nritems(right) - push_items) * - sizeof(struct btrfs_item)); + memmove_leaf_items(right, 0, push_items, + btrfs_header_nritems(right) - push_items); } btrfs_init_map_token(&token, right); @@ -3407,14 +3475,10 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(l, mid), - nritems * sizeof(struct btrfs_item)); + copy_leaf_items(right, l, 0, mid, nritems); - copy_extent_buffer(right, l, - BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(l), data_copy_size); + copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, + leaf_data_end(l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); @@ -3784,9 +3848,7 @@ static noinline int split_item(struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot + 1), - btrfs_item_nr_offset(leaf, slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3897,9 +3959,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start + new_size - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; u64 offset; @@ -3924,9 +3985,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) } } - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); btrfs_set_disk_key_offset(&disk_key, offset + size_diff); @@ -3991,9 +4051,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - data_size, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + memmove_leaf_data(leaf, data_end - data_size, 
data_end, + old_data - data_end); data_end = old_data; old_size = btrfs_item_size(leaf, slot); @@ -4077,16 +4136,11 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, - btrfs_item_nr_offset(leaf, slot + batch->nr), - btrfs_item_nr_offset(leaf, slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - batch->total_data_size, - BTRFS_LEAF_DATA_OFFSET + data_end, - old_data - data_end); + memmove_leaf_data(leaf, data_end - batch->total_data_size, + data_end, old_data - data_end); data_end = old_data; } @@ -4321,10 +4375,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = 0; i < nr; i++) dsize += btrfs_item_size(leaf, slot + i); - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + dsize, - BTRFS_LEAF_DATA_OFFSET + data_end, - last_off - data_end); + memmove_leaf_data(leaf, data_end + dsize, data_end, + last_off - data_end); btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { @@ -4334,10 +4386,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_token_item_offset(&token, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot), - btrfs_item_nr_offset(leaf, slot + nr), - sizeof(struct btrfs_item) * - (nritems - slot - nr)); + memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); } btrfs_set_header_nritems(leaf, nritems - nr); nritems -= nr; -- cgit From 8009adf306452e9b43db36f2d02fedfe5eca1b5e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:18 -0500 Subject: btrfs: remove BTRFS_LEAF_DATA_OFFSET This is simply the same thing as btrfs_item_nr_offset(leaf, 0), so remove this helper and replace it's usage with the above statement. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 6 ++---- fs/btrfs/ctree.c | 8 ++++---- fs/btrfs/extent_io.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 88eea44fdd7f..e6228ff73c81 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -9,8 +9,6 @@ struct btrfs_map_token { unsigned long offset; }; -#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) - void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); /* @@ -1028,9 +1026,9 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, /* Cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b5d203045dc7..76b99bcc849d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -83,8 +83,8 @@ static inline void memmove_leaf_data(const struct extent_buffer *leaf, unsigned long src_offset, unsigned long len) { - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + dst_offset, - BTRFS_LEAF_DATA_OFFSET + src_offset, len); + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, + btrfs_item_nr_offset(leaf, 0) + src_offset, len); } /* @@ -106,8 +106,8 @@ static inline void copy_leaf_data(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - copy_extent_buffer(dst, src, BTRFS_LEAF_DATA_OFFSET + dst_offset, - BTRFS_LEAF_DATA_OFFSET + src_offset, len); + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, + btrfs_item_nr_offset(src, 0) + src_offset, len); } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f1d5a851968f..83dd3aa59663 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2537,7 +2537,7 @@ static void prepare_eb_write(struct extent_buffer *eb) * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(eb, nritems); - end = BTRFS_LEAF_DATA_OFFSET; + end = btrfs_item_nr_offset(eb, 0); if (nritems == 0) end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); else -- cgit From 0c7030038e6106711c5d0b237c980905dd3244ec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:19 -0500 Subject: btrfs: add nr_global_roots to the super block definition We already have this defined in btrfs-progs, add it to the kernel to make it easier to sync these files into btrfs-progs. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index e6228ff73c81..75c181b579eb 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -889,6 +889,8 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, + nr_global_roots, 64); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, -- cgit From 054056bd0a8da2747e51646a6ac5af6ffbae5b69 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:20 -0500 Subject: btrfs: add stack helpers for a few btrfs items We don't have these defined in the kernel because we don't have any users of these helpers. However we do use them in btrfs-progs, so define them to make keeping accessors.h in sync between progs and the kernel easier. 
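[Editor's note] The distinction is worth spelling out: the regular setget helpers read and write through an extent buffer, while the _stack_ variants introduced below operate on a plain structure in ordinary memory. A hedged sketch of a hypothetical use, filling a dev extent item before copying it into a leaf; the function and its parameters are assumptions for illustration, the setters are the ones added by this patch:

	/* Illustrative sketch: fill a dev extent item in ordinary memory. */
	static void example_fill_dev_extent(struct btrfs_dev_extent *dev_extent,
					    u64 chunk_offset, u64 num_bytes)
	{
		btrfs_set_stack_dev_extent_chunk_tree(dev_extent,
						      BTRFS_CHUNK_TREE_OBJECTID);
		btrfs_set_stack_dev_extent_chunk_objectid(dev_extent,
						BTRFS_FIRST_CHUNK_TREE_OBJECTID);
		btrfs_set_stack_dev_extent_chunk_offset(dev_extent, chunk_offset);
		btrfs_set_stack_dev_extent_length(dev_extent, num_bytes);
	}
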
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 75c181b579eb..ceadfc5d6c66 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -221,12 +221,26 @@ static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); } +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, used, 64); @@ -248,6 +262,8 @@ BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64); /* struct btrfs_inode_extref */ BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, @@ -297,6 +313,14 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64); + BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); @@ -479,6 +503,9 @@ BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); @@ -972,6 +999,16 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags, + struct btrfs_qgroup_limit_item, flags, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer, + struct btrfs_qgroup_limit_item, max_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl, + struct btrfs_qgroup_limit_item, max_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer, + struct btrfs_qgroup_limit_item, rsv_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl, + struct btrfs_qgroup_limit_item, rsv_excl, 64); /* btrfs_dev_replace_item */ BTRFS_SETGET_FUNCS(dev_replace_src_devid, -- cgit From a4c853af0c511d7e0f7cb306bbc8a4f1dbdb64ca Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 16 Nov 2022 22:23:53 +0800 Subject: btrfs: add might_sleep() annotations Add annotations to functions that might sleep due to allocations or IO and could be called from various contexts. In case of btrfs_search_slot it's not obvious why it would sleep: btrfs_search_slot setup_nodes_for_search reada_for_balance btrfs_readahead_node_child btrfs_readahead_tree_block btrfs_find_create_tree_block alloc_extent_buffer kmem_cache_zalloc /* allocate memory non-atomically, might sleep */ kmem_cache_alloc(GFP_NOFS|__GFP_NOFAIL|__GFP_ZERO) read_extent_buffer_pages submit_extent_page /* disk IO, might sleep */ submit_one_bio Other examples where the sleeping could happen is in 3 places might sleep in update_qgroup_limit_item(), as shown below: update_qgroup_limit_item btrfs_alloc_path /* allocate memory non-atomically, might sleep */ kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS) Signed-off-by: ChenXiaoSong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 76b99bcc849d..4754c9101a4c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -184,6 +184,8 @@ size_t __attribute_const__ btrfs_get_num_csums(void) struct btrfs_path *btrfs_alloc_path(void) { + might_sleep(); + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } @@ -2046,6 +2048,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + might_sleep(); + lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); -- cgit From d7c9e1be2876f63fb2178a24e0c1d5733ff98d47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Nov 2022 15:06:09 -0500 Subject: btrfs: fix uninitialized parent in insert_state I don't know how this isn't caught when we build this in the kernel, but while syncing extent-io-tree.c into btrfs-progs I got an error because parent could potentially be uninitialized when we link in a new node, specifically when the extent_io_tree is empty. This means we could have garbage in the parent color. I don't know what the ramifications are of that, but it's probably not great, so fix this by initializing parent to NULL. I spot checked all of our other usages in btrfs and we appear to be doing the correct thing everywhere else. 
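[Editor's note] For reference, the canonical rbtree insertion pattern that makes the fix obvious: when the tree is empty the descent loop never runs, so 'parent' reaches rb_link_node() without ever being assigned unless it starts out NULL. A generic sketch, where entry_less() stands in for whatever comparator the tree uses:

	struct rb_node **node = &tree->rb_node;
	struct rb_node *parent = NULL;	/* must start NULL: loop may not run */

	while (*node) {
		parent = *node;
		if (entry_less(new, *node))	/* stand-in comparator */
			node = &(*node)->rb_left;
		else
			node = &(*node)->rb_right;
	}
	rb_link_node(new, parent, node);
	rb_insert_color(new, tree);
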
Fixes: c7e118cf98c7 ("btrfs: open code rbtree search in insert_state") CC: stable@vger.kernel.org # 6.0+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 21fa15123af8..82ca6a11e11a 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -395,7 +395,7 @@ static int insert_state(struct extent_io_tree *tree, u32 bits, struct extent_changeset *changeset) { struct rb_node **node; - struct rb_node *parent; + struct rb_node *parent = NULL; const u64 end = state->end; set_state_bits(tree, state, bits, changeset); -- cgit From 26df39a9e5a8985674e814f0b27b25e8b4eb9ba7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Nov 2022 15:09:42 -0500 Subject: btrfs: fix uninitialized variable in find_first_clear_extent_bit This was caught when syncing extent-io-tree.c into btrfs-progs. This however isn't really a problem, the only way next would be uninitialized is if we found the range we were looking for, and in this case we don't care about next. However it's a compile error, so fix it up. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 82ca6a11e11a..9ae9cd1e7035 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1425,7 +1425,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; - struct extent_state *prev = NULL, *next; + struct extent_state *prev = NULL, *next = NULL; spin_lock(&tree->lock); -- cgit From 63d5429f68a3d4c4aa27e65a05196c17f86c41d6 Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Sat, 19 Nov 2022 11:13:29 +0300 Subject: btrfs: replace strncpy() with strscpy() Using strncpy() on NUL-terminated strings are deprecated. To avoid possible forming of non-terminated string strscpy() should be used. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
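[Editor's note] The semantic difference in a nutshell: strncpy() neither guarantees NUL termination when the source fills the buffer nor reports truncation, while strscpy() always terminates the destination and returns -E2BIG if it had to truncate. A small sketch of the checking pattern this patch adopts ('src' and the warning text are assumptions for the example):

	char buf[16];

	/* strncpy(): no NUL guarantee if src fills the buffer, no error report. */
	strncpy(buf, src, sizeof(buf));

	/* strscpy(): always NUL-terminates; a negative return means truncation. */
	if (strscpy(buf, src, sizeof(buf)) < 0)
		pr_warn("source string was truncated\n");
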
CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Artem Chernyshev Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 9 +++------ fs/btrfs/rcu-string.h | 6 +++++- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bed74a3ff574..4fd6b61b06a4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2859,13 +2859,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) { - strncpy(di_args->path, btrfs_dev_name(dev), - sizeof(di_args->path) - 1); - di_args->path[sizeof(di_args->path) - 1] = 0; - } else { + if (dev->name) + strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); + else di_args->path[0] = '\0'; - } out: rcu_read_unlock(); diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c1a617eb25d..5c2b66d155ef 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) (len * sizeof(char)), mask); if (!ret) return ret; - strncpy(ret->str, src, len); + /* Warn if the source got unexpectedly truncated. */ + if (WARN_ON(strscpy(ret->str, src, len) < 0)) { + kfree(ret); + return NULL; + } return ret; } -- cgit From 3a8d1db341b93e65fa76051ab833b3640842b6eb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:23 +0000 Subject: btrfs: unify overwrite_item() and do_overwrite_item() After commit 193df6245704 ("btrfs: search for last logged dir index if it's not cached in the inode"), there are no more callers of do_overwrite_item(), except overwrite_item(). Originally both used to be the same function, but were split in commit 086dcbfa50d3 ("btrfs: insert items in batches when logging a directory when possible"), as there was the need to execute all logic of overwrite_item() but skip the tree search, since in the context of directory logging we already had a path with a leaf to copy data from. So unify them again as there is no more need to have them split. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 76 +++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4387cd2ade97..bcc8717f51fd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -365,11 +365,25 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -static int do_overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. 
+ */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -386,22 +400,10 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* Our caller must have done a search for the key for us. */ - ASSERT(path->nodes[0] != NULL); - - /* - * And the slot must point to the exact key or the slot where the key - * should be at (the first item with a key greater than 'key') - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - struct btrfs_key found_key; - - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - ret = btrfs_comp_cpu_keys(&found_key, key); - ASSERT(ret >= 0); - } else { - ret = 1; - } + /* Look for the key in the destination tree. */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; if (ret == 0) { char *src_copy; @@ -579,36 +581,6 @@ no_copy: return 0; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) -{ - int ret; - - /* Look for the key in the destination tree. */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; - - return do_overwrite_item(trans, root, path, eb, slot, key); -} - static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, struct fscrypt_str *name) { @@ -5406,7 +5378,7 @@ struct btrfs_dir_list { * has a size that doesn't match the sum of the lengths of all the logged * names - this is ok, not a problem, because at log replay time we set the * directory's i_size to the correct value (see replay_one_name() and - * do_overwrite_item()). + * overwrite_item()). */ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_inode *start_inode, -- cgit From 3eb423442483d454937cdc8853e58c399ffe5514 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:24 +0000 Subject: btrfs: remove outdated logic from overwrite_item() and add assertion As of commit 193df6245704 ("btrfs: search for last logged dir index if it's not cached in the inode"), the overwrite_item() function is always called for a root that is from a fs/subvolume tree. In other words, now it's only used during log replay to modify a fs/subvolume tree. Therefore we can remove the logic that checks if we are dealing with a log tree at overwrite_item(). 
So remove that logic, replacing it with an assertion and document that if we ever need to support a log root there, we will need to clone the leaf from the fs/subvolume tree and then release it before modifying the log tree, which is needed to avoid a potential deadlock, similar to the one recently fixed by a patch with the subject: "btrfs: do not modify log tree while holding a leaf from fs tree locked" Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcc8717f51fd..a3c43f0b1c95 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -391,11 +391,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; - int overwrite_root = 0; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - overwrite_root = 1; + /* + * This is only used during log replay, so the root is always from a + * fs/subvolume tree. In case we ever need to support a log root, then + * we'll have to clone the leaf in the path, release the path and use + * the leaf before writing into the log tree. See the comments at + * copy_items() for more details. + */ + ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); @@ -548,8 +553,7 @@ insert: goto no_copy; } - if (overwrite_root && - S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(path->nodes[0], -- cgit From 1742e1c90c3da344f3bb9b1f1309b3f47482756a Mon Sep 17 00:00:00 2001 From: void0red Date: Wed, 23 Nov 2022 22:39:45 +0800 Subject: btrfs: fix extent map use-after-free when handling missing device in read_one_chunk Store the error code before freeing the extent_map. Though it's reference counted structure, in that function it's the first and last allocation so this would lead to a potential use-after-free. The error can happen eg. when chunk is stored on a missing device and the degraded mount option is missing. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216721 Reported-by: eriri <1527030098@qq.com> Fixes: adfb69af7d8c ("btrfs: add_missing_dev() should return the actual error") CC: stable@vger.kernel.org # 4.9+ Signed-off-by: void0red Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index acab20f2863d..aa25fa335d3e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6976,8 +6976,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->stripes[i].dev = handle_missing_device(fs_info, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { + ret = PTR_ERR(map->stripes[i].dev); free_extent_map(em); - return PTR_ERR(map->stripes[i].dev); + return ret; } } -- cgit From 162d053e15fe985f754ef495a96eb3db970c43ed Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Nov 2022 15:07:30 +0000 Subject: btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range If we get -ENOMEM while dropping file extent items in a given range, at btrfs_drop_extents(), due to failure to allocate memory when attempting to increment the reference count for an extent or drop the reference count, we handle it with a BUG_ON(). This is excessive, instead we can simply abort the transaction and return the error to the caller. In fact most callers of btrfs_drop_extents(), directly or indirectly, already abort the transaction if btrfs_drop_extents() returns any error. Also, we already have error paths at btrfs_drop_extents() that may return -ENOMEM and in those cases we abort the transaction, like for example anything that changes the b+tree may return -ENOMEM due to a failure to allocate a new extent buffer when COWing an existing extent buffer, such as a call to btrfs_duplicate_item() for example. So replace the BUG_ON() calls with proper logic to abort the transaction and return the error. Reported-by: syzbot+0b1fb6b0108c27419f9f@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000089773e05ee4b9cb4@google.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 448b143a5cb2..91b00eb2440e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -380,7 +380,10 @@ next_slot: args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } } key.offset = args->start; } @@ -467,7 +470,10 @@ delete_extent_item: key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } args->bytes_found += extent_end - key.offset; } -- cgit From b7af0635c87ff78d6bd523298ab7471f9ffd3ce5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 30 Nov 2022 15:51:20 +0000 Subject: btrfs: print transaction aborted messages with an error level Currently we print the transaction aborted message with a debug level, but a transaction abort is an exceptional event that indicates something went wrong and it's useful to have it printed with an error level as it helps analysing problems in a production environment, where debug level messages are typically not logged. 
For example reports from syzbot never include the transaction aborted message, since the log level on the test machines is above the debug level. So change the log level from debug to error. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/messages.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 5ac410e313e4..190af1f698d9 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -197,13 +197,13 @@ do { \ &((trans)->fs_info->fs_state))) { \ first = true; \ if (WARN(abort_should_print_stack(errno), \ - KERN_DEBUG \ + KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (errno))) { \ /* Stack trace printed. */ \ } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ (errno)); \ } \ } \ -- cgit From d01c6ed6db38a4db2921591e4f5425bee1931aca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Dec 2022 14:27:05 -0500 Subject: NFS4.x/pnfs: Fix up logging of layout stateids If the layout is invalid, then just log a '0' value. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2cff5901c689..e3fbdc8a98eb 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1815,7 +1815,7 @@ TRACE_EVENT(pnfs_update_layout, __entry->count = count; __entry->iomode = iomode; __entry->reason = reason; - if (lo != NULL) { + if (lo != NULL && pnfs_layout_is_valid(lo)) { __entry->layoutstateid_seq = be32_to_cpu(lo->plh_stateid.seqid); __entry->layoutstateid_hash = @@ -1869,7 +1869,7 @@ DECLARE_EVENT_CLASS(pnfs_layout_event, __entry->pos = pos; __entry->count = count; __entry->iomode = iomode; - if (lo != NULL) { + if (lo != NULL && pnfs_layout_is_valid(lo)) { __entry->layoutstateid_seq = be32_to_cpu(lo->plh_stateid.seqid); __entry->layoutstateid_hash = -- cgit From e6b842741b4f39007215fd7e545cb55aa3d358a2 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 5 Dec 2022 15:31:36 -0800 Subject: pstore: Avoid kcore oops by vmap()ing with VM_IOREMAP An oops can be induced by running 'cat /proc/kcore > /dev/null' on devices using pstore with the ram backend because kmap_atomic() assumes lowmem pages are accessible with __va(). 
Unable to handle kernel paging request at virtual address ffffff807ff2b000 Mem abort info: ESR = 0x96000006 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000006 CM = 0, WnR = 0 swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000081d87000 [ffffff807ff2b000] pgd=180000017fe18003, p4d=180000017fe18003, pud=180000017fe18003, pmd=0000000000000000 Internal error: Oops: 96000006 [#1] PREEMPT SMP Modules linked in: dm_integrity CPU: 7 PID: 21179 Comm: perf Not tainted 5.15.67-10882-ge4eb2eb988cd #1 baa443fb8e8477896a370b31a821eb2009f9bfba Hardware name: Google Lazor (rev3 - 8) (DT) pstate: a0400009 (NzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __memcpy+0x110/0x260 lr : vread+0x194/0x294 sp : ffffffc013ee39d0 x29: ffffffc013ee39f0 x28: 0000000000001000 x27: ffffff807ff2b000 x26: 0000000000001000 x25: ffffffc0085a2000 x24: ffffff802d4b3000 x23: ffffff80f8a60000 x22: ffffff802d4b3000 x21: ffffffc0085a2000 x20: ffffff8080b7bc68 x19: 0000000000001000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: ffffffd3073f2e60 x14: ffffffffad588000 x13: 0000000000000000 x12: 0000000000000001 x11: 00000000000001a2 x10: 00680000fff2bf0b x9 : 03fffffff807ff2b x8 : 0000000000000001 x7 : 0000000000000000 x6 : 0000000000000000 x5 : ffffff802d4b4000 x4 : ffffff807ff2c000 x3 : ffffffc013ee3a78 x2 : 0000000000001000 x1 : ffffff807ff2b000 x0 : ffffff802d4b3000 Call trace: __memcpy+0x110/0x260 read_kcore+0x584/0x778 proc_reg_read+0xb4/0xe4 During early boot, memblock reserves the pages for the ramoops reserved memory node in DT that would otherwise be part of the direct lowmem mapping. Pstore's ram backend reuses those reserved pages to change the memory type (writeback or non-cached) by passing the pages to vmap() (see pfn_to_page() usage in persistent_ram_vmap() for more details) with specific flags. When read_kcore() starts iterating over the vmalloc region, it runs over the virtual address that vmap() returned for ramoops. In aligned_vread() the virtual address is passed to vmalloc_to_page() which returns the page struct for the reserved lowmem area. That lowmem page is passed to kmap_atomic(), which effectively calls page_to_virt() that assumes a lowmem page struct must be directly accessible with __va() and friends. These pages are mapped via vmap() though, and the lowmem mapping was never made, so accessing them via the lowmem virtual address oopses like above. Let's side-step this problem by passing VM_IOREMAP to vmap(). This will tell vread() to not include the ramoops region in the kcore. Instead the area will look like a bunch of zeros. The alternative is to teach kmap() about vmalloc areas that intersect with lowmem. Presumably such a change isn't a one-liner, and there isn't much interest in inspecting the ramoops region in kcore files anyway, so the most expedient route is taken for now. 
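A minimal sketch of the mapping path in question (simplified from persistent_ram_vmap(); error handling, alignment of the start address, and prot selection are omitted, and the function name is illustrative):

    static void *ram_vmap_sketch(phys_addr_t start, size_t size, pgprot_t prot)
    {
            unsigned int i, page_count = DIV_ROUND_UP(size, PAGE_SIZE);
            struct page **pages;
            void *vaddr;

            pages = kmalloc_array(page_count, sizeof(*pages), GFP_KERNEL);
            if (!pages)
                    return NULL;
            for (i = 0; i < page_count; i++)
                    pages[i] = pfn_to_page((start >> PAGE_SHIFT) + i);
            /* VM_IOREMAP makes vread() zero-fill this region in
             * /proc/kcore instead of kmap()ing pages that have no
             * lowmem mapping. */
            vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot);
            kfree(pages);
            return vaddr;
    }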
Cc: Brian Geffon Cc: Mike Rapoport Cc: Andrew Morton Fixes: 404a6043385d ("staging: android: persistent_ram: handle reserving and mapping memory") Signed-off-by: Stephen Boyd Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221205233136.3420802-1-swboyd@chromium.org --- fs/pstore/ram_core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 97dde525150a..966191d3a5ba 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -440,7 +440,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size, phys_addr_t addr = page_start + i * PAGE_SIZE; pages[i] = pfn_to_page(addr >> PAGE_SHIFT); } - vaddr = vmap(pages, page_count, VM_MAP, prot); + /* + * VM_IOREMAP used here to bypass this region during vread() + * and kmap_atomic() (i.e. kcore) to avoid __va() failures. + */ + vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot); kfree(pages); /* -- cgit From 6f1c1d95dc93b52a8ef9cc1f3f610c2d5e6b217b Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Fri, 23 Sep 2022 13:40:14 +0800 Subject: NFS: make sure open context mode has FMODE_EXEC when a file is opened for exec Because the file's f_mode never has FMODE_EXEC set, the open context mode won't get FMODE_EXEC from it. The open context mode only cares about FMODE_READ/FMODE_WRITE/FMODE_EXEC, and all of that information can be derived from the file's f_flags, so convert f_flags to the open context mode with flags_to_mode(). Signed-off-by: ChenXiaoSong Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 ++- fs/nfs/nfs4file.c | 12 ++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6b2cfa59a1a2..e98ee7599eeb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1168,7 +1168,8 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), + flags_to_mode(filp->f_flags), filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 9eb181287879..2563ed8580f3 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -32,7 +32,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; - fmode_t f_mode; struct iattr attr; int err; @@ -51,17 +50,14 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (err) return err; - f_mode = filp->f_mode; - if ((openflags & O_ACCMODE) == 3) - f_mode |= flags_to_mode(openflags); - /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), + flags_to_mode(openflags), filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -366,8 +362,8 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out_free_name; } - ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode, - filep); + ctx = alloc_nfs_open_context(filep->f_path.dentry, + flags_to_mode(filep->f_flags), filep); if (IS_ERR(ctx)) { res = ERR_CAST(ctx); goto out_filep; -- cgit From d564d2c4c2445cb0972453933dc87c2dcaac8597 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Fri, 23 Sep 2022 13:40:15 +0800 Subject: NFSv4: check FMODE_EXEC from open context mode in
nfs4_opendata_access() After converting the file's f_flags to the open context mode with flags_to_mode(), the open context mode will have FMODE_EXEC when a file is opened for exec, so we can check FMODE_EXEC from the open context mode. No functional change, just simplify the code. Signed-off-by: ChenXiaoSong Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e51044a5f550..44f1dcb6020f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2630,8 +2630,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) */ static int nfs4_opendata_access(const struct cred *cred, struct nfs4_opendata *opendata, - struct nfs4_state *state, fmode_t fmode, - int openflags) + struct nfs4_state *state, fmode_t fmode) { struct nfs_access_entry cache; u32 mask, flags; @@ -2642,11 +2641,7 @@ static int nfs4_opendata_access(const struct cred *cred, return 0; mask = 0; - /* - * Use openflags to check for exec, because fmode won't - * always have FMODE_EXEC set when file open for exec. - */ - if (openflags & __FMODE_EXEC) { + if (fmode & FMODE_EXEC) { /* ONLY check for exec rights */ if (S_ISDIR(state->inode->i_mode)) mask = NFS4_ACCESS_LOOKUP; @@ -3034,7 +3029,7 @@ static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata, } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, - int flags, struct nfs_open_context *ctx) + struct nfs_open_context *ctx) { struct nfs4_state_owner *sp = opendata->owner; struct nfs_server *server = sp->so_server; @@ -3095,8 +3090,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, /* Parse layoutget results before we check for access */ pnfs_parse_lgopen(state->inode, opendata->lgp, ctx); - ret = nfs4_opendata_access(sp->so_cred, opendata, state, - acc_mode, flags); + ret = nfs4_opendata_access(sp->so_cred, opendata, state, acc_mode); if (ret != 0) goto out; @@ -3170,7 +3164,7 @@ static int _nfs4_do_open(struct inode *dir, if (d_really_is_positive(dentry)) opendata->state = nfs4_get_open_state(d_inode(dentry), sp); - status = _nfs4_open_and_get_state(opendata, flags, ctx); + status = _nfs4_open_and_get_state(opendata, ctx); if (status != 0) goto err_opendata_put; state = ctx->state; -- cgit From 5559405df652008e56eee88872126fe4c451da67 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Mon, 24 Oct 2022 00:39:45 +0800 Subject: nfs: fix possible null-ptr-deref when parsing param According to commit "vfs: parse: deal with zero length string value", the kernel sets param->string to a null pointer in vfs_parse_fs_string() if the fs string has zero length. The problem is that nfs_fs_context_parse_param() dereferences param->string without checking whether it is a null pointer, which may trigger a null-ptr-deref bug. This patch solves it by adding a sanity check on param->string in nfs_fs_context_parse_param().
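The problematic case is simply an option with an empty value, e.g. a mount passing "vers=": vfs_parse_fs_string() stores NULL for the zero-length value before the filesystem parser runs. Condensed, one of the guarded switch arms then looks like this (a sketch of the fix below, error handling after the parse call trimmed):

    case Opt_vers:
            /* "vers=" reaches here with param->string == NULL, since
             * vfs_parse_fs_string() stores NULL for a zero-length
             * value; bail out before nfs_parse_version_string()
             * dereferences it. */
            if (!param->string)
                    goto out_invalid_value;
            trace_nfs_mount_assign(param->key, param->string);
            ret = nfs_parse_version_string(fc, param->string);
            break;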
Signed-off-by: Hawkins Jiawei Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 09833ec102fc..9bcd53d5c7d4 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -684,6 +684,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, return ret; break; case Opt_vers: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); ret = nfs_parse_version_string(fc, param->string); if (ret < 0) @@ -696,6 +698,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_proto: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); protofamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { @@ -732,6 +736,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_mountproto: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); mountfamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { -- cgit From ef8d98f20dfc79777d2b66a30926533225dc6efa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Oct 2022 10:26:51 +1100 Subject: NFS: avoid spurious warning of lost lock that is being unlocked. When the NFSv4 state manager recovers state after a server restart, it reports that locks have been lost if it finds any lock state for which recovery hasn't been successful. i.e. any for which NFS_LOCK_INITIALIZED is not set. However it only tries to recover locks that are still linked to inode->i_flctx. So if a lock has been removed from inode->i_flctx, but the state for that lock has not yet been destroyed, then a spurious warning results. nfs4_proc_unlck() calls locks_lock_inode_wait() - which removes the lock from ->i_flctx - before sending the unlock request to the server and before the final nfs4_put_lock_state() is called. This allows a window in which a spurious warning can be produced. So add a new flag NFS_LOCK_UNLOCKING which is set once the decision has been made to unlock the lock. This will prevent it from triggering any warning. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 3 ++- fs/nfs/nfs4state.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index cfef738d765e..5edd1704f735 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -149,6 +149,7 @@ struct nfs4_lock_state { struct nfs4_state * ls_state; /* Pointer to open state */ #define NFS_LOCK_INITIALIZED 0 #define NFS_LOCK_LOST 1 +#define NFS_LOCK_UNLOCKING 2 unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 44f1dcb6020f..40d749f29ed3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7023,12 +7023,13 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_unlock(&sp->so_delegreturn_mutex); goto out; } + lsp = request->fl_u.nfs4_fl.owner; + set_bit(NFS_LOCK_UNLOCKING, &lsp->ls_flags); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? 
*/ - lsp = request->fl_u.nfs4_fl.owner; if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) goto out; alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a2d2d5d1b088..7c1f43507813 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1619,7 +1619,8 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { trace_nfs4_state_lock_reclaim(state, lock); - if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags) && + !test_bit(NFS_LOCK_UNLOCKING, &lock->ls_flags)) *lost_locks += 1; } spin_unlock(&state->state_lock); -- cgit From 36357fe74ef736524a29fbd3952948768510a8b9 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 30 Nov 2022 13:15:25 -0500 Subject: NFSv4.2: Set the correct size scratch buffer for decoding READ_PLUS The scratch_buf array is 16 bytes, but I was passing 32 to the xdr_set_scratch_buffer() function. Fix this by using sizeof(), which is what I probably should have been doing this whole time. Fixes: d3b00a802c84 ("NFS: Replace the READ_PLUS decoding code") Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index fe1aeb0f048f..2fd465cab631 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1142,7 +1142,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (!segs) return -ENOMEM; - xdr_set_scratch_buffer(xdr, &scratch_buf, 32); + xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf)); status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); -- cgit From f8527028a7e52da884055c401abc04e0b0c84285 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 30 Nov 2022 13:15:26 -0500 Subject: NFSv4.2: Fix up READ_PLUS alignment Assume that the first segment will be a DATA segment, and place the data directly into the xdr pages so it doesn't need to be shifted. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 2fd465cab631..d80ee88ca996 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -47,13 +47,14 @@ #define decode_deallocate_maxsz (op_decode_hdr_maxsz) #define encode_read_plus_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \ +#define NFS42_READ_PLUS_DATA_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ 2 /* data_info4.di_offset */ + \ - 2 /* data_info4.di_length */) + 1 /* data_info4.di_length */) #define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ 1 /* rpr_eof */ + \ 1 /* rpr_contents count */ + \ - 2 * NFS42_READ_PLUS_SEGMENT_SIZE) + NFS42_READ_PLUS_DATA_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ -- cgit From a60214c2465493aac0b014d87ee19327b6204c42 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 30 Nov 2022 15:30:47 -0500 Subject: NFS: Allow very small rsize & wsize again 940261a19508 introduced nfs_io_size() to clamp the iosize to a multiple of PAGE_SIZE. 
This had the unintended side effect of no longer allowing iosizes smaller than a page, which can be useful in some situations. UDP already has an exception that causes it to fall back on the power-of-two style sizes instead. This patch adds an additional exception for very small iosizes. Reported-by: Jeff Layton Fixes: 940261a19508 ("NFS: Allow setting rsize / wsize to a multiple of PAGE_SIZE") Signed-off-by: Anna Schumaker Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 647fc3f547cb..ae7d4a8c728c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -739,12 +739,10 @@ unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) iosize = NFS_DEF_FILE_IO_SIZE; else if (iosize >= NFS_MAX_FILE_IO_SIZE) iosize = NFS_MAX_FILE_IO_SIZE; - else - iosize = iosize & PAGE_MASK; - if (proto == XPRT_TRANSPORT_UDP) + if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE) return nfs_block_bits(iosize, NULL); - return iosize; + return iosize & PAGE_MASK; } /* -- cgit From 700fa9b1b361760ca6c28e2e7d6cccbdc9fb6716 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Mon, 5 Dec 2022 17:13:45 +0800 Subject: NFS: use sysfs_emit() instead of scnprintf() Follow the advice of Documentation/filesystems/sysfs.rst: show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Signed-off-by: Trond Myklebust --- fs/nfs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 88a23af2bd5c..b0ef7e7ddb30 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -354,7 +354,7 @@ static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp) num = (num + (HZ - 1)) / HZ; } else num = -1; - return scnprintf(buffer, PAGE_SIZE, "%li\n", num); + return sysfs_emit(buffer, "%li\n", num); } static const struct kernel_param_ops param_ops_nfs_timeout = { -- cgit From 19cdc8fa5b9b768f5753eefe3da32fcbafcb0d18 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Mon, 5 Dec 2022 17:19:16 +0800 Subject: fs: nfs: sysfs: use sysfs_emit() instead of scnprintf() Follow the advice of Documentation/filesystems/sysfs.rst: show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Signed-off-by: Trond Myklebust --- fs/nfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index a6f740366963..26f8ece2a997 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -82,7 +82,7 @@ static ssize_t nfs_netns_identifier_show(struct kobject *kobj, ssize_t ret; rcu_read_lock(); - ret = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(c->identifier)); + ret = sysfs_emit(buf, "%s\n", rcu_dereference(c->identifier)); rcu_read_unlock(); return ret; } -- cgit From b4e4f66901658fae0614dea5bf91062a5387eda7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Dec 2022 12:42:59 -0500 Subject: NFSv4.x: Fail client initialisation if state manager thread can't run If the state manager thread fails to start, then we should just mark the client initialisation as failed so that other processes or threads don't get stuck in nfs_wait_client_init_complete().
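The general shape of the fix, as a self-contained sketch (generic names: 'struct init_state' and 'start_manager' are illustrative; the real code records the error with nfs_mark_client_ready() and uses the NFS client's own wait machinery rather than a completion):

    #include <linux/completion.h>
    #include <linux/err.h>
    #include <linux/kthread.h>

    struct init_state {
            struct completion done;        /* init_completion() done by the caller */
            int error;
    };

    static int start_manager(struct init_state *st, int (*fn)(void *), void *data)
    {
            struct task_struct *task = kthread_run(fn, data, "manager");

            if (IS_ERR(task)) {
                    /* Record the error and complete initialisation anyway,
                     * so waiters wake up instead of sleeping forever for a
                     * thread that never ran. */
                    st->error = PTR_ERR(task);
                    complete_all(&st->done);
                    return st->error;
            }
            return 0;
    }

Waiters do wait_for_completion(&st->done) and then check st->error, mirroring what nfs_wait_client_init_complete() does with the client's init state.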
Reported-by: ChenXiaoSong Fixes: 4697bd5e9419 ("NFSv4: Fix a race in the net namespace mount notification") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7c1f43507813..5720196141e1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1230,6 +1230,8 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) if (IS_ERR(task)) { printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); + if (!nfs_client_init_is_complete(clp)) + nfs_mark_client_ready(clp, PTR_ERR(task)); nfs4_clear_state_manager_bit(clp); clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); -- cgit From ce529cc25b184e93397b94a8a322128fc0095cbb Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Wed, 30 Nov 2022 14:04:55 +0800 Subject: erofs: enable large folios for iomap mode Enable large folios for iomap mode. Then the readahead routine will pass down large folios containing multiple pages. Let's enable this for the non-compressed format for now, until the compression part supports large folios later. When large folios are supported, the iomap routine will allocate an iomap_page for each large folio, and thus we need iomap_release_folio() and iomap_invalidate_folio() to free the iomap_page when these folios get reclaimed or invalidated. Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221130060455.44532-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 2 ++ fs/erofs/inode.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fe8ac0e163f7..c9526c627dda 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -403,6 +403,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index ad2a82f2eb4c..e457b8a59ee7 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,6 +295,8 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + if (!erofs_is_fscache_mode(inode->i_sb)) + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; -- cgit From 27f2a2dcc6261406b509b5022a1e5c23bf622830 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 25 Nov 2022 19:08:22 +0800 Subject: erofs: check the uniqueness of fsid in shared domain in advance When shared domain is enabled, mounting twice with the same fsid and domain_id will trigger a sysfs warning as shown below: sysfs: cannot create duplicate filename '/fs/erofs/d0,meta.bin' CPU: 15 PID: 1051 Comm: mount Not tainted 6.1.0-rc6+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl+0x38/0x49 dump_stack+0x10/0x12 sysfs_warn_dup.cold+0x17/0x27 sysfs_create_dir_ns+0xb8/0xd0 kobject_add_internal+0xb1/0x240 kobject_init_and_add+0x71/0xa0 erofs_register_sysfs+0x89/0x110 erofs_fc_fill_super+0x98c/0xaf0 vfs_get_super+0x7d/0x100 get_tree_nodev+0x16/0x20 erofs_fc_get_tree+0x20/0x30 vfs_get_tree+0x24/0xb0 path_mount+0x2fa/0xa90 do_mount+0x7c/0xa0 __x64_sys_mount+0x8b/0xe0 do_syscall_64+0x30/0x60 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The reason is
erofs_fscache_register_cookie() doesn't guarantee the primary data blob (aka fsid) is unique in the shared domain, and erofs_register_sysfs(), invoked by the second mount, will fail due to the duplicated fsid in the shared domain and report a warning. It would be better to check the uniqueness of fsid before doing erofs_register_sysfs(), so add a new flags parameter to erofs_fscache_register_cookie() and do the uniqueness check when EROFS_REG_COOKIE_NEED_NOEXIST is set. After the patch, the error in dmesg for the duplicated mount would be: erofs: ...: erofs_domain_register_cookie: XX already exists in domain YY Reviewed-by: Jia Zhu Reviewed-by: Jingbo Xu Reviewed-by: Chao Yu Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20221125110822.3812942-1-houtao@huaweicloud.com Fixes: 7d41963759fe ("erofs: Support sharing cookies in the same domain") Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 47 +++++++++++++++++++++++++++++------------------ fs/erofs/internal.h | 10 ++++++++-- fs/erofs/super.c | 2 +- 3 files changed, 44 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index af5ed6b9c54d..6a792a513d6b 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -494,7 +494,8 @@ static int erofs_fscache_register_domain(struct super_block *sb) static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -516,7 +517,7 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, fscache_use_cookie(cookie, false); ctx->cookie = cookie; - if (need_inode) { + if (flags & EROFS_REG_COOKIE_NEED_INODE) { struct inode *const inode = new_inode(sb); if (!inode) { @@ -554,14 +555,15 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) static struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { int err; struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; - ctx = erofs_fscache_acquire_cookie(sb, name, need_inode); + ctx = erofs_fscache_acquire_cookie(sb, name, flags); if (IS_ERR(ctx)) return ctx; @@ -589,7 +591,8 @@ out: static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { struct inode *inode; struct erofs_fscache *ctx; @@ -602,23 +605,30 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, ctx = inode->i_private; if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) continue; - igrab(inode); + if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { + igrab(inode); + } else { + erofs_err(sb, "%s already exists in domain %s", name, + domain->domain_id); + ctx = ERR_PTR(-EEXIST); + } spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); + ctx = erofs_fscache_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { if (EROFS_SB(sb)->domain_id) - return erofs_domain_register_cookie(sb, name, need_inode); - return erofs_fscache_acquire_cookie(sb, name, need_inode);
+ return erofs_domain_register_cookie(sb, name, flags); + return erofs_fscache_acquire_cookie(sb, name, flags); } void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) @@ -647,6 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; + unsigned int flags; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -655,8 +666,20 @@ int erofs_fscache_register_fs(struct super_block *sb) if (ret) return ret; - /* acquired domain/volume will be relinquished in kill_sb() on error */ - fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true); + /* + * When shared domain is enabled, using NEED_NOEXIST to guarantee + * the primary data blob (aka fsid) is unique in the shared domain. + * + * For non-shared-domain case, fscache_acquire_volume() invoked by + * erofs_fscache_register_volume() has already guaranteed + * the uniqueness of primary data blob. + * + * Acquired domain/volume will be relinquished in kill_sb() on error. + */ + flags = EROFS_REG_COOKIE_NEED_INODE; + if (sbi->domain_id) + flags |= EROFS_REG_COOKIE_NEED_NOEXIST; + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); if (IS_ERR(fscache)) return PTR_ERR(fscache); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 05dc68627722..e51f27b6bde1 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -604,13 +604,18 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_NEED_INODE 1 +#define EROFS_REG_COOKIE_NEED_NOEXIST 2 + /* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode); + char *name, + unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); extern const struct address_space_operations erofs_fscache_access_aops; @@ -623,7 +628,8 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1c7dcca702b3..481788c24a68 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -245,7 +245,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, } if (erofs_is_fscache_mode(sb)) { - fscache = erofs_fscache_register_cookie(sb, dif->path, false); + fscache = erofs_fscache_register_cookie(sb, dif->path, 0); if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; -- cgit From 1282dea37b09087b8aec59f0774572c16b52276a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 6 Dec 2022 14:03:52 +0800 Subject: erofs: clean up cached I/O strategies After commit 4c7e42552b3a ("erofs: remove useless cache strategy of DELAYEDALLOC"), only one cached I/O allocation strategy is supported: When cached I/O is preferred, page allocation is applied without direct reclaim. If allocation fails, fall back to inplace I/O. Let's get rid of z_erofs_cache_alloctype. No logical changes. 
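Condensed, the new policy predicate from the patch reads as below (same logic as the hunk above, with comments added):

    static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
    {
            unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;

            /* caching disabled: never allocate cache pages */
            if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
                    return false;

            /* the backmost pcluster is always worth caching */
            if (fe->backmost)
                    return true;

            /* READAROUND additionally caches pclusters before the read head */
            return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
                   fe->map.m_la < fe->headoffset;
    }

If the predicate is true but erofs_allocpage() fails (no direct reclaim is attempted), the code simply falls back to inplace I/O, which is the one remaining strategy described above.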
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Yue Hu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221206060352.152830-1-xiang@kernel.org --- fs/erofs/zdata.c | 77 +++++++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b792d424d774..b66c16473273 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -175,16 +175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* how to allocate cached pages for a pcluster */ -enum z_erofs_cache_alloctype { - DONTALLOC, /* don't allocate any cached pages */ - /* - * try to use cached I/O if page allocation succeeds or fallback - * to in-place I/O instead to avoid any direct reclaim. - */ - TRYALLOC, -}; - /* * tagged pointer with 1-bit tag for all compressed pages * tag 0 - the page is just found with an extra page reference @@ -292,12 +282,29 @@ struct z_erofs_decompress_frontend { .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } +static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +{ + unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; + + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + fe->map.m_la < fe->headoffset) + return true; + + return false; +} + static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - enum z_erofs_cache_alloctype type, struct page **pagepool) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* * optimistic allocation without direct reclaim since inplace I/O @@ -326,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } else { /* I/O is needed, no possible to decompress directly */ standalone = false; - switch (type) { - case TRYALLOC: - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) - continue; - set_page_private(newpage, - Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); - break; - default: /* DONTALLOC */ + if (!shouldalloc) continue; - } + + /* + * try to use cached I/O if page allocation + * succeeds or fallback to in-place I/O instead + * to avoid any direct reclaim. 
+ */ + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); } if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, @@ -637,20 +645,6 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, - unsigned int cachestrategy, - erofs_off_t la) -{ - if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) - return false; - - if (fe->backmost) - return true; - - return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && - la < fe->headoffset; -} - static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, struct page *page, unsigned int pageofs, unsigned int len) @@ -687,12 +681,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - - enum z_erofs_cache_alloctype cache_strategy; unsigned int cur, end, spiltted; int err = 0; @@ -746,13 +737,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, - map->m_la)) - cache_strategy = TRYALLOC; - else - cache_strategy = DONTALLOC; - - z_erofs_bind_cache(fe, cache_strategy, pagepool); + z_erofs_bind_cache(fe, pagepool); } hitted: /* -- cgit From 8669247524c73e16e4d3384c4ff882e5c5d06194 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 24 Nov 2022 11:42:11 +0800 Subject: fscache,cachefiles: add prepare_ondemand_read() callback Add prepare_ondemand_read() callback dedicated for the on-demand read scenario, so that callers from this scenario can be decoupled from netfs_io_subrequest. The original cachefiles_prepare_read() is now refactored to a generic routine accepting a parameter list instead of netfs_io_subrequest. There's no logic change, except that the debug id of subrequest and request is removed from trace_cachefiles_prep_read(). Reviewed-by: Jeff Layton Signed-off-by: Jingbo Xu Acked-by: David Howells Link: https://lore.kernel.org/r/20221124034212.81892-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/cachefiles/io.c | 77 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 000a28f46e59..175a25fcade8 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres, term_func, term_func_priv); } -/* - * Prepare a read operation, shortening it to a cached/uncached - * boundary as appropriate. 
- */ -static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, - loff_t i_size) +static inline enum netfs_io_source +cachefiles_do_prepare_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t netfs_ino) { enum cachefiles_prepare_read_trace why; - struct netfs_io_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct cachefiles_object *object; + struct cachefiles_object *object = NULL; struct cachefiles_cache *cache; struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; struct file *file = cachefiles_cres_file(cres); enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; + size_t len = *_len; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; int rc; - _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); + _enter("%zx @%llx/%llx", len, start, i_size); - if (subreq->start >= i_size) { + if (start >= i_size) { ret = NETFS_FILL_WITH_ZEROES; why = cachefiles_trace_read_after_eof; goto out_no_object; } if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); why = cachefiles_trace_read_no_data; - if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) + if (!test_bit(NETFS_SREQ_ONDEMAND, _flags)) goto out_no_object; } @@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * retry: off = cachefiles_inject_read_error(); if (off == 0) - off = vfs_llseek(file, subreq->start, SEEK_DATA); + off = vfs_llseek(file, start, SEEK_DATA); if (off < 0 && off >= (loff_t)-MAX_ERRNO) { if (off == (loff_t)-ENXIO) { why = cachefiles_trace_read_seek_nxio; @@ -449,21 +446,22 @@ retry: goto out; } - if (off >= subreq->start + subreq->len) { + if (off >= start + len) { why = cachefiles_trace_read_found_hole; goto download_and_store; } - if (off > subreq->start) { + if (off > start) { off = round_up(off, cache->bsize); - subreq->len = off - subreq->start; + len = off - start; + *_len = len; why = cachefiles_trace_read_found_part; goto download_and_store; } to = cachefiles_inject_read_error(); if (to == 0) - to = vfs_llseek(file, subreq->start, SEEK_HOLE); + to = vfs_llseek(file, start, SEEK_HOLE); if (to < 0 && to >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), to, cachefiles_trace_seek_error); @@ -471,12 +469,13 @@ retry: goto out; } - if (to < subreq->start + subreq->len) { - if (subreq->start + subreq->len >= i_size) + if (to < start + len) { + if (start + len >= i_size) to = round_up(to, cache->bsize); else to = round_down(to, cache->bsize); - subreq->len = to - subreq->start; + len = to - start; + *_len = len; } why = cachefiles_trace_read_have_data; @@ -484,12 +483,11 @@ retry: goto out; download_and_store: - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) { - rc = cachefiles_ondemand_read(object, subreq->start, - subreq->len); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); + if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) { + rc = cachefiles_ondemand_read(object, start, len); if (!rc) { - __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags); + __clear_bit(NETFS_SREQ_ONDEMAND, _flags); goto retry; } ret = NETFS_INVALID_READ; @@ -497,10 +495,34 @@ download_and_store: out: cachefiles_end_secure(cache, saved_cred); out_no_object: - trace_cachefiles_prep_read(subreq, ret, why, ino); + 
trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino); return ret; } +/* + * Prepare a read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + return cachefiles_do_prepare_read(&subreq->rreq->cache_resources, + subreq->start, &subreq->len, i_size, + &subreq->flags, subreq->rreq->inode->i_ino); +} + +/* + * Prepare an on-demand read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source +cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t ino) +{ + return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino); +} + /* * Prepare for a write to occur. */ @@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .prepare_ondemand_read = cachefiles_prepare_ondemand_read, .query_occupancy = cachefiles_query_occupancy, }; -- cgit From 709fe09e281776b5e024fb5934c0485a866b7468 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 24 Nov 2022 11:42:12 +0800 Subject: erofs: switch to prepare_ondemand_read() in fscache mode Switch to prepare_ondemand_read() interface and a self-contained request completion to get rid of netfs_io_[request|subrequest]. The whole request will still be split into slices (subrequest) according to the cache state of the backing file. As long as one of the subrequests fails, the whole request will be marked as failed. Reviewed-by: Gao Xiang Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221124034212.81892-3-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 261 +++++++++++++++++++---------------------------------- 1 file changed, 94 insertions(+), 167 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 6a792a513d6b..3e794891cd91 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -11,257 +11,180 @@ static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); static struct vfsmount *erofs_pseudo_mnt; -static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, +struct erofs_fscache_request { + struct netfs_cache_resources cache_resources; + struct address_space *mapping; /* The mapping being accessed */ + loff_t start; /* Start position */ + size_t len; /* Length of the request */ + size_t submitted; /* Length of submitted */ + short error; /* 0 or error that occurred */ + refcount_t ref; +}; + +static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping, loff_t start, size_t len) { - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); - if (!rreq) + req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL); + if (!req) return ERR_PTR(-ENOMEM); - rreq->start = start; - rreq->len = len; - rreq->mapping = mapping; - rreq->inode = mapping->host; - INIT_LIST_HEAD(&rreq->subrequests); - refcount_set(&rreq->ref, 1); - return rreq; -} - -static void erofs_fscache_put_request(struct netfs_io_request *rreq) -{ - if (!refcount_dec_and_test(&rreq->ref)) - return; - if (rreq->cache_resources.ops) - 
rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); -} - -static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq) -{ - if (!refcount_dec_and_test(&subreq->ref)) - return; - erofs_fscache_put_request(subreq->rreq); - kfree(subreq); -} + req->mapping = mapping; + req->start = start; + req->len = len; + refcount_set(&req->ref, 1); -static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - list_del(&subreq->rreq_link); - erofs_fscache_put_subrequest(subreq); - } + return req; } -static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq) +static void erofs_fscache_req_complete(struct erofs_fscache_request *req) { - struct netfs_io_subrequest *subreq; struct folio *folio; - unsigned int iopos = 0; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - bool subreq_failed = false; + bool failed = req->error; + pgoff_t start_page = req->start / PAGE_SIZE; + pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1; - XA_STATE(xas, &rreq->mapping->i_pages, start_page); - - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - subreq_failed = (subreq->error < 0); + XA_STATE(xas, &req->mapping->i_pages, start_page); rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos, pgend; - bool pg_failed = false; - if (xas_retry(&xas, folio)) continue; - - pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - pgend = pgpos + folio_size(folio); - - for (;;) { - if (!subreq) { - pg_failed = true; - break; - } - - pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) - break; - - iopos += subreq->len; - if (!list_is_last(&subreq->rreq_link, - &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - if (pgend == iopos) - break; - } - - if (!pg_failed) + if (!failed) folio_mark_uptodate(folio); - folio_unlock(folio); } rcu_read_unlock(); + + if (req->cache_resources.ops) + req->cache_resources.ops->end_operation(&req->cache_resources); + + kfree(req); } -static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq) +static void erofs_fscache_req_put(struct erofs_fscache_request *req) { - erofs_fscache_rreq_unlock_folios(rreq); - erofs_fscache_clear_subrequests(rreq); - erofs_fscache_put_request(rreq); + if (refcount_dec_and_test(&req->ref)) + erofs_fscache_req_complete(req); } -static void erofc_fscache_subreq_complete(void *priv, +static void erofs_fscache_subreq_complete(void *priv, ssize_t transferred_or_error, bool was_async) { - struct netfs_io_subrequest *subreq = priv; - struct netfs_io_request *rreq = subreq->rreq; + struct erofs_fscache_request *req = priv; if (IS_ERR_VALUE(transferred_or_error)) - subreq->error = transferred_or_error; - - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); - - erofs_fscache_put_subrequest(subreq); + req->error = transferred_or_error; + erofs_fscache_req_put(req); } /* - * Read data from fscache and fill the read data into page cache described by - * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes - * the start physical address in the cache file. 
+ * Read data from fscache (cookie, pstart, len), and fill the read data into + * page cache described by (req->mapping, lstart, len). @pstart describes the + * start physical address in the cache file. */ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, - struct netfs_io_request *rreq, loff_t pstart) + struct erofs_fscache_request *req, loff_t pstart, size_t len) { enum netfs_io_source source; - struct super_block *sb = rreq->mapping->host->i_sb; - struct netfs_io_subrequest *subreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; + struct super_block *sb = req->mapping->host->i_sb; + struct netfs_cache_resources *cres = &req->cache_resources; struct iov_iter iter; - loff_t start = rreq->start; - size_t len = rreq->len; + loff_t lstart = req->start + req->submitted; size_t done = 0; int ret; - atomic_set(&rreq->nr_outstanding, 1); + DBG_BUGON(len > req->len - req->submitted); ret = fscache_begin_read_operation(cres, cookie); if (ret) - goto out; + return ret; while (done < len) { - subreq = kzalloc(sizeof(struct netfs_io_subrequest), - GFP_KERNEL); - if (subreq) { - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->ref, 2); - subreq->rreq = rreq; - refcount_inc(&rreq->ref); - } else { - ret = -ENOMEM; - goto out; - } - - subreq->start = pstart + done; - subreq->len = len - done; - subreq->flags = 1 << NETFS_SREQ_ONDEMAND; + loff_t sstart = pstart + done; + size_t slen = len - done; + unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - source = cres->ops->prepare_read(subreq, LLONG_MAX); - if (WARN_ON(subreq->len == 0)) + source = cres->ops->prepare_ondemand_read(cres, + sstart, &slen, LLONG_MAX, &flags, 0); + if (WARN_ON(slen == 0)) source = NETFS_INVALID_READ; if (source != NETFS_READ_FROM_CACHE) { - erofs_err(sb, "failed to fscache prepare_read (source %d)", - source); - ret = -EIO; - subreq->error = ret; - erofs_fscache_put_subrequest(subreq); - goto out; + erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + return -EIO; } - atomic_inc(&rreq->nr_outstanding); + refcount_inc(&req->ref); + iov_iter_xarray(&iter, READ, &req->mapping->i_pages, + lstart + done, slen); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, - start + done, subreq->len); - - ret = fscache_read(cres, subreq->start, &iter, - NETFS_READ_HOLE_FAIL, - erofc_fscache_subreq_complete, subreq); + ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, + erofs_fscache_subreq_complete, req); if (ret == -EIOCBQUEUED) ret = 0; if (ret) { erofs_err(sb, "failed to fscache_read (ret %d)", ret); - goto out; + return ret; } - done += subreq->len; + done += slen; } -out: - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); - - return ret; + DBG_BUGON(done != len); + req->submitted += len; + return 0; } static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_dev mdev = { .m_deviceid = 0, .m_pa = folio_pos(folio), }; ret = erofs_map_dev(sb, &mdev); - if (ret) - goto out; + if (ret) { + folio_unlock(folio); + return ret; + } - rreq = erofs_fscache_alloc_request(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) { - ret = PTR_ERR(rreq); - goto out; + if (IS_ERR(req)) { + folio_unlock(folio); + return
PTR_ERR(req); } - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa); -out: - folio_unlock(folio); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa, folio_size(folio)); + if (ret) + req->error = ret; + + erofs_fscache_req_put(req); return ret; } /* * Read into page cache in the range described by (@pos, @len). * - * On return, the caller is responsible for page unlocking if the output @unlock - * is true, or the callee will take this responsibility through netfs_io_request - * interface. + * On return, if the output @unlock is true, the caller is responsible for page + * unlocking; otherwise the callee will take this responsibility through request + * completion. * * The return value is the number of bytes successfully handled, or negative * error code on failure. The only exception is that, the length of the range - * instead of the error code is returned on failure after netfs_io_request is - * allocated, so that .readahead() could advance rac accordingly. + * instead of the error code is returned on failure after request is allocated, + * so that .readahead() could advance rac accordingly. */ static int erofs_fscache_data_read(struct address_space *mapping, loff_t pos, size_t len, bool *unlock) { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_blocks map; struct erofs_map_dev mdev; struct iov_iter iter; @@ -318,13 +241,17 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (ret) return ret; - rreq = erofs_fscache_alloc_request(mapping, pos, count); - if (IS_ERR(rreq)) - return PTR_ERR(rreq); + req = erofs_fscache_req_alloc(mapping, pos, count); + if (IS_ERR(req)) + return PTR_ERR(req); *unlock = false; - erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa + (pos - map.m_la)); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa + (pos - map.m_la), count); + if (ret) + req->error = ret; + + erofs_fscache_req_put(req); return count; } -- cgit From be62c5198861156d77b60babb89e70e21c73eb7b Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 1 Dec 2022 15:42:55 +0800 Subject: erofs: support large folios for fscache mode When large folios are supported, one folio can be split into several slices, each of which may be mapped to META/UNMAPPED/MAPPED, and the folio can be unlocked as a whole only when all slices have completed. Thus always allocate erofs_fscache_request for each .read_folio() or .readahead(), in which case the allocated request is responsible for unlocking folios when all slices have completed. As described above, each folio or folio range can be mapped into several slices, while these slices may be mapped to different cookies, and thus each slice needs its own netfs_cache_resources. Here we introduce chained requests to support this, where each .read_folio() or .readahead() call can correspond to multiple requests. Each request has its own netfs_cache_resources and thus is used to access one cookie. Among these requests, there's a primary request, with the others pointing to the primary request.
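The chaining reduces to a parent/child refcount pattern; stripped down, the put path looks like this (field names follow the patch; releasing the per-request cache resources is omitted here):

    static void erofs_fscache_req_put(struct erofs_fscache_request *req)
    {
            if (!refcount_dec_and_test(&req->ref))
                    return;
            if (!req->primary)
                    erofs_fscache_req_complete(req);     /* unlock folios once */
            else
                    erofs_fscache_req_put(req->primary); /* drop pin on primary */
            kfree(req);
    }

Every secondary request holds a reference on the primary, so the folios covered by the whole read are unlocked exactly once, when the last slice drops the final reference.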
Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221201074256.16639-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 148 +++++++++++++++++++++++++++++------------------------ 1 file changed, 80 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 3e794891cd91..f14886c479bd 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -12,6 +12,7 @@ static LIST_HEAD(erofs_domain_list); static struct vfsmount *erofs_pseudo_mnt; struct erofs_fscache_request { + struct erofs_fscache_request *primary; struct netfs_cache_resources cache_resources; struct address_space *mapping; /* The mapping being accessed */ loff_t start; /* Start position */ @@ -38,6 +39,26 @@ static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_spac return req; } +static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary, + size_t len) +{ + struct erofs_fscache_request *req; + + /* use primary request for the first submission */ + if (!primary->submitted) { + refcount_inc(&primary->ref); + return primary; + } + + req = erofs_fscache_req_alloc(primary->mapping, + primary->start + primary->submitted, len); + if (!IS_ERR(req)) { + req->primary = primary; + refcount_inc(&primary->ref); + } + return req; +} + static void erofs_fscache_req_complete(struct erofs_fscache_request *req) { struct folio *folio; @@ -56,17 +77,19 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req) folio_unlock(folio); } rcu_read_unlock(); - - if (req->cache_resources.ops) - req->cache_resources.ops->end_operation(&req->cache_resources); - - kfree(req); } static void erofs_fscache_req_put(struct erofs_fscache_request *req) { - if (refcount_dec_and_test(&req->ref)) - erofs_fscache_req_complete(req); + if (refcount_dec_and_test(&req->ref)) { + if (req->cache_resources.ops) + req->cache_resources.ops->end_operation(&req->cache_resources); + if (!req->primary) + erofs_fscache_req_complete(req); + else + erofs_fscache_req_put(req->primary); + kfree(req); + } } static void erofs_fscache_subreq_complete(void *priv, @@ -74,8 +97,12 @@ static void erofs_fscache_subreq_complete(void *priv, { struct erofs_fscache_request *req = priv; - if (IS_ERR_VALUE(transferred_or_error)) - req->error = transferred_or_error; + if (IS_ERR_VALUE(transferred_or_error)) { + if (req->primary) + req->primary->error = transferred_or_error; + else + req->error = transferred_or_error; + } erofs_fscache_req_put(req); } @@ -131,7 +158,6 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, done += slen; } DBG_BUGON(done != len); - req->submitted += len; return 0; } @@ -167,32 +193,19 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) return ret; } -/* - * Read into page cache in the range described by (@pos, @len). - * - * On return, if the output @unlock is true, the caller is responsible for page - * unlocking; otherwise the callee will take this responsibility through request - * completion. - * - * The return value is the number of bytes successfully handled, or negative - * error code on failure. The only exception is that, the length of the range - * instead of the error code is returned on failure after request is allocated, - * so that .readahead() could advance rac accordingly. 
- */ -static int erofs_fscache_data_read(struct address_space *mapping, - loff_t pos, size_t len, bool *unlock) +static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) { + struct address_space *mapping = primary->mapping; struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct erofs_fscache_request *req; struct erofs_map_blocks map; struct erofs_map_dev mdev; struct iov_iter iter; + loff_t pos = primary->start + primary->submitted; size_t count; int ret; - *unlock = true; - map.m_la = pos; ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); if (ret) @@ -220,17 +233,19 @@ static int erofs_fscache_data_read(struct address_space *mapping, } iov_iter_zero(PAGE_SIZE - size, &iter); erofs_put_metabuf(&buf); - return PAGE_SIZE; + primary->submitted += PAGE_SIZE; + return 0; } + count = primary->len - primary->submitted; if (!(map.m_flags & EROFS_MAP_MAPPED)) { - count = len; iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); - return count; + primary->submitted += count; + return 0; } - count = min_t(size_t, map.m_llen - (pos - map.m_la), len); + count = min_t(size_t, map.m_llen - (pos - map.m_la), count); DBG_BUGON(!count || count % PAGE_SIZE); mdev = (struct erofs_map_dev) { @@ -241,68 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (ret) return ret; - req = erofs_fscache_req_alloc(mapping, pos, count); + req = erofs_fscache_req_chain(primary, count); if (IS_ERR(req)) return PTR_ERR(req); - *unlock = false; ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, req, mdev.m_pa + (pos - map.m_la), count); - if (ret) - req->error = ret; - erofs_fscache_req_put(req); - return count; + primary->submitted += count; + return ret; } -static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +static int erofs_fscache_data_read(struct erofs_fscache_request *req) { - bool unlock; int ret; - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + do { + ret = erofs_fscache_data_read_slice(req); + if (ret) + req->error = ret; + } while (!ret && req->submitted < req->len); - ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio), - folio_size(folio), &unlock); - if (unlock) { - if (ret > 0) - folio_mark_uptodate(folio); + return ret; +} + +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fscache_request *req; + int ret; + + req = erofs_fscache_req_alloc(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(req)) { folio_unlock(folio); + return PTR_ERR(req); } - return ret < 0 ? ret : 0; + + ret = erofs_fscache_data_read(req); + erofs_fscache_req_put(req); + return ret; } static void erofs_fscache_readahead(struct readahead_control *rac) { - struct folio *folio; - size_t len, done = 0; - loff_t start, pos; - bool unlock; - int ret, size; + struct erofs_fscache_request *req; if (!readahead_count(rac)) return; - start = readahead_pos(rac); - len = readahead_length(rac); + req = erofs_fscache_req_alloc(rac->mapping, + readahead_pos(rac), readahead_length(rac)); + if (IS_ERR(req)) + return; - do { - pos = start + done; - ret = erofs_fscache_data_read(rac->mapping, pos, - len - done, &unlock); - if (ret <= 0) - return; + /* The request completion will drop refs on the folios. 
*/ + while (readahead_folio(rac)) ; - size = ret; - while (size) { - folio = readahead_folio(rac); - size -= folio_size(folio); - if (unlock) { - folio_mark_uptodate(folio); - folio_unlock(folio); - } - } - } while ((done += ret) < len); + erofs_fscache_data_read(req); + erofs_fscache_req_put(req); } static const struct address_space_operations erofs_fscache_meta_aops = { -- cgit From e6687b89225ee9c817e6dcbadc873f6a4691e5c2 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 1 Dec 2022 15:42:56 +0800 Subject: erofs: enable large folios for fscache mode Enable large folios for fscache mode. Enable this feature for the non-compressed format for now, until the compression part supports large folios later. One thing worth noting is that the feature is not enabled for the metadata routine, since meta inodes don't need large folios for now, nor do they support readahead yet. Also document this new feature. Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221201074256.16639-3-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index e457b8a59ee7..85932086d23f 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,8 +295,7 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; - if (!erofs_is_fscache_mode(inode->i_sb)) - mapping_set_large_folios(inode->i_mapping); + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; -- cgit From 927e5010ff5bd7446a22c511ab8643b9385ddf4d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 18 Oct 2022 18:53:13 +0800 Subject: erofs: use kmap_local_page() only for erofs_bread() Convert all mapped erofs_bread() users to use kmap_local_page() instead of kmap() or kmap_atomic().
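One behavioral rule this conversion has to respect is that kmap_local_page() mappings nest in strict stack (LIFO) order. The toy userspace model below captures just that ordering rule; it deliberately ignores the CPU-local and migration-disable aspects of the real API, so it is only an illustration:

#include <assert.h>
#include <stdio.h>

/* Toy model of the LIFO rule: local mappings form a per-thread stack,
 * so each unmap must release the most recently created mapping. */
static void *map_stack[16];
static int map_top;

static void *toy_kmap_local(void *page)
{
	map_stack[map_top++] = page;
	return page;		/* pretend the page is now addressable */
}

static void toy_kunmap_local(void *addr)
{
	assert(map_top > 0 && map_stack[map_top - 1] == addr);
	map_top--;
}

int main(void)
{
	int a, b;
	void *ka = toy_kmap_local(&a);
	void *kb = toy_kmap_local(&b);	/* nested inner mapping */

	toy_kunmap_local(kb);		/* inner mapping goes first */
	toy_kunmap_local(ka);
	printf("balanced, %d mappings left\n", map_top);
	return 0;
}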
Signed-off-by: Gao Xiang Reviewed-and-tested-by: Jingbo Xu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221018105313.4940-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 8 ++------ fs/erofs/inode.c | 1 + fs/erofs/internal.h | 3 +-- fs/erofs/xattr.c | 8 ++++---- fs/erofs/zmap.c | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index c9526c627dda..f57f921683d7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -13,9 +13,7 @@ void erofs_unmap_metabuf(struct erofs_buf *buf) { if (buf->kmap_type == EROFS_KMAP) - kunmap(buf->page); - else if (buf->kmap_type == EROFS_KMAP_ATOMIC) - kunmap_atomic(buf->base); + kunmap_local(buf->base); buf->base = NULL; buf->kmap_type = EROFS_NO_KMAP; } @@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, } if (buf->kmap_type == EROFS_NO_KMAP) { if (type == EROFS_KMAP) - buf->base = kmap(page); - else if (type == EROFS_KMAP_ATOMIC) - buf->base = kmap_atomic(page); + buf->base = kmap_local_page(page); buf->kmap_type = type; } else if (buf->kmap_type != type) { DBG_BUGON(1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 85932086d23f..5b3a793103af 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode) case S_IFDIR: inode->i_op = &erofs_dir_iops; inode->i_fop = &erofs_dir_fops; + inode_nohighmem(inode); break; case S_IFLNK: err = erofs_fill_symlink(inode, kaddr, ofs); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index e51f27b6bde1..bb8501c0ff5b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -255,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap() to map the buffer */ - EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */ + EROFS_KMAP, /* use kmap_local_page() to map the buffer */ }; struct erofs_buf { diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 8106bcb5a38d..a62fb8a3318a 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) it->blkaddr += erofs_blknr(it->ofs); it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); it->ofs = erofs_blkoff(it->ofs); @@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); return vi->xattr_isize - xattr_header_sz; @@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; @@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 0bb66927e3d0..749a5ac943f4 100644 --- 
a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -178,7 +178,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int advise, type; m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP_ATOMIC); + erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); @@ -416,7 +416,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, out: pos += lcn * (1 << amortizedshift); m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP_ATOMIC); + erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); -- cgit From c42c0ffe81176940bd5dead474216b7198d77675 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 5 Dec 2022 11:49:57 +0800 Subject: erofs: Fix pcluster memleak when its block address is zero syzkaller reported a memleak: https://syzkaller.appspot.com/bug?id=62f37ff612f0021641eda5b17f056f1668aa9aed unreferenced object 0xffff88811009c7f8 (size 136): ... backtrace: [] z_erofs_do_read_page+0x99b/0x1740 [] z_erofs_readahead+0x24e/0x580 [] read_pages+0x86/0x3d0 ... syzkaller constructed a case: in z_erofs_register_pcluster(), ztailpacking is false and map->m_pa is zero. This makes pcl->obj.index zero although pcl is not an inline pcluster. Then the following path adds a refcount on grp, but the refcount is never put because pcl is treated as inline. z_erofs_readahead() z_erofs_do_read_page() # for another page z_erofs_collector_begin() erofs_find_workgroup() erofs_workgroup_get() Since it's illegal for the block address of a non-inlined pcluster to be zero, add a check here to avoid registering a pcluster that would be leaked. Fixes: cecf864d3d76 ("erofs: support inline data decompression") Reported-by: syzbot+6f8cd9a0155b366d227f@syzkaller.appspotmail.com Signed-off-by: Chen Zhongjin Reviewed-by: Yue Hu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/Y42Kz6sVkf+XqJRB@debian Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b66c16473273..ccf7c55d477f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -496,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) struct erofs_workgroup *grp; int err; - if (!(map->m_flags & EROFS_MAP_ENCODED)) { + if (!(map->m_flags & EROFS_MAP_ENCODED) || + (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { DBG_BUGON(1); return -EFSCORRUPTED; } -- cgit From d5d188b8f8b38d3d71dd05993874b4fc9284ce95 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 5 Dec 2022 23:00:49 +0800 Subject: erofs: fix missing unmap if z_erofs_get_extent_compressedlen() fails Otherwise, meta buffers could be leaked.
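The fix below routes every failure that happens after the metabuf has been mapped through a single label that unmaps it. Here is a minimal userspace sketch of that single-exit pattern; the names are invented, with malloc()/free() standing in for erofs_read_metabuf()/erofs_unmap_metabuf():

#include <stdio.h>
#include <stdlib.h>

/* Once the buffer is live, every failure must leave through the
 * label that releases it; a plain "return err" would leak it. */
static int toy_map_blocks(int fail)
{
	int err = 0;
	char *buf = malloc(16);		/* stands in for the mapped metabuf */

	if (!buf)
		return -12;		/* -ENOMEM: nothing to clean up yet */

	if (fail) {
		err = -5;		/* -EIO for the sketch */
		goto unmap_out;
	}
	buf[0] = 1;			/* use the buffer on the success path */

unmap_out:
	free(buf);			/* stands in for erofs_unmap_metabuf() */
	return err;
}

int main(void)
{
	printf("success path: %d\n", toy_map_blocks(0));
	printf("failure path: %d\n", toy_map_blocks(1));
	return 0;
}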
Fixes: cec6e93beadf ("erofs: support parsing big pcluster compress indexes") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221205150050.47784-1-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 749a5ac943f4..98eff1259de4 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -694,7 +694,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) - goto out; + goto unmap_out; } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { @@ -718,14 +718,12 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; } + unmap_out: erofs_unmap_metabuf(&m.map->buf); - -out: erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", __func__, map->m_la, map->m_pa, map->m_llen, map->m_plen, map->m_flags); - return err; } -- cgit From c505feba4c0d76084e56ec498ce819f02a7043ae Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 5 Dec 2022 23:00:50 +0800 Subject: erofs: validate the extent length for uncompressed pclusters syzkaller reported a KASAN use-after-free: https://syzkaller.appspot.com/bug?extid=2ae90e873e97f1faf6f2 The referenced fuzzed image actually has two issues: - m_pa == 0 as a non-inlined pcluster; - The logical length is longer than its physical length. The first issue has already been addressed. This patch addresses the second issue by checking the extent length validity. Reported-by: syzbot+2ae90e873e97f1faf6f2@syzkaller.appspotmail.com Fixes: 02827e1796b3 ("staging: erofs: add erofs_map_blocks_iter") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221205150050.47784-2-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 98eff1259de4..0150570c33aa 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -698,6 +698,11 @@ static int z_erofs_do_map_blocks(struct inode *inode, } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (map->m_llen > map->m_plen) { + DBG_BUGON(1); + err = -EFSCORRUPTED; + goto unmap_out; + } if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; -- cgit From b5b52de3214a29911f949459a79f6640969b5487 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Wed, 7 Dec 2022 13:49:15 +0000 Subject: fscache: Fix oops due to race with cookie_lru and use_cookie If a cookie expires from the LRU and the LRU_DISCARD flag is set, but the state machine has not run yet, it's possible another thread can call fscache_use_cookie and begin to use it. When the cookie_worker finally runs, it will see the LRU_DISCARD flag set and transition cookie->state to LRU_DISCARDING, which will then withdraw the cookie. Once the cookie is withdrawn and the object is removed, the oops below will occur because the object associated with the cookie is now NULL. Fix the oops by clearing the LRU_DISCARD bit if another thread uses the cookie before the cookie_worker runs. BUG: kernel NULL pointer dereference, address: 0000000000000008 ...
CPU: 31 PID: 44773 Comm: kworker/u130:1 Tainted: G E 6.0.0-5.dneg.x86_64 #1 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 Workqueue: events_unbound netfs_rreq_write_to_cache_work [netfs] RIP: 0010:cachefiles_prepare_write+0x28/0x90 [cachefiles] ... Call Trace: netfs_rreq_write_to_cache_work+0x11c/0x320 [netfs] process_one_work+0x217/0x3e0 worker_thread+0x4a/0x3b0 kthread+0xd6/0x100 Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning") Reported-by: Daire Byrne Signed-off-by: Dave Wysochanski Signed-off-by: David Howells Tested-by: Daire Byrne Link: https://lore.kernel.org/r/20221117115023.1350181-1-dwysocha@redhat.com/ # v1 Link: https://lore.kernel.org/r/20221117142915.1366990-1-dwysocha@redhat.com/ # v2 Signed-off-by: Linus Torvalds --- fs/fscache/cookie.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 451d8a077e12..bce2492186d0 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -605,6 +605,14 @@ again: set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); queue = true; } + /* + * We could race with cookie_lru which may set LRU_DISCARD bit + * but has yet to run the cookie state machine. If this happens + * and another thread tries to use the cookie, clear LRU_DISCARD + * so we don't end up withdrawing the cookie while in use. + */ + if (test_and_clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) + fscache_see_cookie(cookie, fscache_cookie_see_lru_discard_clear); break; case FSCACHE_COOKIE_STATE_FAILED: -- cgit From b3525072835b523b397d459fadd0785d9c24bbd1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Oct 2022 14:29:39 +0100 Subject: orangefs: remove variable i Variable i is just being incremented and is never used anywhere else. The variable and the increment are redundant, so remove them. Signed-off-by: Colin Ian King Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7a8c0c6e698d..eaa35a966115 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -530,7 +530,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t total_count = 0; ssize_t ret = -EINVAL; - int i = 0; gossip_debug(GOSSIP_FILE_DEBUG, "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", @@ -556,7 +555,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, while (iov_iter_count(iter)) { size_t each_count = iov_iter_count(iter); size_t amt_complete; - i++; /* how much to transfer in this loop iteration */ if (each_count > orangefs_bufmap_size_query()) -- cgit From 610defdccff7b955fe899a825990c2202153a22e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 17 Oct 2022 22:49:37 +0100 Subject: orangefs: remove redundant assignment to variable buffer_index The variable buffer_index is assigned a value that is never read; it is assigned just before the function returns. The assignment is redundant and can be removed.
Cleans up clang scan build warning: fs/orangefs/file.c:276:3: warning: Value stored to 'buffer_index' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 732661aa2680..167fa43b24f9 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -273,7 +273,6 @@ out: gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): PUT buffer_index %d\n", __func__, handle, buffer_index); - buffer_index = -1; } op_release(new_op); return ret; -- cgit From ea60a4ad0cf88b411cde6888b8c890935686ecd7 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:04 +0800 Subject: orangefs: Fix sysfs not cleanup when dev init failed When the dev init fails, the sysfs entries should be cleaned up; otherwise the module can never be loaded again, since a duplicate sysfs directory cannot be created: sysfs: cannot create duplicate filename '/fs/orangefs' CPU: 1 PID: 6549 Comm: insmod Tainted: G W 6.0.0+ #44 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 Call Trace: dump_stack_lvl+0x34/0x44 sysfs_warn_dup.cold+0x17/0x24 sysfs_create_dir_ns+0x16d/0x180 kobject_add_internal+0x156/0x3a0 kobject_init_and_add+0xcf/0x120 orangefs_sysfs_init+0x7e/0x3a0 [orangefs] orangefs_init+0xfe/0x1000 [orangefs] do_one_initcall+0x87/0x2a0 do_init_module+0xdf/0x320 load_module+0x2f98/0x3330 __do_sys_finit_module+0x113/0x1b0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 kobject_add_internal failed for orangefs with -EEXIST, don't try to register things with the same name in the same directory. Fixes: 2f83ace37181 ("orangefs: put register_chrdev immediately before register_filesystem") Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-mod.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index cd7297815f91..5ab741c60b7e 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -141,7 +141,7 @@ static int __init orangefs_init(void) gossip_err("%s: could not initialize device subsystem %d!\n", __func__, ret); - goto cleanup_device; + goto cleanup_sysfs; } ret = register_filesystem(&orangefs_fs_type); @@ -152,11 +152,11 @@ static int __init orangefs_init(void) goto out; } - orangefs_sysfs_exit(); - -cleanup_device: orangefs_dev_cleanup(); +cleanup_sysfs: + orangefs_sysfs_exit(); + sysfs_init_failed: orangefs_debugfs_cleanup(); -- cgit From d23417a5bf3a3afc55de5442eb46e1e60458b0a1 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:05 +0800 Subject: orangefs: Fix kmemleak in orangefs_prepare_debugfs_help_string() When the orangefs module is inserted and then removed, debug_help_string is leaked: unreferenced object 0xffff8881652ba000 (size 4096): comm "insmod", pid 1701, jiffies 4294893639 (age 13218.530s) hex dump (first 32 bytes): 43 6c 69 65 6e 74 20 44 65 62 75 67 20 4b 65 79 Client Debug Key 77 6f 72 64 73 20 61 72 65 20 75 6e 6b 6e 6f 77 words are unknow backtrace: [<0000000004e6f8e3>] kmalloc_trace+0x27/0xa0 [<0000000006f75d85>] orangefs_prepare_debugfs_help_string+0x5e/0x480 [orangefs] [<0000000091270a2a>] _sub_I_65535_1+0x57/0xf70 [crc_itu_t] [<000000004b1ee1a3>] do_one_initcall+0x87/0x2a0 [<000000001d0614ae>] do_init_module+0xdf/0x320 [<00000000efef068c>] load_module+0x2f98/0x3330 [<000000006533b44d>] __do_sys_finit_module+0x113/0x1b0 [<00000000a0da6f99>]
do_syscall_64+0x35/0x80 [<000000007790b19b>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 debug_help_string should always be freed when the module is removed, and the temporary buffer should always be freed once its contents have been copied into debug_help_string. Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 29eaa4544372..a848b6ef9599 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -222,6 +222,8 @@ out: void orangefs_debugfs_cleanup(void) { debugfs_remove_recursive(debug_dir); + kfree(debug_help_string); + debug_help_string = NULL; } /* open ORANGEFS_KMOD_DEBUG_HELP_FILE */ @@ -671,6 +673,7 @@ int orangefs_prepare_debugfs_help_string(int at_boot) memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE); strlcat(debug_help_string, new, string_size); mutex_unlock(&orangefs_help_file_lock); + kfree(new); } rc = 0; -- cgit From 1f2c0e8a587bcafad85019a2d80f158d8d41a868 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:06 +0800 Subject: orangefs: Fix kmemleak in orangefs_sysfs_init() When the orangefs module is inserted and then removed, kobject memory is leaked as below: unreferenced object 0xffff88810f95af00 (size 64): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): a0 83 af 01 81 88 ff ff 08 af 95 0f 81 88 ff ff ................ 08 af 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005a6e4dfe>] orangefs_sysfs_init+0x42/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ae80 (size 64): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): c8 90 0f 02 81 88 ff ff 88 ae 95 0f 81 88 ff ff ................ 88 ae 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000001a4841fa>] orangefs_sysfs_init+0xc7/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ae00 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.511s) hex dump (first 32 bytes): 60 87 a1 00 81 88 ff ff 08 ae 95 0f 81 88 ff ff `............... 08 ae 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................
backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005915e797>] orangefs_sysfs_init+0x12b/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ad80 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.511s) hex dump (first 32 bytes): 78 90 0f 02 81 88 ff ff 88 ad 95 0f 81 88 ff ff x............... 88 ad 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000007a14eb35>] orangefs_sysfs_init+0x1ac/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ac00 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.531s) hex dump (first 32 bytes): e0 ff 67 02 81 88 ff ff 08 ac 95 0f 81 88 ff ff ..g............. 08 ac 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000001f38adcb>] orangefs_sysfs_init+0x291/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ab80 (size 64): comm "insmod", pid 783, jiffies 4294813441 (age 65.530s) hex dump (first 32 bytes): 50 bf 2f 02 81 88 ff ff 88 ab 95 0f 81 88 ff ff P./............. 88 ab 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000009cc7d95b>] orangefs_sysfs_init+0x2f5/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 A release function should be added to each kobj_type to free the memory.
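The contract being applied here is that a refcounted kobject must be freed by its kobj_type's release callback on the final put, never by the code that created it. Below is a simplified userspace sketch of that contract; kobj, kobj_put() and the release hook are illustrative stand-ins, not the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct kobj;

struct kobj_type {
	void (*release)(struct kobj *kobj);
};

struct kobj {
	int refs;
	const struct kobj_type *ktype;
};

/* The final put frees the object through its type's release hook;
 * a kobj_type without one leaks the allocation, as kmemleak showed. */
static void kobj_put(struct kobj *kobj)
{
	if (--kobj->refs)
		return;
	if (kobj->ktype->release)
		kobj->ktype->release(kobj);
	else
		fprintf(stderr, "no release hook: memory leaked\n");
}

static void demo_release(struct kobj *kobj)
{
	printf("release hook frees the kobject\n");
	free(kobj);
}

static const struct kobj_type demo_ktype = { .release = demo_release };

int main(void)
{
	struct kobj *obj = calloc(1, sizeof(*obj));

	if (!obj)
		return 1;
	obj->refs = 1;
	obj->ktype = &demo_ktype;
	kobj_put(obj);		/* last reference: freed via .release */
	return 0;
}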
Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-sysfs.c | 71 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index de80b62553bb..be4ba03a01a0 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -896,9 +896,18 @@ static struct attribute *orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(orangefs_default); +static struct kobject *orangefs_obj; + +static void orangefs_obj_release(struct kobject *kobj) +{ + kfree(orangefs_obj); + orangefs_obj = NULL; +} + static struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = orangefs_default_groups, + .release = orangefs_obj_release, }; static struct orangefs_attribute acache_hard_limit_attribute = @@ -934,9 +943,18 @@ static struct attribute *acache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(acache_orangefs_default); +static struct kobject *acache_orangefs_obj; + +static void acache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(acache_orangefs_obj); + acache_orangefs_obj = NULL; +} + static struct kobj_type acache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = acache_orangefs_default_groups, + .release = acache_orangefs_obj_release, }; static struct orangefs_attribute capcache_hard_limit_attribute = @@ -972,9 +990,18 @@ static struct attribute *capcache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(capcache_orangefs_default); +static struct kobject *capcache_orangefs_obj; + +static void capcache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(capcache_orangefs_obj); + capcache_orangefs_obj = NULL; +} + static struct kobj_type capcache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = capcache_orangefs_default_groups, + .release = capcache_orangefs_obj_release, }; static struct orangefs_attribute ccache_hard_limit_attribute = @@ -1010,9 +1037,18 @@ static struct attribute *ccache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(ccache_orangefs_default); +static struct kobject *ccache_orangefs_obj; + +static void ccache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(ccache_orangefs_obj); + ccache_orangefs_obj = NULL; +} + static struct kobj_type ccache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ccache_orangefs_default_groups, + .release = ccache_orangefs_obj_release, }; static struct orangefs_attribute ncache_hard_limit_attribute = @@ -1048,9 +1084,18 @@ static struct attribute *ncache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(ncache_orangefs_default); +static struct kobject *ncache_orangefs_obj; + +static void ncache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(ncache_orangefs_obj); + ncache_orangefs_obj = NULL; +} + static struct kobj_type ncache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ncache_orangefs_default_groups, + .release = ncache_orangefs_obj_release, }; static struct orangefs_attribute pc_acache_attribute = @@ -1079,9 +1124,18 @@ static struct attribute *pc_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(pc_orangefs_default); +static struct kobject *pc_orangefs_obj; + +static void pc_orangefs_obj_release(struct kobject *kobj) +{ + kfree(pc_orangefs_obj); + pc_orangefs_obj = NULL; +} + static struct kobj_type pc_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = pc_orangefs_default_groups, + .release = pc_orangefs_obj_release, }; static struct 
orangefs_attribute stats_reads_attribute = @@ -1103,19 +1157,20 @@ static struct attribute *stats_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(stats_orangefs_default); +static struct kobject *stats_orangefs_obj; + +static void stats_orangefs_obj_release(struct kobject *kobj) +{ + kfree(stats_orangefs_obj); + stats_orangefs_obj = NULL; +} + static struct kobj_type stats_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = stats_orangefs_default_groups, + .release = stats_orangefs_obj_release, }; -static struct kobject *orangefs_obj; -static struct kobject *acache_orangefs_obj; -static struct kobject *capcache_orangefs_obj; -static struct kobject *ccache_orangefs_obj; -static struct kobject *ncache_orangefs_obj; -static struct kobject *pc_orangefs_obj; -static struct kobject *stats_orangefs_obj; - int orangefs_sysfs_init(void) { int rc = -EINVAL; -- cgit From 31720a2b109b3080eb77e97b8f6f50a27b4ae599 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:07 +0800 Subject: orangefs: Fix kmemleak in orangefs_{kernel,client}_debug_init() When the orangefs module is inserted and then removed, memory is leaked as below: unreferenced object 0xffff88816b0cc000 (size 2048): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): 6e 6f 6e 65 0a 00 00 00 00 00 00 00 00 00 00 00 none............ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005b405fee>] orangefs_debugfs_init.cold+0xaf/0x17f [<00000000e5a0085b>] 0xffffffffa02780f9 [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Use a global variable as the buffer rather than a dynamic allocation to solve the problem. Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-debugfs.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index a848b6ef9599..1b508f543384 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -194,15 +194,10 @@ void orangefs_debugfs_init(int debug_mask) */ static void orangefs_kernel_debug_init(void) { - int rc = -ENOMEM; - char *k_buffer = NULL; + static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!k_buffer) - goto out; - if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcpy(k_buffer, kernel_debug_string); strcat(k_buffer, "\n"); @@ -213,9 +208,6 @@ static void orangefs_kernel_debug_init(void) debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer, &kernel_debug_fops); - -out: - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); } @@ -299,18 +291,13 @@ static int help_show(struct seq_file *m, void *v) /* * initialize the client-debug file.
*/ -static int orangefs_client_debug_init(void) +static void orangefs_client_debug_init(void) { - int rc = -ENOMEM; - char *c_buffer = NULL; + static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!c_buffer) - goto out; - if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcpy(c_buffer, client_debug_string); strcat(c_buffer, "\n"); @@ -324,13 +311,6 @@ static int orangefs_client_debug_init(void) debug_dir, c_buffer, &kernel_debug_fops); - - rc = 0; - -out: - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); - return rc; } /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/ -- cgit From cf8aa9bf97cadf85745506c6a3e244b22c268d63 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 24 Sep 2022 00:33:15 -0700 Subject: ovl: Use "buf" flexible array for memcpy() destination The "buf" flexible array needs to be the memcpy() destination to avoid a false positive run-time warning from the recent FORTIFY_SOURCE hardening: memcpy: detected field-spanning write (size 93) of single field "&fh->fb" at fs/overlayfs/export.c:799 (size 21) Reported-by: syzbot+9d14351a171d0d1c7955@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000763a6c05e95a5985@google.com/ Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 2 +- fs/overlayfs/overlayfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index e065a5b9a442..ac9c3ad04016 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -796,7 +796,7 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) return ERR_PTR(-ENOMEM); /* Copy unaligned inner fh into aligned buffer */ - memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET); + memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET); return fh; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index eee8f08d32b6..e74a610a117e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -108,7 +108,7 @@ struct ovl_fh { u8 padding[3]; /* make sure fb.fid is 32bit aligned */ union { struct ovl_fb fb; - u8 buf[0]; + DECLARE_FLEX_ARRAY(u8, buf); }; } __packed; -- cgit From 5b0db51215e895a361bc63132caa7cca36a53d6a Mon Sep 17 00:00:00 2001 From: Zhang Tianci Date: Thu, 1 Sep 2022 16:29:29 +0800 Subject: ovl: Use ovl mounter's fsuid and fsgid in ovl_link() There is a case where link() behaves incorrectly on overlayfs: $ mkdir /lower /fuse /merge $ mount -t fuse /fuse $ mkdir /fuse/upper /fuse/work $ mount -t overlay /merge -o lowerdir=/lower,upperdir=/fuse/upper,\ workdir=work $ touch /merge/file $ chown bin.bin /merge/file // the file's owner becomes "bin" $ ln /merge/file /merge/lnkfile Then we get an error (EACCES) because the fuse daemon sees that the link()'s caller is "bin" and denies the request. In the change history of ovl_link(), there are two key commits: The first is commit bb0d2b8ad296 ("ovl: fix sgid on directory") which overrides the cred's fsuid/fsgid using the new inode. The new inode's owner is initialized by inode_init_owner(), which assigns the current user's fsuid to the inode. So the overridden fsuid becomes the current user's. We know link() actually modifies the directory, so the caller must have MAY_WRITE permission on the directory, and the current caller should already have it.
So it is acceptable to use the caller's fsuid. The second is commit 51f7e52dc943 ("ovl: share inode for hard link") which removed the inode creation in ovl_link(). This commit moved inode_init_owner() into ovl_create_object(), so ovl_link() just gives the old inode to ovl_create_or_link(). Then the overridden fsuid becomes the old inode's fsuid, which is neither the caller's nor the overlay mounter's! So this is incorrect. Fix this bug by using the ovl mounter's fsuid/fsgid for the underlying fs's link(). Link: https://lore.kernel.org/all/20220817102952.xnvesg3a7rbv576x@wittgenstein/T Link: https://lore.kernel.org/lkml/20220825130552.29587-1-zhangtianci.1997@bytedance.com/t Signed-off-by: Zhang Tianci Signed-off-by: Jiachen Zhang Reviewed-by: Christian Brauner (Microsoft) Fixes: 51f7e52dc943 ("ovl: share inode for hard link") Cc: # v4.8 Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6b03457f72bb..c3032cef391e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -592,28 +592,42 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, goto out_revert_creds; } - err = -ENOMEM; - override_cred = prepare_creds(); - if (override_cred) { + if (!attr->hardlink) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_revert_creds; + /* + * In the creation cases(create, mkdir, mknod, symlink), + * ovl should transfer current's fs{u,g}id to underlying + * fs. Because underlying fs want to initialize its new + * inode owner using current's fs{u,g}id. And in this + * case, the @inode is a new inode that is initialized + * in inode_init_owner() to current's fs{u,g}id. So use + * the inode's i_{u,g}id to override the cred's fs{u,g}id. + * + * But in the other hardlink case, ovl_link() does not + * create a new inode, so just use the ovl mounter's + * fs{u,g}id. + */ override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; - if (!attr->hardlink) { - err = security_dentry_create_files_as(dentry, - attr->mode, &dentry->d_name, old_cred, - override_cred); - if (err) { - put_cred(override_cred); - goto out_revert_creds; - } + err = security_dentry_create_files_as(dentry, + attr->mode, &dentry->d_name, old_cred, + override_cred); + if (err) { + put_cred(override_cred); + goto out_revert_creds; } put_cred(override_creds(override_cred)); put_cred(override_cred); - - if (!ovl_dentry_is_whiteout(dentry)) - err = ovl_create_upper(dentry, inode, attr); - else - err = ovl_create_over_whiteout(dentry, inode, attr); } + + if (!ovl_dentry_is_whiteout(dentry)) + err = ovl_create_upper(dentry, inode, attr); + else + err = ovl_create_over_whiteout(dentry, inode, attr); + out_revert_creds: revert_creds(old_cred); return err; -- cgit From 73db6a063c785bc3965f17569102a2a61ec10a4c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:07:47 +0200 Subject: ovl: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type-safe helpers that operate on vfs{g,u}id_t.
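The benefit of the dedicated type is that the compiler, rather than convention, keeps filesystem-wide k{g,u}id_t values and mount-idmapped values apart. A small userspace sketch of the idea follows; the types and helpers are simplified stand-ins for the mnt_idmapping.h ones (the real make_vfsuid() also takes the mount's idmapping, which this sketch omits):

#include <stdio.h>

/* Two structurally identical but distinct types: assigning one to the
 * other is a compile error, unlike two bare unsigned ints. */
typedef struct { unsigned int val; } kuid_t;
typedef struct { unsigned int val; } vfsuid_t;

static vfsuid_t make_vfsuid(kuid_t kuid)
{
	return (vfsuid_t){ .val = kuid.val };	/* identity idmapping here */
}

static kuid_t vfsuid_into_kuid(vfsuid_t vfsuid)
{
	return (kuid_t){ .val = vfsuid.val };
}

int main(void)
{
	kuid_t kuid = { 1000 };
	vfsuid_t vfsuid = make_vfsuid(kuid);

	/* kuid = vfsuid; would not compile: the types are distinct */
	printf("round trip: %u\n", vfsuid_into_kuid(vfsuid).val);
	return 0;
}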
Cc: Seth Forshee (Digital Ocean) Cc: Amir Goldstein Cc: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/overlayfs/util.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 81a57a8d80d9..c0c20d33691b 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1104,13 +1104,18 @@ void ovl_copyattr(struct inode *inode) struct path realpath; struct inode *realinode; struct user_namespace *real_mnt_userns; + vfsuid_t vfsuid; + vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); real_mnt_userns = mnt_user_ns(realpath.mnt); - inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); - inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode); + vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + + inode->i_uid = vfsuid_into_kuid(vfsuid); + inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; inode->i_atime = realinode->i_atime; inode->i_mtime = realinode->i_mtime; -- cgit From cdf5c9d1af411057e33cfbeba02ce19478dedb1e Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 13:47:36 +0800 Subject: ovl: fix comment typos Fix two typos. Reported-by: k2ci Signed-off-by: Jiangshan Yi Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 2 +- fs/overlayfs/file.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index ac9c3ad04016..be866a3a92aa 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -339,7 +339,7 @@ out_iput: return dentry; } -/* Get the upper or lower dentry in stach whose on layer @idx */ +/* Get the upper or lower dentry in stack whose on layer @idx */ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) { struct ovl_entry *oe = dentry->d_fsdata; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index dd688a842b0b..eadbe5e0dffd 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -34,7 +34,7 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } -/* No atime modificaton nor notify on underlying */ +/* No atime modification nor notify on underlying */ #define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) static struct file *ovl_open_realfile(const struct file *file, -- cgit From 8ea2876577b57805489e3044de7fcb0330c52f40 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 4 Oct 2022 13:34:32 +0300 Subject: ovl: do not reconnect upper index records in ovl_indexdir_cleanup() ovl_indexdir_cleanup() is called on mount of overlayfs with the nfs_export feature to clean up stale index records for lower and upper files that have been deleted while overlayfs was offline. This has the side effect (good or bad) of pre-populating the inode cache with all the copied-up upper inodes while verifying the index entries. For copied-up directories, the upper file handles are decoded to connected upper dentries. This has the even bigger side effect of reading the content of all the parent upper directories, which may take significantly more time and IO than just reading the upper inodes. Do not request connected upper dentries for verifying upper directory index entries, because we have no use for the connected dentry.
Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 4 ++-- fs/overlayfs/namei.c | 7 ++++--- fs/overlayfs/overlayfs.h | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index be866a3a92aa..a25bb3453dde 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -463,7 +463,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, /* Get connected upper overlay dir from index */ if (index) { - struct dentry *upper = ovl_index_upper(ofs, index); + struct dentry *upper = ovl_index_upper(ofs, index, true); dput(index); if (IS_ERR_OR_NULL(upper)) @@ -739,7 +739,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, /* Then try to get a connected upper dir by index */ if (index && d_is_dir(index)) { - struct dentry *upper = ovl_index_upper(ofs, index); + struct dentry *upper = ovl_index_upper(ofs, index, true); err = PTR_ERR(upper); if (IS_ERR_OR_NULL(upper)) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0fd1d5fdfc72..0ce1f9b3a046 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -487,7 +487,8 @@ fail: } /* Get upper dentry from index */ -struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, + bool connected) { struct ovl_fh *fh; struct dentry *upper; @@ -499,7 +500,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); + upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -572,7 +573,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) * directly from the index dentry, but for dir index we first need to * decode the upper directory. */ - upper = ovl_index_upper(ofs, index); + upper = ovl_index_upper(ofs, index, false); if (IS_ERR_OR_NULL(upper)) { err = PTR_ERR(upper); /* diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e74a610a117e..70c5bd81122b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -525,7 +525,8 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, struct dentry *real, bool is_upper, bool set); -struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, + bool connected); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name); -- cgit From af4dcb6d78b2b05a0431dfd3f67713bba8dc0900 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 4 Oct 2022 13:34:33 +0300 Subject: ovl: use plain list filler in indexdir and workdir cleanup These two cleanup routines use the helper ovl_dir_read() with the merge dir filler, which populates an rb tree that is never used. The index dir entry names all have a long (42-byte) constant prefix, so it is not surprising that perf top has demonstrated high CPU usage by rb tree population during cleanup of a large index dir: - 9.53% ovl_fill_merge - 78.41% ovl_cache_entry_find_link.constprop.27 + 72.11% strncmp Use the plain list filler that does not populate the unneeded rb tree.
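The plain filler only needs to build something that can be walked once, so a singly linked list with O(1) insertion is enough; the merge filler's rb-tree insertion pays a string comparison per tree level for an index that cleanup never queries. Below is a simplified stand-in sketch in userspace C (entry and fill_plain are invented names loosely modeled on ovl_cache_entry and ovl_fill_plain):

#include <stdio.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	char name[64];
};

/* O(1) list prepend: no strncmp-heavy rb-tree insertion. */
static void fill_plain(struct entry **head, const char *name)
{
	struct entry *p = calloc(1, sizeof(*p));

	if (!p)
		abort();
	snprintf(p->name, sizeof(p->name), "%s", name);
	p->next = *head;
	*head = p;
}

int main(void)
{
	struct entry *head = NULL, *p, *next;

	fill_plain(&head, "#00000000000000000000000000000001");
	fill_plain(&head, "#00000000000000000000000000000002");

	for (p = head; p; p = next) {	/* single cleanup pass */
		next = p->next;
		printf("cleanup %s\n", p->name);
		free(p);
	}
	return 0;
}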
Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 2b210640036c..31227e2d85e4 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -1071,14 +1071,10 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); - struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { - .ctx.actor = ovl_fill_merge, - .dentry = NULL, + .ctx.actor = ovl_fill_plain, .list = &list, - .root = &root, - .is_lowest = false, }; bool incompat = false; @@ -1159,14 +1155,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) struct inode *dir = indexdir->d_inode; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); - struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { - .ctx.actor = ovl_fill_merge, - .dentry = NULL, + .ctx.actor = ovl_fill_plain, .list = &list, - .root = &root, - .is_lowest = false, }; err = ovl_dir_read(&path, &rdd); -- cgit From cf4ef7801a8c880902a07712ba7ff61d8e954f85 Mon Sep 17 00:00:00 2001 From: Stanislav Goriainov Date: Wed, 31 Aug 2022 11:06:58 +0300 Subject: ovl: Add comment on upperredirect reassignment If memory for upperredirect was allocated with kstrdup() in the upperdir != NULL and d.redirect != NULL path, it may seem that it can be lost when upperredirect is reassigned later, but that is not possible. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 0a2d0d3f2f291 ("ovl: Check redirect on index as well") Signed-off-by: Stanislav Goriainov Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0ce1f9b3a046..46753134533a 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1086,6 +1086,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .mnt = ovl_upper_mnt(ofs), }; + /* + * It's safe to assign upperredirect here: the previous + * assignment happens only if upperdentry is non-NULL, and + * this one only if upperdentry is NULL. + */ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); if (IS_ERR(upperredirect)) { err = PTR_ERR(upperredirect); -- cgit From 1fa9c5c5eddef0505b353031b7605c97e4257e62 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 7 Oct 2022 17:35:26 +0200 Subject: ovl: use inode instead of dentry where possible Passing dentry to some helpers is unnecessary. Simplify these cases.
Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 6 +++--- fs/overlayfs/readdir.c | 46 ++++++++++++++++++++++++---------------------- fs/overlayfs/util.c | 6 ++---- 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 70c5bd81122b..0b07a9ef7e5b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -401,7 +401,7 @@ const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); -u64 ovl_dentry_version_get(struct dentry *dentry); +u64 ovl_inode_version_get(struct inode *inode); bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); @@ -571,9 +571,9 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs); * lower dir was removed under it and possibly before it was rotated from upper * to lower layer. */ -static inline bool ovl_dir_is_real(struct dentry *dir) +static inline bool ovl_dir_is_real(struct inode *dir) { - return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); + return !ovl_test_flag(OVL_WHITEOUTS, dir); } /* inode.c */ diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 31227e2d85e4..8cd2b9947de1 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -235,15 +235,15 @@ void ovl_dir_cache_free(struct inode *inode) } } -static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode) { struct ovl_dir_cache *cache = od->cache; WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { - if (ovl_dir_cache(d_inode(dentry)) == cache) - ovl_set_dir_cache(d_inode(dentry), NULL); + if (ovl_dir_cache(inode) == cache) + ovl_set_dir_cache(inode, NULL); ovl_cache_free(&cache->entries); kfree(cache); @@ -323,15 +323,15 @@ static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; - struct dentry *dentry = file->f_path.dentry; + struct inode *inode = file_inode(file); bool is_real; - if (cache && ovl_dentry_version_get(dentry) != cache->version) { - ovl_cache_put(od, dentry); + if (cache && ovl_inode_version_get(inode) != cache->version) { + ovl_cache_put(od, inode); od->cache = NULL; od->cursor = NULL; } - is_real = ovl_dir_is_real(dentry); + is_real = ovl_dir_is_real(inode); if (od->is_real != is_real) { /* is_real can only become false when dir is copied up */ if (WARN_ON(is_real)) @@ -394,9 +394,10 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; struct ovl_dir_cache *cache; + struct inode *inode = d_inode(dentry); - cache = ovl_dir_cache(d_inode(dentry)); - if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache = ovl_dir_cache(inode); + if (cache && ovl_inode_version_get(inode) == cache->version) { WARN_ON(!cache->refcount); cache->refcount++; return cache; @@ -418,8 +419,8 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) return ERR_PTR(res); } - cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(d_inode(dentry), cache); + cache->version = ovl_inode_version_get(inode); + ovl_set_dir_cache(inode, cache); return cache; } @@ -596,16 +597,17 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const 
struct path *path) { int res; struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; - cache = ovl_dir_cache(d_inode(dentry)); - if (cache && ovl_dentry_version_get(dentry) == cache->version) + cache = ovl_dir_cache(inode); + if (cache && ovl_inode_version_get(inode) == cache->version) return cache; /* Impure cache is not refcounted, free it here */ - ovl_dir_cache_free(d_inode(dentry)); - ovl_set_dir_cache(d_inode(dentry), NULL); + ovl_dir_cache_free(inode); + ovl_set_dir_cache(inode, NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) @@ -627,13 +629,13 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) OVL_XATTR_IMPURE); ovl_drop_write(dentry); } - ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + ovl_clear_flag(OVL_IMPURE, inode); kfree(cache); return NULL; } - cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(d_inode(dentry), cache); + cache->version = ovl_inode_version_get(inode); + ovl_set_dir_cache(inode, cache); return cache; } @@ -675,7 +677,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name, static bool ovl_is_impure_dir(struct file *file) { struct ovl_dir_file *od = file->private_data; - struct inode *dir = d_inode(file->f_path.dentry); + struct inode *dir = file_inode(file); /* * Only upper dir can be impure, but if we are in the middle of @@ -893,7 +895,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct file *realfile; int err; - err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb)); + err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (err <= 0) return err; @@ -913,7 +915,7 @@ static int ovl_dir_release(struct inode *inode, struct file *file) if (od->cache) { inode_lock(inode); - ovl_cache_put(od, file->f_path.dentry); + ovl_cache_put(od, inode); inode_unlock(inode); } fput(od->realfile); @@ -942,7 +944,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return PTR_ERR(realfile); } od->realfile = realfile; - od->is_real = ovl_dir_is_real(file->f_path.dentry); + od->is_real = ovl_dir_is_real(inode); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index c0c20d33691b..bde291623c8c 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -463,7 +463,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) * which have been copied up and have origins), so only need to note * changes to impure entries. */ - if (!ovl_dir_is_real(dentry) || impurity) + if (!ovl_dir_is_real(inode) || impurity) OVL_I(inode)->version++; } @@ -475,10 +475,8 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity) ovl_dir_version_inc(dentry, impurity); } -u64 ovl_dentry_version_get(struct dentry *dentry) +u64 ovl_inode_version_get(struct inode *inode) { - struct inode *inode = d_inode(dentry); - WARN_ON(!inode_is_locked(inode)); return OVL_I(inode)->version; } -- cgit From 637d13b57d853dfc7f5743dfdfb9995c266a6031 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 7 Oct 2022 21:40:54 +0100 Subject: ovl: Kconfig: Fix spelling mistake "undelying" -> "underlying" There is a spelling mistake in a Kconfig description. Fix it. 
Signed-off-by: Colin Ian King Signed-off-by: Miklos Szeredi --- fs/overlayfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index dd188c7996b3..6708e54b0e30 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -96,7 +96,7 @@ config OVERLAY_FS_XINO_AUTO depends on 64BIT help If this config option is enabled then overlay filesystems will use - unused high bits in undelying filesystem inode numbers to map all + unused high bits in underlying filesystem inode numbers to map all inodes to a unified address space. The mapped 64bit inode numbers might not be compatible with applications that expect 32bit inodes. -- cgit From 83fb8abec29383eb0cf35495d21669e38548771b Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Fri, 25 Nov 2022 12:26:00 +0100 Subject: cifs: Add "extbuf" and "extbuflen" args to smb2_compound_op() Will carry the variable-sized reply from SMB_FIND_FILE_POSIX_INFO Signed-off-by: Volker Lendecke Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/smb2inode.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 68e08c85fbb8..1be86ba950b3 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -59,6 +59,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, + __u8 **extbuf, size_t *extbuflen, struct kvec *err_iov, int *err_buftype) { struct cop_vars *vars = NULL; @@ -539,7 +540,7 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, - err_iov, err_buftype); + NULL, NULL, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && @@ -555,7 +556,7 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_QUERY_INFO, cfile, NULL, NULL); + SMB2_OP_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); } out: @@ -589,7 +590,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - err_iov, err_buftype); + NULL, NULL, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && @@ -606,7 +607,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL); + SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); } out: @@ -624,7 +625,7 
@@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); } void @@ -646,7 +647,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile, NULL, NULL); + &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -658,7 +659,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL, NULL, NULL); + NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL); } int @@ -667,7 +668,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL); } static int @@ -686,7 +687,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile, NULL, NULL); + command, cfile, NULL, NULL, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -727,7 +728,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL); + &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL); } int @@ -754,7 +755,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, - NULL, NULL); + NULL, NULL, NULL, NULL); cifs_put_tlink(tlink); return rc; } -- cgit From 64ce47cb1b29d7d6aab6dcc287ae1fddb4876bd5 Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Fri, 25 Nov 2022 12:37:44 +0100 Subject: cifs: Parse owner/group for stat in smb311 posix extensions stat was returning default owner and group (unlike readdir) for SMB3.1.1 POSIX extensions Signed-off-by: Volker Lendecke Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/inode.c | 13 +++++++++---- fs/cifs/smb2inode.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++--- fs/cifs/smb2proto.h | 5 ++++- 3 files changed, 59 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4e2ca3c6e5c0..286a5400b94e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -632,6 +632,8 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from POSIX info struct */ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, struct super_block *sb, bool adjust_tz, bool symlink) { struct smb311_posix_qinfo *info = &data->posix_fi; @@ -680,8 +682,8 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope } /* else if 
reparse point ... TODO: add support for FIFO and blk dev; special file types */ - fattr->cf_uid = cifs_sb->ctx->linux_uid; /* TODO: map uid and gid from SID */ - fattr->cf_gid = cifs_sb->ctx->linux_gid; + sid_to_id(cifs_sb, owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, group, fattr, SIDGROUP); cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n", fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); @@ -1175,6 +1177,7 @@ smb311_posix_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {0}; bool symlink = false; struct cifs_open_info_data data = {}; + struct cifs_sid owner, group; int rc = 0; int tmprc = 0; @@ -1192,7 +1195,8 @@ smb311_posix_get_inode_info(struct inode **inode, goto out; } - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz, + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, + &owner, &group, &adjust_tz, &symlink); /* @@ -1201,7 +1205,8 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(&fattr, &data, &owner, &group, + sb, adjust_tz, symlink); break; case -EREMOTE: /* DFS link, no metadata available on this server */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1be86ba950b3..fbd46db1023a 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -431,6 +431,21 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, (char *)&idata->posix_fi); } + if (rc == 0) { + unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); + + if (length > sizeof(idata->posix_fi)) { + char *base = (char *)rsp_iov[1].iov_base + + le16_to_cpu(qi_rsp->OutputBufferOffset) + + sizeof(idata->posix_fi); + *extbuflen = length - sizeof(idata->posix_fi); + *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); + if (!*extbuf) + rc = -ENOMEM; + } else { + rc = -EINVAL; + } + } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); if (rqst[2].rq_iov) @@ -569,13 +584,20 @@ out: int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) + struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; struct kvec err_iov[3] = {}; int err_buftype[3] = {}; + __u8 *sidsbuf = NULL; + __u8 *sidsbuf_end = NULL; + size_t sidsbuflen = 0; + size_t owner_len, group_len; *adjust_tz = false; *reparse = false; @@ -590,7 +612,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - NULL, NULL, err_iov, err_buftype); + &sidsbuf, &sidsbuflen, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && @@ -607,10 +629,31 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, 
NULL, NULL, NULL); + SMB2_OP_POSIX_QUERY_INFO, cfile, + &sidsbuf, &sidsbuflen, NULL, NULL); + } + + if (rc == 0) { + sidsbuf_end = sidsbuf + sidsbuflen; + + owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); + if (owner_len == -1) { + rc = -EINVAL; + goto out; + } + memcpy(owner, sidsbuf, owner_len); + + group_len = posix_info_sid_size( + sidsbuf + owner_len, sidsbuf_end); + if (group_len == -1) { + rc = -EINVAL; + goto out; + } + memcpy(group, sidsbuf + owner_len, group_len); } out: + kfree(sidsbuf); free_rsp_buf(err_buftype[0], err_iov[0].iov_base); free_rsp_buf(err_buftype[1], err_iov[1].iov_base); free_rsp_buf(err_buftype[2], err_iov[2].iov_base); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index be21b5d26f67..d5d7ffb7711c 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -277,7 +277,10 @@ extern int smb2_query_info_compound(const unsigned int xid, /* query path info from the server using SMB311 POSIX extensions*/ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); + struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + bool *adjust_tz, bool *reparse); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); -- cgit From 9381666e289852f93be9d7f4f7844017e04f6315 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:33 +0100 Subject: cifs: wire up ->migrate_folio CIFS does not use page private data that needs migration, so it can just wire up filemap_migrate_folio. This prepares for removing ->writepage, which is used as a fallback if no migrate_folio method is set. Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd9698209930..6be924caed39 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -5240,10 +5240,10 @@ const struct address_space_operations cifs_addr_ops = { .direct_IO = cifs_direct_io, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, + .migrate_folio = filemap_migrate_folio, /* - * TODO: investigate and if useful we could add an cifs_migratePage - * helper (under an CONFIG_MIGRATION) in the future, and also - * investigate and add an is_dirty_writeback helper if needed + * TODO: investigate and if useful we could add an is_dirty_writeback + * helper if needed */ .swap_activate = cifs_swap_activate, .swap_deactivate = cifs_swap_deactivate, @@ -5264,4 +5264,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .read_folio = cifs_read_folio, .release_folio = cifs_release_folio, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, + .migrate_folio = filemap_migrate_folio, }; -- cgit From bff9018d3a52c45711bd63c446a2c80c0275e935 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:34 +0100 Subject: cifs: stop using generic_writepages generic_writepages is just a wrapper that calls ->writepage on a range, and thus is in the way of eventually removing ->writepage. Switch cifs to just open code it in preparation of removing ->writepage.
[note: I suspect just integrating the small wsize case with the rest of the writeback code might be a better idea here, but that needs someone more familiar with the code] Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6be924caed39..ec14e38411a1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2646,6 +2646,21 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, return rc; } +static int +cifs_writepage_locked(struct page *page, struct writeback_control *wbc); + +static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret; + + ret = cifs_writepage_locked(page, wbc); + unlock_page(page); + mapping_set_error(mapping, ret); + return ret; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2662,10 +2677,11 @@ static int cifs_writepages(struct address_space *mapping, /* * If wsize is smaller than the page cache size, default to writing - * one page at a time via cifs_writepage + * one page at a time. */ if (cifs_sb->ctx->wsize < PAGE_SIZE) - return generic_writepages(mapping, wbc); + return write_cache_pages(mapping, wbc, cifs_write_one_page, + mapping); xid = get_xid(); if (wbc->range_cyclic) { -- cgit From ebaad77c89921c8237ca17791d5462bd289052d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:35 +0100 Subject: cifs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Now that cifs implements ->migrate_folio and doesn't call generic_writepages, the writepage method can be removed.
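For illustration, the pattern these two cifs patches converge on looks roughly like the sketch below (condensed from the diffs above; the example_* names stand in for the cifs helpers): the filesystem keeps a ->writepages implementation that walks the dirty range itself via write_cache_pages(), so no ->writepage entry is needed in address_space_operations once page migration goes through ->migrate_folio.

/* Sketch: open-coding generic_writepages() with a per-page callback. */
static int example_write_one_page(struct page *page,
				  struct writeback_control *wbc, void *data)
{
	struct address_space *mapping = data;
	int ret;

	ret = example_writepage_locked(page, wbc); /* fs-specific writeback */
	unlock_page(page);
	mapping_set_error(mapping, ret);
	return ret;
}

static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	/* Visit each dirty page in the range described by wbc. */
	return write_cache_pages(mapping, wbc, example_write_one_page, mapping);
}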
Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ec14e38411a1..6701257541ab 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2868,13 +2868,6 @@ retry_write: return rc; } -static int cifs_writepage(struct page *page, struct writeback_control *wbc) -{ - int rc = cifs_writepage_locked(page, wbc); - unlock_page(page); - return rc; -} - static int cifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) @@ -5247,7 +5240,6 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) const struct address_space_operations cifs_addr_ops = { .read_folio = cifs_read_folio, .readahead = cifs_readahead, - .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, @@ -5272,7 +5264,6 @@ const struct address_space_operations cifs_addr_ops = { */ const struct address_space_operations cifs_addr_ops_smallbuf = { .read_folio = cifs_read_folio, - .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, -- cgit From d406d26745aba3365ab9171b2d5cbea9c1757305 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 5 Dec 2022 23:31:53 -0300 Subject: cifs: skip alloc when request has no pages When smb3_init_transform_rq() was being called with requests (@old_rq) which had no pages, it was unnecessarily allocating a single page for every request in @new_rq. Fix this by skipping page array allocation when requests have no pages (e.g. !smb_rqst::rq_npages). Also get rid of deprecated kmap() and use kmap_local_page() instead while we're at it. 
Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bfaafd02fb1f..72b22d033ed5 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4445,21 +4445,27 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, int rc = -ENOMEM; for (i = 1; i < num_rqst; i++) { - npages = old_rq[i - 1].rq_npages; + struct smb_rqst *old = &old_rq[i - 1]; + struct smb_rqst *new = &new_rq[i]; + + orig_len += smb_rqst_len(server, old); + new->rq_iov = old->rq_iov; + new->rq_nvec = old->rq_nvec; + + npages = old->rq_npages; + if (!npages) + continue; + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto err_free; - new_rq[i].rq_pages = pages; - new_rq[i].rq_npages = npages; - new_rq[i].rq_offset = old_rq[i - 1].rq_offset; - new_rq[i].rq_pagesz = old_rq[i - 1].rq_pagesz; - new_rq[i].rq_tailsz = old_rq[i - 1].rq_tailsz; - new_rq[i].rq_iov = old_rq[i - 1].rq_iov; - new_rq[i].rq_nvec = old_rq[i - 1].rq_nvec; - - orig_len += smb_rqst_len(server, &old_rq[i - 1]); + new->rq_pages = pages; + new->rq_npages = npages; + new->rq_offset = old->rq_offset; + new->rq_pagesz = old->rq_pagesz; + new->rq_tailsz = old->rq_tailsz; for (j = 0; j < npages; j++) { pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); @@ -4472,14 +4478,14 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, char *dst, *src; unsigned int offset, len; - rqst_page_get_length(&new_rq[i], j, &len, &offset); + rqst_page_get_length(new, j, &len, &offset); - dst = (char *) kmap(new_rq[i].rq_pages[j]) + offset; - src = (char *) kmap(old_rq[i - 1].rq_pages[j]) + offset; + dst = kmap_local_page(new->rq_pages[j]) + offset; + src = kmap_local_page(old->rq_pages[j]) + offset; memcpy(dst, src, len); - kunmap(new_rq[i].rq_pages[j]); - kunmap(old_rq[i - 1].rq_pages[j]); + kunmap(new->rq_pages[j]); + kunmap(old->rq_pages[j]); } } -- cgit From 52f31ed228212ba572c44e15e818a3a5c74122c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 8 Dec 2022 08:29:22 -0800 Subject: xfs: dquot shrinker doesn't check for XFS_DQFLAG_FREEING Resulting in a UAF if the shrinker races with some other dquot freeing mechanism that sets XFS_DQFLAG_FREEING before the dquot is removed from the LRU. This can occur if a dquot purge races with drop_caches. Reported-by: syzbot+912776840162c13db1a3@syzkaller.appspotmail.com Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_qm.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 18bb4ec4d7c9..ff53d40a2dae 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -422,6 +422,14 @@ xfs_qm_dquot_isolate( if (!xfs_dqlock_nowait(dqp)) goto out_miss_busy; + /* + * If something else is freeing this dquot and hasn't yet removed it + * from the LRU, leave it for the freeing task to complete the freeing + * process rather than risk it being free from under us here. + */ + if (dqp->q_flags & XFS_DQFLAG_FREEING) + goto out_miss_unlock; + /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. @@ -441,10 +449,8 @@ xfs_qm_dquot_isolate( * skip it so there is time for the IO to complete before we try to * reclaim it again on the next LRU pass. 
*/ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - goto out_miss_busy; - } + if (!xfs_dqflock_nowait(dqp)) + goto out_miss_unlock; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -478,6 +484,8 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; +out_miss_unlock: + xfs_dqunlock(dqp); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); -- cgit From 1c8a8ec0a0e9a1176022a35c4daf04fe1594d270 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:44 +0100 Subject: f2fs: remove struct segment_allocation default_salloc_ops There is only a single instance of these ops, so remove the indirection and call allocate_segment_by_default directly. Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++--------- fs/f2fs/segment.h | 6 ------ 2 files changed, 2 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0ff451ea18f6..bbe6556799ce 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2926,7 +2926,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + allocate_segment_by_default(sbi, type, true); locate_dirty_segment(sbi, old_segno); } @@ -2957,10 +2957,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; - bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { @@ -3284,7 +3280,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); else - sit_i->s_ops->allocate_segment(sbi, type, false); + allocate_segment_by_default(sbi, type, false); } /* * segment dirty status should be updated after segment allocation, @@ -4270,9 +4266,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; - sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index be8f2d7d007b..3ad1b7b6fa94 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -222,10 +222,6 @@ struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; -struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); -}; - #define MAX_SKIP_GC_COUNT 16 struct revoke_entry { @@ -235,8 +231,6 @@ struct revoke_entry { }; struct sit_info { - const struct segment_allocation *s_ops; - block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ -- cgit From 8442d94b8ac8d5d8300725a9ffa9def526b71170 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:45 +0100 Subject: f2fs: open code allocate_segment_by_default allocate_segment_by_default has just two callers, which use very different code paths inside it based on the force parameter. Just open code the logic in the two callers using a new helper to decide if a new segment should be allocated.
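Condensed from the diff that follows, the resulting call sites look roughly like this sketch (the force == true caller always takes a fresh segment, while the force == false caller consults the new helper first):

/* force == true path (__allocate_new_segment): */
	new_curseg(sbi, type, true);
	stat_inc_seg_type(sbi, curseg);

/* force == false path (f2fs_allocate_data_block): */
	if (need_new_seg(sbi, type))
		new_curseg(sbi, type, false);
	else
		change_curseg(sbi, type, true);
	stat_inc_seg_type(sbi, curseg);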
Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bbe6556799ce..c4e118eb7d19 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2849,31 +2849,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, return 0; } -/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) - new_curseg(sbi, type, true); - else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - curseg->seg_type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && - is_next_segment_free(sbi, curseg, type) && - likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && - get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); - - stat_inc_seg_type(sbi, curseg); + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; } void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, @@ -2926,7 +2915,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - allocate_segment_by_default(sbi, type, true); + new_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } @@ -3276,11 +3266,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, update_sit_entry(sbi, old_blkaddr, -1); if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + /* + * Flush out current segment and replace it with new segment. 
+ */ + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - allocate_segment_by_default(sbi, type, false); + } else { + if (need_new_seg(sbi, type)) + new_curseg(sbi, type, false); + else + change_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); + } } /* * segment dirty status should be updated after segment allocation, -- cgit From 5bcd655fffaec24e849bda1207446f5cc821713e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:46 +0100 Subject: f2fs: remove the unused flush argument to change_curseg Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c4e118eb7d19..9486ca49ecb1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2656,7 +2656,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2664,9 +2664,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) struct f2fs_summary_block *sum_node; struct page *sum_page; - if (flush) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -2705,7 +2703,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; @@ -2880,7 +2878,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, true); @@ -3276,7 +3274,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (need_new_seg(sbi, type)) new_curseg(sbi, type, false); else - change_curseg(sbi, type, true); + change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } } @@ -3539,7 +3537,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3567,7 +3565,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; -- cgit From 870af777da22505851174a34c0228042d7ed5f5f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 25 Nov 2022 19:47:36 +0800 Subject: f2fs: do some cleanup for f2fs module init Just for cleanup, no functional changes. 
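The bulk of this cleanup is one recurring simplification; sketched with an illustrative slab cache (not one of the actual f2fs caches), the explicit NULL check collapses into a ternary return:

/* Before: */
	example_slab = f2fs_kmem_cache_create("example", sizeof(struct example));
	if (!example_slab)
		return -ENOMEM;
	return 0;

/* After: */
	example_slab = f2fs_kmem_cache_create("example", sizeof(struct example));
	return example_slab ? 0 : -ENOMEM;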
Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 46 ++++++---------------------------------------- fs/f2fs/data.c | 14 ++++---------- fs/f2fs/gc.c | 4 +--- fs/f2fs/recovery.c | 4 +--- fs/f2fs/super.c | 8 ++------ 5 files changed, 14 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 74d3f2d2271f..9723f0bed923 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -567,10 +567,7 @@ MODULE_PARM_DESC(num_compress_pages, int f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); - if (!compress_page_pool) - return -ENOMEM; - - return 0; + return compress_page_pool ? 0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) @@ -1983,9 +1980,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); - if (!sbi->page_array_slab) - return -ENOMEM; - return 0; + return sbi->page_array_slab ? 0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) @@ -1993,53 +1988,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) kmem_cache_destroy(sbi->page_array_slab); } -static int __init f2fs_init_cic_cache(void) +int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; - return 0; -} - -static void f2fs_destroy_cic_cache(void) -{ - kmem_cache_destroy(cic_entry_slab); -} - -static int __init f2fs_init_dic_cache(void) -{ dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) - return -ENOMEM; - return 0; -} - -static void f2fs_destroy_dic_cache(void) -{ - kmem_cache_destroy(dic_entry_slab); -} - -int __init f2fs_init_compress_cache(void) -{ - int err; - - err = f2fs_init_cic_cache(); - if (err) - goto out; - err = f2fs_init_dic_cache(); - if (err) goto free_cic; return 0; free_cic: - f2fs_destroy_cic_cache(); -out: + kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { - f2fs_destroy_dic_cache(); - f2fs_destroy_cic_cache(); + kmem_cache_destroy(dic_entry_slab); + kmem_cache_destroy(cic_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 560fa80590e9..35c19248b1e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -39,10 +39,8 @@ static struct bio_set f2fs_bioset; int __init f2fs_init_bioset(void) { - if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, - 0, BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; + return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) @@ -4090,9 +4088,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); - if (!sbi->post_read_wq) - return -ENOMEM; - return 0; + return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) @@ -4105,9 +4101,7 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - if (!bio_entry_slab) - return -ENOMEM; - return 0; + return bio_entry_slab ? 
0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f1b68eda2235..d19e26b2e875 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1904,9 +1904,7 @@ int __init f2fs_create_garbage_collection_cache(void) { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); - if (!victim_entry_slab) - return -ENOMEM; - return 0; + return victim_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_garbage_collection_cache(void) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dea95b48b647..77fd453949b1 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -923,9 +923,7 @@ int __init f2fs_create_recovery_cache(void) { fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; - return 0; + return fsync_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_recovery_cache(void) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index daf14b55a972..a5f6f632cf7c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -288,9 +288,7 @@ static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", F2FS_NAME_LEN); - if (!f2fs_cf_name_slab) - return -ENOMEM; - return 0; + return f2fs_cf_name_slab ? 0 : -ENOMEM; } static void f2fs_destroy_casefold_cache(void) @@ -4647,9 +4645,7 @@ static int __init init_inodecache(void) { f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); - if (!f2fs_inode_cachep) - return -ENOMEM; - return 0; + return f2fs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) -- cgit From 78742d4d056df7d2fad241c90185d281bf924844 Mon Sep 17 00:00:00 2001 From: Luís Henriques Date: Tue, 11 Oct 2022 16:57:58 +0100 Subject: ext4: remove trailing newline from ext4_msg() message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ext4_msg() function adds a new line to the message. Remove extra '\n' from call to ext4_msg() in ext4_orphan_cleanup(). Signed-off-by: Luís Henriques Link: https://lore.kernel.org/r/20221011155758.15287-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/orphan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 69a9cf9137a6..e5b47dda3317 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -412,7 +412,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " - "clearing orphan list.\n"); + "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); -- cgit From d323877484765aaacbb2769b06e355c2041ed115 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 26 Oct 2022 12:23:07 +0800 Subject: ext4: fix bug_on in __es_tree_search caused by bad quota inode We got an issue as follows: ================================================================== kernel BUG at fs/ext4/extents_status.c:202!
invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 810 Comm: mount Not tainted 6.1.0-rc1-next-g9631525255e3 #352 RIP: 0010:__es_tree_search.isra.0+0xb8/0xe0 RSP: 0018:ffffc90001227900 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000077512a0f RCX: 0000000000000000 RDX: 0000000000000002 RSI: 0000000000002a10 RDI: ffff8881004cd0c8 RBP: ffff888177512ac8 R08: 47ffffffffffffff R09: 0000000000000001 R10: 0000000000000001 R11: 00000000000679af R12: 0000000000002a10 R13: ffff888177512d88 R14: 0000000077512a10 R15: 0000000000000000 FS: 00007f4bd76dbc40(0000)GS:ffff88842fd00000(0000)knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005653bf993cf8 CR3: 000000017bfdf000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ext4_es_cache_extent+0xe2/0x210 ext4_cache_extents+0xd2/0x110 ext4_find_extent+0x5d5/0x8c0 ext4_ext_map_blocks+0x9c/0x1d30 ext4_map_blocks+0x431/0xa50 ext4_getblk+0x82/0x340 ext4_bread+0x14/0x110 ext4_quota_read+0xf0/0x180 v2_read_header+0x24/0x90 v2_check_quota_file+0x2f/0xa0 dquot_load_quota_sb+0x26c/0x760 dquot_load_quota_inode+0xa5/0x190 ext4_enable_quotas+0x14c/0x300 __ext4_fill_super+0x31cc/0x32c0 ext4_fill_super+0x115/0x2d0 get_tree_bdev+0x1d2/0x360 ext4_get_tree+0x19/0x30 vfs_get_tree+0x26/0xe0 path_mount+0x81d/0xfc0 do_mount+0x8d/0xc0 __x64_sys_mount+0xc0/0x160 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ================================================================== The above issue may happen as follows: ------------------------------------- ext4_fill_super ext4_orphan_cleanup ext4_enable_quotas ext4_quota_enable ext4_iget --> get error inode <5> ext4_ext_check_inode --> Wrong imode makes it escape inspection make_bad_inode(inode) --> EXT4_BOOT_LOADER_INO set imode dquot_load_quota_inode vfs_setup_quota_inode --> check pass dquot_load_quota_sb v2_check_quota_file v2_read_header ext4_quota_read ext4_bread ext4_getblk ext4_map_blocks ext4_ext_map_blocks ext4_find_extent ext4_cache_extents ext4_es_cache_extent __es_tree_search.isra.0 ext4_es_end --> Wrong extents trigger BUG_ON In the above issue, s_usr_quota_inum is set to 5, but inode<5> contains incorrect imode and disordered extents. Because 5 is EXT4_BOOT_LOADER_INO, the ext4_ext_check_inode check in the ext4_iget function can be bypassed; finally, the extents that are not checked trigger the BUG_ON in the __es_tree_search function. To solve this issue, check whether the inode is bad_inode in vfs_setup_quota_inode().
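The fix itself is a single early guard; as the diff below shows, vfs_setup_quota_inode() now rejects a bad inode before the other sanity checks run, sketched here for clarity:

	/* Reject inodes that were marked bad (e.g. bogus imode) at iget time. */
	if (is_bad_inode(inode))
		return -EUCLEAN;
	if (!S_ISREG(inode->i_mode))
		return -EACCES;
	if (IS_RDONLY(inode))
		return -EROFS;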
Signed-off-by: Baokun Li Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jason Yan Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221026042310.3839669-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/quota/dquot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0427b44bfee5..f27faf5db554 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2324,6 +2324,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type) struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); + if (is_bad_inode(inode)) + return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode)) -- cgit From 07342ec259df2a35d6a34aebce010567a80a0e15 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 26 Oct 2022 12:23:08 +0800 Subject: ext4: add helper to check quota inums Before quota is enabled, a check on the preset quota inums in ext4_super_block is added to prevent wrong quota inodes from being loaded. In addition, when the quota fails to be enabled, the quota type and quota inum are printed to facilitate fault locating. Signed-off-by: Baokun Li Reviewed-by: Jason Yan Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221026042310.3839669-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0f71542cf453..6f944f5e4b3f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6886,6 +6886,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return err; } +static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) +{ + switch (type) { + case USRQUOTA: + return qf_inum == EXT4_USR_QUOTA_INO; + case GRPQUOTA: + return qf_inum == EXT4_GRP_QUOTA_INO; + case PRJQUOTA: + return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; + default: + BUG(); + } +} + static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { @@ -6902,9 +6916,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM; + if (!ext4_check_quota_inum(type, qf_inums[type])) { + ext4_error(sb, "Bad quota inum: %lu, type: %d", + qf_inums[type], type); + return -EUCLEAN; + } + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { - ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + ext4_error(sb, "Bad quota inode: %lu, type: %d", + qf_inums[type], type); return PTR_ERR(qf_inode); } @@ -6943,8 +6964,9 @@ int ext4_enable_quotas(struct super_block *sb) if (err) { ext4_warning(sb, "Failed to enable quota tracking " - "(type=%d, err=%d). Please run " - "e2fsck to fix.", type, err); + "(type=%d, err=%d, ino=%lu). " + "Please run e2fsck to fix.", type, + err, qf_inums[type]); for (type--; type >= 0; type--) { struct inode *inode; -- cgit From 63b1e9bccb71fe7d7e3ddc9877dbdc85e5d2d023 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 26 Oct 2022 12:23:09 +0800 Subject: ext4: add EXT4_IGET_BAD flag to prevent unexpected bad inode There are many places that will get unhappy (and crash) when ext4_iget() returns a bad inode. However, when the boot loader inode is loaded with iget, a bad inode is allowed to be returned, because the inode may not be initialized. This mechanism can be used to bypass some checks and cause a panic. To solve this problem, we add a special iget flag EXT4_IGET_BAD.
Only with this flag do we return a bad inode from ext4_iget(); otherwise we always return an error code if the inode is a bad inode (suggested by Jan Kara). Signed-off-by: Baokun Li Reviewed-by: Jason Yan Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221026042310.3839669-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 3 ++- fs/ext4/inode.c | 8 +++++++- fs/ext4/ioctl.c | 3 ++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8d5453852f98..2b574b143bde 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2964,7 +2964,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ - EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ + EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ + EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6d16241666e6..a4bf643aa08b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5058,8 +5058,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); - brelse(iloc.bh); + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + ext4_error_inode(inode, function, line, 0, + "bad inode without EXT4_IGET_BAD flag"); + ret = -EUCLEAN; + goto bad_inode; + } + brelse(iloc.bh); unlock_new_inode(inode); return inode; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 95dfea28bf4e..9ed7b9fe2132 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -374,7 +374,8 @@ static long swap_inode_boot_loader(struct super_block *sb, blkcnt_t blocks; unsigned short bytes; - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, + EXT4_IGET_SPECIAL | EXT4_IGET_BAD); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); -- cgit From 991ed014de0840c5dc405b679168924afb2952ac Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 26 Oct 2022 12:23:10 +0800 Subject: ext4: fix bug_on in __es_tree_search caused by bad boot loader inode We got an issue as follows: ================================================================== kernel BUG at fs/ext4/extents_status.c:203!
invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 945 Comm: cat Not tainted 6.0.0-next-20221007-dirty #349 RIP: 0010:ext4_es_end.isra.0+0x34/0x42 RSP: 0018:ffffc9000143b768 EFLAGS: 00010203 RAX: 0000000000000000 RBX: ffff8881769cd0b8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8fc27cf7 RDI: 00000000ffffffff RBP: ffff8881769cd0bc R08: 0000000000000000 R09: ffffc9000143b5f8 R10: 0000000000000001 R11: 0000000000000001 R12: ffff8881769cd0a0 R13: ffff8881768e5668 R14: 00000000768e52f0 R15: 0000000000000000 FS: 00007f359f7f05c0(0000)GS:ffff88842fd00000(0000)knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f359f5a2000 CR3: 000000017130c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __es_tree_search.isra.0+0x6d/0xf5 ext4_es_cache_extent+0xfa/0x230 ext4_cache_extents+0xd2/0x110 ext4_find_extent+0x5d5/0x8c0 ext4_ext_map_blocks+0x9c/0x1d30 ext4_map_blocks+0x431/0xa50 ext4_mpage_readpages+0x48e/0xe40 ext4_readahead+0x47/0x50 read_pages+0x82/0x530 page_cache_ra_unbounded+0x199/0x2a0 do_page_cache_ra+0x47/0x70 page_cache_ra_order+0x242/0x400 ondemand_readahead+0x1e8/0x4b0 page_cache_sync_ra+0xf4/0x110 filemap_get_pages+0x131/0xb20 filemap_read+0xda/0x4b0 generic_file_read_iter+0x13a/0x250 ext4_file_read_iter+0x59/0x1d0 vfs_read+0x28f/0x460 ksys_read+0x73/0x160 __x64_sys_read+0x1e/0x30 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ================================================================== In the above issue, ioctl invokes the swap_inode_boot_loader function to swap inode<5> and inode<12>. However, inode<5> contains incorrect imode and disordered extents, and i_nlink is set to 1. The extents check for the inode in the ext4_iget function can be bypassed because 5 is EXT4_BOOT_LOADER_INO. While links_count is set to 1, the extents are not initialized in swap_inode_boot_loader. After the ioctl command is executed successfully, the extents are swapped to inode<12>; in this case, run the `cat` command to view inode<12>, and BUG_ON is triggered due to the incorrect extents. When the boot loader inode is not initialized, its imode can be one of the following: 1) the imode is a bad type, which is marked as bad_inode in ext4_iget and set to S_IFREG. 2) the imode is a good type but not S_IFREG. 3) the imode is S_IFREG. The BUG_ON may be triggered by bypassing the check in cases 1 and 2. Therefore, when the boot loader inode is bad_inode or its imode is not S_IFREG, initialize the inode to avoid triggering the BUG.
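Sketching the resulting check (the full diff follows): instead of keying on i_nlink alone, swap_inode_boot_loader() now treats a bad or non-regular boot loader inode as never used and (re)initializes it:

	if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) {
		/* this inode has never been used as a BOOT_LOADER */
		set_nlink(inode_bl, 1);
		i_uid_write(inode_bl, 0);
		/* ... remaining initialization as in the original function ... */
	}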
Signed-off-by: Baokun Li Reviewed-by: Jason Yan Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221026042310.3839669-5-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 9ed7b9fe2132..e5f60057db5b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -425,7 +425,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); - if (inode_bl->i_nlink == 0) { + if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0); -- cgit From 3bf678a0f9c017c9ba7c581541dbc8453452a7ae Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 31 Oct 2022 13:58:33 +0800 Subject: ext4: fix undefined behavior in bit shift for ext4_check_flag_values Shifting a signed 32-bit value by 31 bits is undefined, so change the shifted value to unsigned. The UBSAN warning call trace looks like this: UBSAN: shift-out-of-bounds in fs/ext4/ext4.h:591:2 left shift of 1 by 31 places cannot be represented in type 'int' Call Trace: dump_stack_lvl+0x7d/0xa5 dump_stack+0x15/0x1b ubsan_epilogue+0xe/0x4e __ubsan_handle_shift_out_of_bounds+0x1e7/0x20c ext4_init_fs+0x5a/0x277 do_one_initcall+0x76/0x430 kernel_init_freeable+0x3b3/0x422 kernel_init+0x24/0x1e0 ret_from_fork+0x1f/0x30 Fixes: 9a4c80194713 ("ext4: ensure Inode flags consistency are checked at build time") Signed-off-by: Gaosheng Cui Link: https://lore.kernel.org/r/20221031055833.3966222-1-cuigaosheng1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2b574b143bde..3afdd99bb214 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -558,7 +558,7 @@ enum { * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) -- cgit From 105c78e12468413e426625831faa7db4284e1fec Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 1 Nov 2022 22:33:12 -0700 Subject: ext4: don't allow journal inode to have encrypt flag Mounting a filesystem whose journal inode has the encrypt flag causes a NULL dereference in fscrypt_limit_io_blocks() when the 'inlinecrypt' mount option is used. The problem is that when jbd2_journal_init_inode() calls bmap(), it eventually finds its way into ext4_iomap_begin(), which calls fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks() requires that if the inode is encrypted, then its encryption key must already be set up. That's not the case here, since the journal inode is never "opened" like a normal file would be. Hence the crash. A reproducer is: mkfs.ext4 -F /dev/vdb debugfs -w /dev/vdb -R "set_inode_field <8> flags 0x80808" mount /dev/vdb /mnt -o inlinecrypt To fix this, make ext4 consider journal inodes with the encrypt flag to be invalid. (Note, maybe other flags should be rejected on the journal inode too. For now, this is just the minimal fix for the above issue.)
I've marked this as fixing the commit that introduced the call to fscrypt_limit_io_blocks(), since that's what made an actual crash start being possible. But this fix could be applied to any version of ext4 that supports the encrypt feature. Reported-by: syzbot+ba9dac45bc76c490b7c3@syzkaller.appspotmail.com Fixes: 38ea50daa7a4 ("ext4: support direct I/O with fscrypt using blk-crypto") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221102053312.189962-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6f944f5e4b3f..9185131e01e5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5723,7 +5723,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { + if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; -- cgit From a71248b1accb2b42e4980afef4fa4a27fa0e36f5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 2 Nov 2022 16:06:33 +0800 Subject: ext4: fix use-after-free in ext4_orphan_cleanup I caught an issue as follows: ================================================================== BUG: KASAN: use-after-free in __list_add_valid+0x28/0x1a0 Read of size 8 at addr ffff88814b13f378 by task mount/710 CPU: 1 PID: 710 Comm: mount Not tainted 6.1.0-rc3-next #370 Call Trace: dump_stack_lvl+0x73/0x9f print_report+0x25d/0x759 kasan_report+0xc0/0x120 __asan_load8+0x99/0x140 __list_add_valid+0x28/0x1a0 ext4_orphan_cleanup+0x564/0x9d0 [ext4] __ext4_fill_super+0x48e2/0x5300 [ext4] ext4_fill_super+0x19f/0x3a0 [ext4] get_tree_bdev+0x27b/0x450 ext4_get_tree+0x19/0x30 [ext4] vfs_get_tree+0x49/0x150 path_mount+0xaae/0x1350 do_mount+0xe2/0x110 __x64_sys_mount+0xf0/0x190 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] ================================================================== The above issue may happen as follows: ------------------------------------- ext4_fill_super ext4_orphan_cleanup --- loop1: assume last_orphan is 12 --- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan) ext4_truncate --> return 0 ext4_inode_attach_jinode --> return -ENOMEM iput(inode) --> free inode<12> --- loop2: last_orphan is still 12 --- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); // use inode<12> and trigger UAF To solve this issue, we need to propagate the return value of ext4_inode_attach_jinode() appropriately.
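The fix is the classic error-propagation pattern; condensed from the diff below, ext4_truncate() now stores the result of ext4_inode_attach_jinode() instead of discarding it, so the failure reaches the orphan-cleanup caller:

/* Before: a failure fell through with err still holding the old value. */
	if (ext4_inode_attach_jinode(inode) < 0)
		goto out_trace;

/* After: the error is propagated to the caller. */
	err = ext4_inode_attach_jinode(inode);
	if (err)
		goto out_trace;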
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221102080633.1630225-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a4bf643aa08b..181bc161b1ac 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4231,7 +4231,8 @@ int ext4_truncate(struct inode *inode) /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { - if (ext4_inode_attach_jinode(inode) < 0) + err = ext4_inode_attach_jinode(inode); + if (err) goto out_trace; } -- cgit From 0fbcb5251fc81b58969b272c4fb7374a7b922e3e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:35 -0800 Subject: ext4: disable fast-commit of encrypted dir operations fast-commit of create, link, and unlink operations in encrypted directories is completely broken because the unencrypted filenames are being written to the fast-commit journal instead of the encrypted filenames. These operations can't be replayed, as encryption keys aren't present at journal replay time. It is also an information leak. Until if/when we can get this working properly, make encrypted directory operations ineligible for fast-commit. Note that fast-commit operations on encrypted regular files continue to be allowed, as they seem to work. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 41 +++++++++++++++++++++++++---------------- fs/ext4/fast_commit.h | 1 + 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0f6d0a80467d..6d98f2b39b77 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -420,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); mutex_unlock(&ei->i_fc_lock); + + if (IS_ENCRYPTED(dir)) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, + NULL); + mutex_lock(&ei->i_fc_lock); + return -EOPNOTSUPP; + } + node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; - node->fcd_parent = dentry->d_parent->d_inode->i_ino; + node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; if (dentry->d_name.len > DNAME_INLINE_LEN) { node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM, NULL); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -2249,17 +2258,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) journal->j_fc_cleanup_callback = ext4_fc_cleanup; } -static const char *fc_ineligible_reasons[] = { - "Extended 
attributes changed", - "Cross rename", - "Journal flag changed", - "Insufficient memory", - "Swap boot", - "Resize", - "Dir renamed", - "Falloc range op", - "Data journalling", - "FC Commit Failed" +static const char * const fc_ineligible_reasons[] = { + [EXT4_FC_REASON_XATTR] = "Extended attributes changed", + [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", + [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", + [EXT4_FC_REASON_NOMEM] = "Insufficient memory", + [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", + [EXT4_FC_REASON_RESIZE] = "Resize", + [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", + [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", + [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", + [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index a6154c3ed135..256f2ad27204 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -96,6 +96,7 @@ enum { EXT4_FC_REASON_RENAME_DIR, EXT4_FC_REASON_FALLOC_RANGE, EXT4_FC_REASON_INODE_JOURNAL_DATA, + EXT4_FC_REASON_ENCRYPTED_FILENAME, EXT4_FC_REASON_MAX }; -- cgit From 4c0d5778385cb3618ff26a561ce41de2b7d9de70 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:36 -0800 Subject: ext4: don't set up encryption key during jbd2 transaction Commit a80f7fcf1867 ("ext4: fixup ext4_fc_track_* functions' signature") extended the scope of the transaction in ext4_unlink() too far, making it include the call to ext4_find_entry(). However, ext4_find_entry() can deadlock when called from within a transaction because it may need to set up the directory's encryption key. Fix this by restoring the transaction to its original scope. Reported-by: syzbot+1a748d0007eeac3ab079@syzkaller.appspotmail.com Fixes: a80f7fcf1867 ("ext4: fixup ext4_fc_track_* functions' signature") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-3-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/fast_commit.c | 2 +- fs/ext4/namei.c | 44 ++++++++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3afdd99bb214..4e739902dc03 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3620,8 +3620,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); -extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, - struct inode *inode); +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6d98f2b39b77..da0c8228cf9c 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1397,7 +1397,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, return 0; } - ret = __ext4_unlink(NULL, old_parent, &entry, inode); + ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. 
*/ if (ret == -ENOENT) ret = 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c08c0aba1883..a789ea9b61a0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3204,14 +3204,20 @@ end_rmdir: return retval; } -int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, - struct inode *inode) +int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, + struct dentry *dentry /* NULL during fast_commit recovery */) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; + handle_t *handle; int skip_remove_dentry = 0; + /* + * Keep this outside the transaction; it may have to set up the + * directory's encryption key, which isn't GFP_NOFS-safe. + */ bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); @@ -3228,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else - goto out; + goto out_bh; + } + + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_bh; } if (IS_DIRSYNC(dir)) @@ -3237,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto out; + goto out_handle; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto out; + goto out_handle; } else { retval = 0; } @@ -3255,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); - -out: + if (dentry && !retval) + ext4_fc_track_unlink(handle, dentry); +out_handle: + ext4_journal_stop(handle); +out_bh: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { - handle_t *handle; int retval; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) @@ -3281,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (retval) goto out_trace; - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) { - retval = PTR_ERR(handle); - goto out_trace; - } - - retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); - if (!retval) - ext4_fc_track_unlink(handle, dentry); + retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -3301,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - if (handle) - ext4_journal_stop(handle); out_trace: trace_ext4_unlink_exit(dentry, retval); -- cgit From 594bc43b410316d70bb42aeff168837888d96810 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:37 -0800 Subject: ext4: fix leaking uninitialized memory in fast-commit journal When space at the end of fast-commit journal blocks is unused, make sure to zero it out so that uninitialized memory is not leaked to disk. 
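The rule being enforced is general: any fixed-size block assembled in a freshly allocated buffer must have its unused tail cleared before it is written out, or stale heap contents reach the device. A self-contained userspace sketch of that discipline (block size and names are assumptions, not the fast-commit code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BLOCK_SIZE 4096

static unsigned char *build_block(const void *payload, size_t used)
{
	unsigned char *blk = malloc(BLOCK_SIZE); /* heap contents are stale */

	if (!blk)
		return NULL;
	memcpy(blk, payload, used);
	/* the fix in miniature: clear the unused tail before writeout */
	memset(blk + used, 0, BLOCK_SIZE - used);
	return blk;
}

int main(void)
{
	static const char tail_tag[] = "fc tail";
	unsigned char *blk = build_block(tail_tag, sizeof(tail_tag));

	if (blk) {
		printf("last byte: %u\n", blk[BLOCK_SIZE - 1]); /* 0, not garbage */
		free(blk);
	}
	return 0;
}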
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index da0c8228cf9c..1e8be0554239 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -737,6 +737,9 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN); if (pad_len > 0) ext4_fc_memzero(sb, tl + 1, pad_len, crc); + /* Don't leak uninitialized memory in the unused last byte. */ + *((u8 *)(tl + 1) + pad_len) = 0; + ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); @@ -793,6 +796,8 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) dst += sizeof(tail.fc_tid); tail.fc_crc = cpu_to_le32(crc); ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); + dst += sizeof(tail.fc_crc); + memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); -- cgit From 64b4a25c3de81a69724e888ec2db3533b43816e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:38 -0800 Subject: ext4: add missing validation of fast-commit record lengths Validate the inode and filename lengths in fast-commit journal records so that a malicious fast-commit journal cannot cause a crash by having invalid values for these. Also validate EXT4_FC_TAG_DEL_RANGE. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-5-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 38 +++++++++++++++++++------------------- fs/ext4/fast_commit.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1e8be0554239..d5ad4b2b235d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1991,32 +1991,31 @@ void ext4_fc_replay_cleanup(struct super_block *sb) kfree(sbi->s_fc_replay_state.fc_modified_inodes); } -static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl, - u8 *val, u8 *end) +static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, + int tag, int len) { - if (val + tl->fc_len > end) - return false; - - /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do - * journal rescan before do CRC check. Other tags length check will - * rely on CRC check. 
- */ - switch (tl->fc_tag) { + switch (tag) { case EXT4_FC_TAG_ADD_RANGE: - return (sizeof(struct ext4_fc_add_range) == tl->fc_len); - case EXT4_FC_TAG_TAIL: - return (sizeof(struct ext4_fc_tail) <= tl->fc_len); - case EXT4_FC_TAG_HEAD: - return (sizeof(struct ext4_fc_head) == tl->fc_len); + return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: + return len == sizeof(struct ext4_fc_del_range); + case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: - case EXT4_FC_TAG_CREAT: + len -= sizeof(struct ext4_fc_dentry_info); + return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: + len -= sizeof(struct ext4_fc_inode); + return len >= EXT4_GOOD_OLD_INODE_SIZE && + len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: - default: - return true; + return true; /* padding can have any length */ + case EXT4_FC_TAG_TAIL: + return len >= sizeof(struct ext4_fc_tail); + case EXT4_FC_TAG_HEAD: + return len == sizeof(struct ext4_fc_head); } + return false; } /* @@ -2079,7 +2078,8 @@ static int ext4_fc_replay_scan(journal_t *journal, cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; - if (!ext4_fc_tag_len_isvalid(&tl, val, end)) { + if (tl.fc_len > end - val || + !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 256f2ad27204..2fadb2c4780c 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -58,7 +58,7 @@ struct ext4_fc_dentry_info { __u8 fc_dname[]; }; -/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ +/* Value structure for EXT4_FC_TAG_INODE. */ struct ext4_fc_inode { __le32 fc_ino; __u8 fc_raw_inode[]; -- cgit From 8415ce07ecf0cc25efdd5db264a7133716e503cf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:39 -0800 Subject: ext4: fix unaligned memory access in ext4_fc_reserve_space() As is done elsewhere in the file, build the struct ext4_fc_tl on the stack and memcpy() it into the buffer, rather than directly writing it to a potentially-unaligned location in the buffer. 
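The underlying pattern: storing through a struct pointer cast from an arbitrary buffer offset is undefined behaviour on strict-alignment architectures, while building the struct on the stack and memcpy()ing the bytes is safe everywhere. A standalone sketch (simplified header layout, not the real struct ext4_fc_tl):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct tl {			/* simplified stand-in for a TLV header */
	uint16_t tag;
	uint16_t len;
};

int main(void)
{
	unsigned char buf[64];
	size_t off = 3;		/* deliberately not 2-byte aligned */
	struct tl tl = { .tag = 5, .len = 12 };
	struct tl out;

	/* unsafe on strict-alignment CPUs:
	 *	*(struct tl *)(buf + off) = tl;
	 * safe everywhere: build on the stack, then byte-copy */
	memcpy(buf + off, &tl, sizeof(tl));
	memcpy(&out, buf + off, sizeof(out));	/* reads need the same care */
	printf("tag=%u len=%u\n", out.tag, out.len);
	return 0;
}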
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-6-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index d5ad4b2b235d..892fa7c7a768 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -675,6 +675,15 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) /* Ext4 commit path routines */ +/* memcpy to fc reserved space and update CRC */ +static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, + int len, u32 *crc) +{ + if (crc) + *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); + return memcpy(dst, src, len); +} + /* memzero and update CRC */ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, u32 *crc) @@ -700,12 +709,13 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { - struct ext4_fc_tl *tl; + struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int pad_len; + u8 *dst; /* * After allocating len, we should have space at least for a 0 byte @@ -729,16 +739,18 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) return sbi->s_fc_bh->b_data + off; } /* Need to add PAD tag */ - tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); - tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); + dst = sbi->s_fc_bh->b_data + off; + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN; - tl->fc_len = cpu_to_le16(pad_len); - if (crc) - *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN); - if (pad_len > 0) - ext4_fc_memzero(sb, tl + 1, pad_len, crc); + tl.fc_len = cpu_to_le16(pad_len); + ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + dst += EXT4_FC_TAG_BASE_LEN; + if (pad_len > 0) { + ext4_fc_memzero(sb, dst, pad_len, crc); + dst += pad_len; + } /* Don't leak uninitialized memory in the unused last byte. */ - *((u8 *)(tl + 1) + pad_len) = 0; + *dst = 0; ext4_fc_submit_bh(sb, false); @@ -750,15 +762,6 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) return sbi->s_fc_bh->b_data; } -/* memcpy to fc reserved space and update CRC */ -static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, - int len, u32 *crc) -{ - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); - return memcpy(dst, src, len); -} - /* * Complete a fast commit by writing tail tag. * -- cgit From 48a6a66db82b8043d298a630f22c62d43550cae5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:40 -0800 Subject: ext4: fix off-by-one errors in fast-commit block filling Due to several different off-by-one errors, or perhaps due to a late change in design that wasn't fully reflected in the code that was actually merged, there are several very strange constraints on how fast-commit blocks are filled with tlv entries: - tlvs must start at least 10 bytes before the end of the block, even though the minimum tlv length is 8. Otherwise, the replay code will ignore them. (BUG: ext4_fc_reserve_space() could violate this requirement if called with a len of blocksize - 9 or blocksize - 8. 
Fortunately, this doesn't seem to happen currently.) - tlvs must end at least 1 byte before the end of the block. Otherwise the replay code will consider them to be invalid. This quirk contributed to a bug (fixed by an earlier commit) where uninitialized memory was being leaked to disk in the last byte of blocks. Also, strangely these constraints don't apply to the replay code in e2fsprogs, which will accept any tlvs in the blocks (with no bounds checks at all, but that is a separate issue...). Given that this all seems to be a bug, let's fix it by just filling blocks with tlv entries in the natural way. Note that old kernels will be unable to replay fast-commit journals created by kernels that have this commit. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-7-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 66 +++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 892fa7c7a768..7ed71c652f67 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -714,43 +714,43 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; - int pad_len; + int remaining; u8 *dst; /* - * After allocating len, we should have space at least for a 0 byte - * padding. + * If 'len' is too long to fit in any block alongside a PAD tlv, then we + * cannot fulfill the request. */ - if (len + EXT4_FC_TAG_BASE_LEN > bsize) + if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; - if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) { - /* - * Only allocate from current buffer if we have enough space for - * this request AND we have space to add a zero byte padding. - */ - if (!sbi->s_fc_bh) { - ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); - if (ret) - return NULL; - sbi->s_fc_bh = bh; - } - sbi->s_fc_bytes += len; - return sbi->s_fc_bh->b_data + off; + if (!sbi->s_fc_bh) { + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; } - /* Need to add PAD tag */ dst = sbi->s_fc_bh->b_data + off; + + /* + * Allocate the bytes in the current block if we can do so while still + * leaving enough space for a PAD tlv. + */ + remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; + if (len <= remaining) { + sbi->s_fc_bytes += len; + return dst; + } + + /* + * Else, terminate the current block with a PAD tlv, then allocate a new + * block and allocate the bytes at the start of that new block. + */ + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); - pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN; - tl.fc_len = cpu_to_le16(pad_len); + tl.fc_len = cpu_to_le16(remaining); ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); - dst += EXT4_FC_TAG_BASE_LEN; - if (pad_len > 0) { - ext4_fc_memzero(sb, dst, pad_len, crc); - dst += pad_len; - } - /* Don't leak uninitialized memory in the unused last byte. 
*/ - *dst = 0; + ext4_fc_memzero(sb, dst + EXT4_FC_TAG_BASE_LEN, remaining, crc); ext4_fc_submit_bh(sb, false); @@ -758,7 +758,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) if (ret) return NULL; sbi->s_fc_bh = bh; - sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; + sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } @@ -789,7 +789,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); - tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); + tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc); @@ -2056,7 +2056,7 @@ static int ext4_fc_replay_scan(journal_t *journal, state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; @@ -2077,7 +2077,7 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_replay_expected_off++; - for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; @@ -2195,9 +2195,9 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, #endif start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; - for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; -- cgit From 8805dbcb3e83a4e5a6c91edc15643a7498e576ce Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:41 -0800 Subject: ext4: simplify fast-commit CRC calculation Instead of checksumming each field as it is added to the block, just checksum each block before it is written. This is simpler, and also much more efficient. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-8-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 54 +++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7ed71c652f67..1da85f626116 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -675,27 +675,6 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) /* Ext4 commit path routines */ -/* memcpy to fc reserved space and update CRC */ -static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, - int len, u32 *crc) -{ - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); - return memcpy(dst, src, len); -} - -/* memzero and update CRC */ -static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, - u32 *crc) -{ - void *ret; - - ret = memset(dst, 0, len); - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); - return ret; -} - /* * Allocate len bytes on a fast commit buffer. 
* @@ -749,8 +728,9 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); - ext4_fc_memzero(sb, dst + EXT4_FC_TAG_BASE_LEN, remaining, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); + *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -792,13 +772,15 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); - ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); + memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); + crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); - ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); + memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ @@ -824,8 +806,8 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); - ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } @@ -847,11 +829,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; - ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); + memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); + memcpy(dst, fc_dentry->fcd_name.name, dlen); return true; } @@ -889,15 +871,11 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) if (!dst) goto err; - if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc)) - goto err; + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; - if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) - goto err; + memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); - if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), - inode_len, crc)) - goto err; + memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); -- cgit From fae381a3d79bb94aa2eb752170d47458d778b797 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 7 Nov 2022 09:53:35 +0800 Subject: ext4: init quota for 'old.inode' in 'ext4_rename' Syzbot found the following issue: ext4_parse_param: s_want_extra_isize=128 ext4_inode_info_init: s_want_extra_isize=32 ext4_rename: old.inode=ffff88823869a2c8 old.dir=ffff888238699828 new.inode=ffff88823869d7e8 new.dir=ffff888238699828 __ext4_mark_inode_dirty: inode=ffff888238699828 ea_isize=32 want_ea_size=128 __ext4_mark_inode_dirty: inode=ffff88823869a2c8 ea_isize=32 want_ea_size=128 ext4_xattr_block_set: 
inode=ffff88823869a2c8 ------------[ cut here ]------------ WARNING: CPU: 13 PID: 2234 at fs/ext4/xattr.c:2070 ext4_xattr_block_set.cold+0x22/0x980 Modules linked in: RIP: 0010:ext4_xattr_block_set.cold+0x22/0x980 RSP: 0018:ffff888227d3f3b0 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff88823007a000 RCX: 0000000000000000 RDX: 0000000000000a03 RSI: 0000000000000040 RDI: ffff888230078178 RBP: 0000000000000000 R08: 000000000000002c R09: ffffed1075c7df8e R10: ffff8883ae3efc6b R11: ffffed1075c7df8d R12: 0000000000000000 R13: ffff88823869a2c8 R14: ffff8881012e0460 R15: dffffc0000000000 FS: 00007f350ac1f740(0000) GS:ffff8883ae200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f350a6ed6a0 CR3: 0000000237456000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? ext4_xattr_set_entry+0x3b7/0x2320 ? ext4_xattr_block_set+0x0/0x2020 ? ext4_xattr_set_entry+0x0/0x2320 ? ext4_xattr_check_entries+0x77/0x310 ? ext4_xattr_ibody_set+0x23b/0x340 ext4_xattr_move_to_block+0x594/0x720 ext4_expand_extra_isize_ea+0x59a/0x10f0 __ext4_expand_extra_isize+0x278/0x3f0 __ext4_mark_inode_dirty.cold+0x347/0x410 ext4_rename+0xed3/0x174f vfs_rename+0x13a7/0x2510 do_renameat2+0x55d/0x920 __x64_sys_rename+0x7d/0xb0 do_syscall_64+0x3b/0xa0 entry_SYSCALL_64_after_hwframe+0x72/0xdc Since 'ext4_rename' modifies the ctime of 'old.inode' and marks the inode dirty, it may trigger expansion of 'extra_isize' and allocate a block. If the quota for the inode has not been initialized, this leads to the warning above. To solve this issue, initialize the quota for 'old.inode' first in 'ext4_rename'. Reported-by: syzbot+98346927678ac3059c77@syzkaller.appspotmail.com Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221107015335.2524319-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/namei.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a789ea9b61a0..1c5518a4bdf9 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3796,6 +3796,9 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, return -EXDEV; retval = dquot_initialize(old.dir); + if (retval) + return retval; + retval = dquot_initialize(old.inode); if (retval) return retval; retval = dquot_initialize(new.dir); -- cgit From bb0fbc782ee9fa8c970c8e8e17c1a52ff9419182 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 8 Nov 2022 15:50:42 +0100 Subject: ext4: print file system UUID on mount, remount and unmount The device names are not necessarily consistent across reboots which can make it more difficult to identify the right file system when tracking down issues using system logs. Print file system UUID string on every mount, remount and unmount to make this task easier. This is similar to the functionality recently proposed for XFS. Signed-off-by: Lukas Czerner Cc: Lukas Herbolt Reviewed-by: Darrick J.
Wong Link: https://lore.kernel.org/r/20221108145042.85770-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9185131e01e5..f5e6919ea650 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1206,7 +1206,8 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) - ext4_msg(sb, KERN_INFO, "unmounting filesystem."); + ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", + &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@ -5655,8 +5656,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Quota mode: %s.", descr, ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. " + "Quota mode: %s.", &sb->s_uuid, descr, + ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); @@ -6611,8 +6613,8 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.", + &sb->s_uuid, ext4_quota_mode(sb)); return 0; } -- cgit From 89481b5fa8c0640e62ba84c6020cee895f7ac643 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 9 Nov 2022 15:43:43 +0800 Subject: ext4: correct inconsistent error msg in nojournal mode When we used the journal_async_commit mount option in nojournal mode, the kernel reported "can't mount with journal_checksum", which was very confusing. I found that when we mount with journal_async_commit, both the JOURNAL_ASYNC_COMMIT and EXPLICIT_JOURNAL_CHECKSUM flags are set. However, in the error branch, CHECKSUM is checked before ASYNC_COMMIT. As a result, the above inconsistency occurs, and the ASYNC_COMMIT branch becomes dead code that can never be executed. Therefore, we swap the order of the two checks so that the error message is accurate.
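The control-flow rule generalizes: when setting option A also sets option B, the error path must test A before B, or the A branch is unreachable and the message blames the wrong option. A small standalone sketch (invented flag names, not the kernel's mount-option bits):

#include <stdio.h>

#define OPT_JOURNAL_CHECKSUM	(1u << 0)
#define OPT_JOURNAL_ASYNC	(1u << 1)	/* implies OPT_JOURNAL_CHECKSUM */

static const char *nojournal_error(unsigned int opts)
{
	/* test the implying (more specific) option first; otherwise the
	 * branch below it is dead code and the message blames the wrong
	 * option */
	if (opts & OPT_JOURNAL_ASYNC)
		return "can't mount with journal_async_commit, fs mounted w/o journal";
	if (opts & OPT_JOURNAL_CHECKSUM)
		return "can't mount with journal_checksum, fs mounted w/o journal";
	return NULL;
}

int main(void)
{
	/* journal_async_commit sets both flags */
	unsigned int opts = OPT_JOURNAL_ASYNC | OPT_JOURNAL_CHECKSUM;

	printf("%s\n", nojournal_error(opts));
	return 0;
}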
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221109074343.4184862-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f5e6919ea650..878be47faaaf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5288,14 +5288,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); + "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { -- cgit From 060f77392cab1c1cefb8a27cf6bd61f1db2441ec Mon Sep 17 00:00:00 2001 From: JunChao Sun Date: Wed, 9 Nov 2022 07:38:22 -0800 Subject: ext4: replace kmem_cache_create with KMEM_CACHE Replace kmem_cache_create with the KMEM_CACHE macro, which guarantees struct alignment. Signed-off-by: JunChao Sun Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221109153822.80250-1-sunjunchao2870@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 8 ++------ fs/ext4/readpage.c | 5 ++--- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index cd0a861853e3..97eccc0028a1 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -155,9 +155,7 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, int __init ext4_init_es(void) { - ext4_es_cachep = kmem_cache_create("ext4_extent_status", - sizeof(struct extent_status), - 0, (SLAB_RECLAIM_ACCOUNT), NULL); + ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; @@ -1807,9 +1805,7 @@ static void ext4_print_pending_tree(struct inode *inode) int __init ext4_init_pending(void) { - ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", - sizeof(struct pending_reservation), - 0, (SLAB_RECLAIM_ACCOUNT), NULL); + ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); if (ext4_pending_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3d21eae267fc..773176e7f9f5 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -410,9 +410,8 @@ int ext4_mpage_readpages(struct inode *inode, int __init ext4_init_post_read_processing(void) { - bio_post_read_ctx_cache = - kmem_cache_create("ext4_bio_post_read_ctx", - sizeof(struct bio_post_read_ctx), 0, 0, NULL); + bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); + if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = -- cgit From 26d75a16af285a70863ba6a81f85d81e7e65da50 Mon Sep 17 00:00:00 2001 From: Luís Henriques Date: Wed, 9 Nov 2022 18:14:45 +0000 Subject: ext4: fix error code return to user-space in ext4_get_branch() If a block is out of range in
ext4_get_branch(), -ENOMEM will be returned to user-space. Obviously, this error code isn't really useful. This patch fixes it by making sure the right error code (-EFSCORRUPTED) is propagated to user-space. EUCLEAN is more informative than ENOMEM. Signed-off-by: Luís Henriques Link: https://lore.kernel.org/r/20221109181445.17843-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/indirect.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 860fc5119009..c68bebe7ff4b 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; + unsigned int key; int ret = -EIO; *err = 0; @@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, if (!p->key) goto no_block; while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); + key = le32_to_cpu(p->key); + if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { + /* the block was out of range */ + ret = -EFSCORRUPTED; + goto failure; + } + bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; -- cgit From b76abb5157468756163fe7e3431c9fe32cba57ca Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 10 Nov 2022 12:16:29 -0800 Subject: ext4: don't return EINVAL from GETFSUUID when reporting UUID length If userspace calls this ioctl with fsu_length (the length of the fsuuid.fsu_uuid array) set to zero, ext4 copies the desired uuid length out to userspace. The kernel call returned a result from a valid input, so the return value here should be zero, not EINVAL. While we're at it, fix the copy_to_user call to make it clear that we're only copying out fsu_len. Signed-off-by: Darrick J. Wong Reviewed-by: Catherine Hoang Link: https://lore.kernel.org/r/166811138914.327006.9241306894437166566.stgit@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e5f60057db5b..beedaebab21c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1154,9 +1154,10 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, if (fsuuid.fsu_len == 0) { fsuuid.fsu_len = UUID_SIZE; - if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len))) + if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, + sizeof(fsuuid.fsu_len))) return -EFAULT; - return -EINVAL; + return 0; } if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; -- cgit From a7e9d977e031fceefe1e7cd69ebd7202d5758b56 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 10 Nov 2022 12:16:34 -0800 Subject: ext4: don't fail GETFSUUID when the caller provides a long buffer If userspace provides a longer UUID buffer than is required, we shouldn't fail the call with EINVAL -- rather, we can fill the caller's buffer with the bytes we /can/ fill, and update the length field to reflect what we copied. This doesn't break the UAPI since we're enabling a case that currently fails, and so far Ted hasn't released a version of e2fsprogs that uses the new ext4 ioctl. Signed-off-by: Darrick J.
Wong Reviewed-by: Catherine Hoang Link: https://lore.kernel.org/r/166811139478.327006.13879198441587445544.stgit@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index beedaebab21c..202953b5db49 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1160,14 +1160,16 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, return 0; } - if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) + if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; lock_buffer(sbi->s_sbh); memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); unlock_buffer(sbi->s_sbh); - if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || + copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) return -EFAULT; return 0; } -- cgit From a408f33e895e455f16cf964cb5cd4979b658db7b Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 17 Nov 2022 12:03:39 +0800 Subject: ext4: fix bad checksum after online resize When online resizing is performed twice consecutively, the error message "Superblock checksum does not match superblock" is displayed for the second time. Here's the reproducer: mkfs.ext4 -F /dev/sdb 100M mount /dev/sdb /tmp/test resize2fs /dev/sdb 5G resize2fs /dev/sdb 6G To solve this issue, we moved the update of the checksum to after es->s_overhead_clusters is updated. Fixes: 026d0d27c488 ("ext4: reduce computation of overhead during resize") Fixes: de394a86658f ("ext4: update s_overhead_clusters in the superblock during an on-line resize") Signed-off-by: Baokun Li Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6e88a85abd7f..70fef50ab3f9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1476,8 +1476,6 @@ static void ext4_update_super(struct super_block *sb, * active. */ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + reserved_blocks); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, @@ -1513,6 +1511,8 @@ static void ext4_update_super(struct super_block *sb, ext4_calculate_overhead(sb); es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, -- cgit From 8f49ec603ae3e213bfab2799182724e3abac55a1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 17 Nov 2022 12:03:40 +0800 Subject: ext4: fix corrupt backup group descriptors after online resize In commit 9a8c5b0d0615 ("ext4: update the backup superblock's at the end of the online resize"), it is assumed that update_backups() only updates backup superblocks, so each b_data is treated as a backup superblock to update its s_block_group_nr and s_checksum. However, update_backups() also updates the backup group descriptors, which causes the backup group descriptors to be corrupted. The above commit fixes the problem of invalid checksum of the backup superblock.
The root cause of this problem is that the checksum of ext4_update_super() is not set correctly. This problem has been fixed in the previous patch ("ext4: fix bad checksum after online resize"). However, we do need to set block_group_nr for the backup superblock in update_backups(). When a block is in a group that contains a backup superblock, and the block is the first block in the group, the block is definitely a superblock. We add a helper function that includes setting s_block_group_nr and updating checksum, and then call it only when the above conditions are met to prevent the backup group descriptors from being incorrectly modified. Fixes: 9a8c5b0d0615 ("ext4: update the backup superblock's at the end of the online resize") Signed-off-by: Baokun Li Reviewed-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 70fef50ab3f9..d460440d6dee 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1110,6 +1110,16 @@ exit_free: return err; } +static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, + ext4_group_t group) +{ + struct ext4_super_block *es = (struct ext4_super_block *) data; + + es->s_block_group_nr = cpu_to_le16(group); + if (ext4_has_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(sb, es); +} + /* * Update the backup copies of the ext4 metadata. These don't need to be part * of the main resize transaction, because e2fsck will re-write them if there @@ -1158,7 +1168,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, while (group < sbi->s_groups_count) { struct buffer_head *bh; ext4_fsblk_t backup_block; - struct ext4_super_block *es; + int has_super = ext4_bg_has_super(sb, group); + ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group); /* Out of journal space, and can't get more - abort - so sad */ err = ext4_resize_ensure_credits_batch(handle, 1); @@ -1168,8 +1179,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, if (meta_bg == 0) backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; else - backup_block = (ext4_group_first_block_no(sb, group) + - ext4_bg_has_super(sb, group)); + backup_block = first_block + has_super; bh = sb_getblk(sb, backup_block); if (unlikely(!bh)) { @@ -1187,10 +1197,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); - es = (struct ext4_super_block *) bh->b_data; - es->s_block_group_nr = cpu_to_le16(group); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + if (has_super && (backup_block == first_block)) + ext4_set_block_group_nr(sb, bh->b_data, group); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); -- cgit From 0aeaa2559d6d53358fca3e3fce73807367adca74 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 17 Nov 2022 12:03:41 +0800 Subject: ext4: fix corruption when online resizing a 1K bigalloc fs When a backup superblock is updated in update_backups(), the primary superblock's offset in the group (that is, sbi->s_sbh->b_blocknr) is used as the backup superblock's offset in its group. However, when the block size is 1K and bigalloc is enabled, the two offsets are not equal. 
This causes the backup group descriptors to be overwritten by the superblock in update_backups(). Moreover, if meta_bg is enabled, the file system will be corrupted because this feature uses backup group descriptors. To solve this issue, we use a more accurate ext4_group_first_block_no() as the offset of the backup superblock in its group. Fixes: d77147ff443b ("ext4: add support for online resizing with bigalloc") Signed-off-by: Baokun Li Reviewed-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d460440d6dee..6b91443d6bf3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1604,8 +1604,8 @@ exit_journal: int meta_bg = ext4_has_feature_meta_bg(sb); sector_t old_gdb = 0; - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block), 0); + update_backups(sb, ext4_group_first_block_no(sb, 0), + (char *)es, sizeof(struct ext4_super_block), 0); for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; @@ -1816,7 +1816,7 @@ errout: if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, + update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); } return err; -- cgit From 7ea71af94eaaaf6d9aed24bc94a05b977a741cb9 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 17 Nov 2022 15:36:03 +0800 Subject: ext4: fix uninitialized value in 'ext4_evict_inode' Syzbot found the following issue: ===================================================== BUG: KMSAN: uninit-value in ext4_evict_inode+0xdd/0x26b0 fs/ext4/inode.c:180 ext4_evict_inode+0xdd/0x26b0 fs/ext4/inode.c:180 evict+0x365/0x9a0 fs/inode.c:664 iput_final fs/inode.c:1747 [inline] iput+0x985/0xdd0 fs/inode.c:1773 __ext4_new_inode+0xe54/0x7ec0 fs/ext4/ialloc.c:1361 ext4_mknod+0x376/0x840 fs/ext4/namei.c:2844 vfs_mknod+0x79d/0x830 fs/namei.c:3914 do_mknodat+0x47d/0xaa0 __do_sys_mknodat fs/namei.c:3992 [inline] __se_sys_mknodat fs/namei.c:3989 [inline] __ia32_sys_mknodat+0xeb/0x150 fs/namei.c:3989 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x33/0x70 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1b/0x20 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 Uninit was created at: __alloc_pages+0x9f1/0xe80 mm/page_alloc.c:5578 alloc_pages+0xaae/0xd80 mm/mempolicy.c:2285 alloc_slab_page mm/slub.c:1794 [inline] allocate_slab+0x1b5/0x1010 mm/slub.c:1939 new_slab mm/slub.c:1992 [inline] ___slab_alloc+0x10c3/0x2d60 mm/slub.c:3180 __slab_alloc mm/slub.c:3279 [inline] slab_alloc_node mm/slub.c:3364 [inline] slab_alloc mm/slub.c:3406 [inline] __kmem_cache_alloc_lru mm/slub.c:3413 [inline] kmem_cache_alloc_lru+0x6f3/0xb30 mm/slub.c:3429 alloc_inode_sb include/linux/fs.h:3117 [inline] ext4_alloc_inode+0x5f/0x860 fs/ext4/super.c:1321 alloc_inode+0x83/0x440 fs/inode.c:259 new_inode_pseudo fs/inode.c:1018 [inline] new_inode+0x3b/0x430 fs/inode.c:1046 __ext4_new_inode+0x2a7/0x7ec0 fs/ext4/ialloc.c:959 ext4_mkdir+0x4d5/0x1560 fs/ext4/namei.c:2992 vfs_mkdir+0x62a/0x870 fs/namei.c:4035 do_mkdirat+0x466/0x7b0 fs/namei.c:4060 __do_sys_mkdirat fs/namei.c:4075 [inline] __se_sys_mkdirat fs/namei.c:4073 [inline]
__ia32_sys_mkdirat+0xc4/0x120 fs/namei.c:4073 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x33/0x70 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1b/0x20 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 CPU: 1 PID: 4625 Comm: syz-executor.2 Not tainted 6.1.0-rc4-syzkaller-62821-gcb231e2f67ec #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 ===================================================== Currently, 'ext4_alloc_inode()' does not initialize 'ei->i_flags'. If creating a new inode fails in '__ext4_new_inode()' before 'ei->i_flags' has been set, the subsequent 'iput()' ends up in 'ext4_evict_inode()', which since commit 6bc0d63dad7f reads 'ei->i_flags' and therefore accesses an uninitialized value. To solve this issue, initialize 'ei->i_flags' in 'ext4_alloc_inode()'. Reported-by: syzbot+57b25da729eb0b88177d@syzkaller.appspotmail.com Signed-off-by: Ye Bin Fixes: 6bc0d63dad7f ("ext4: remove EA inode entry from mbcache on inode eviction") Reviewed-by: Jan Kara Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221117073603.2598882-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 878be47faaaf..28d009151d23 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1324,6 +1324,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; inode_set_iversion(&ei->vfs_inode, 1); + ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); atomic_set(&ei->i_prealloc_active, 0); -- cgit From 131294c35ed6f777bd4e79d42af13b5c41bf2775 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 17 Nov 2022 10:22:07 -0500 Subject: ext4: fix delayed allocation bug in ext4_clu_mapped for bigalloc + inline When converting files with inline data to extents, delayed allocations made on a file system created with both the bigalloc and inline options can result in invalid extent status cache content, incorrect reserved cluster counts, kernel memory leaks, and potential kernel panics. With bigalloc, the code that determines whether a block must be delayed allocated searches the extent tree to see if that block maps to a previously allocated cluster. If not, the block is delayed allocated, and otherwise, it isn't. However, if the inline option is also used, and if the file containing the block is marked as able to store data inline, there isn't a valid extent tree associated with the file. The current code in ext4_clu_mapped() calls ext4_find_extent() to search the non-existent tree for a previously allocated cluster anyway, which typically finds nothing, as desired. However, a side effect of the search can be to cache invalid content from the non-existent tree (garbage) in the extent status tree, including bogus entries in the pending reservation tree. To fix this, avoid searching the extent tree when allocating blocks for bigalloc + inline files that are being converted from inline to extent mapped.
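For context, the cluster lookup that the new early return skips is driven by a power-of-two mapping from logical blocks to clusters. A standalone sketch of the arithmetic and the guard (the cluster ratio is an assumption; names are illustrative, not the ext4_clu_mapped() code):

#include <stdio.h>

/* bigalloc: a cluster is 2^cluster_bits blocks, so EXT4_B2C() is a shift */
static unsigned long block_to_cluster(unsigned long lblk, int cluster_bits)
{
	return lblk >> cluster_bits;
}

int main(void)
{
	int cluster_bits = 4;		/* assumed: 16 blocks per cluster */
	unsigned long lblk = 37;
	int may_inline = 1;		/* EXT4_STATE_MAY_INLINE_DATA set */

	if (may_inline) {
		/* no extent tree exists: report the cluster unmapped
		 * without walking (and polluting) any cache */
		printf("cluster %lu: unmapped (inline file)\n",
		       block_to_cluster(lblk, cluster_bits));
		return 0;
	}
	printf("cluster %lu: search the extent tree\n",
	       block_to_cluster(lblk, cluster_bits));
	return 0;
}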
Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20221117152207.2424-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ec008278d970..9de1c9d1a13d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5797,6 +5797,14 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) struct ext4_extent *extent; ext4_lblk_t first_lblk, first_lclu, last_lclu; + /* + * if data can be stored inline, the logical cluster isn't + * mapped - no physical clusters have been allocated, and the + * file has no extents + */ + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return 0; + /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); if (IS_ERR(path)) { -- cgit From 956510c0c7439e90b8103aaeaf4da92878c622f0 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 21 Nov 2022 12:21:30 +0100 Subject: fs: ext4: initialize fsdata in pagecache_write() When aops->write_begin() does not initialize fsdata, KMSAN reports an error passing the latter to aops->write_end(). Fix this by unconditionally initializing fsdata. Cc: Eric Biggers Fixes: c93d8f885809 ("ext4: add basic fs-verity support") Reported-by: syzbot+9767be679ef5016b6082@syzkaller.appspotmail.com Signed-off-by: Alexander Potapenko Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221121112134.407362-1-glider@google.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/verity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 3c640bd7ecae..30e3b65798b5 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -79,7 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *fsdata; + void *fsdata = NULL; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); -- cgit From b40ebaf63851b3a401b0dc9263843538f64f5ce6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Nov 2022 14:09:29 +0100 Subject: ext4: avoid BUG_ON when creating xattrs Commit fb0a387dcdcd ("ext4: limit block allocations for indirect-block files to < 2^32") added code to try to allocate xattr block with 32-bit block number for indirect block based files on the grounds that these files cannot use larger block numbers. It also added BUG_ON when allocated block could not fit into 32 bits. This is however bogus reasoning because xattr block is stored in inode->i_file_acl and inode->i_file_acl_hi and as such even indirect block based files can happily use full 48 bits for xattr block number. The proper handling seems to be there basically since 64-bit block number support was added. So remove the bogus limitation and BUG_ON. 
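The 48-bit capacity is plain field arithmetic: the low 32 bits of the xattr block number live in i_file_acl and the high 16 bits in i_file_acl_hi. A standalone sketch of the split and recombination (the block number is an arbitrary example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t block = 0x123456789aULL;	/* above 2^32, below 2^48 */

	/* split across the two on-disk fields... */
	uint32_t i_file_acl    = (uint32_t)block;		/* low 32 bits */
	uint16_t i_file_acl_hi = (uint16_t)(block >> 32);	/* high 16 bits */

	/* ...and recombine: all 48 bits survive, extents or not */
	uint64_t rebuilt = ((uint64_t)i_file_acl_hi << 32) | i_file_acl;

	printf("block=%#llx rebuilt=%#llx\n",
	       (unsigned long long)block, (unsigned long long)rebuilt);
	return 0;
}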
Cc: Eric Sandeen Fixes: fb0a387dcdcd ("ext4: limit block allocations for indirect-block files to < 2^32") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221121130929.32031-1-jack@suse.cz Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/xattr.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 718ef3987f94..4d1c701f0eec 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2071,19 +2071,11 @@ inserted: goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - ea_idebug(inode, "creating block %llu", (unsigned long long)block); -- cgit From a44e84a9b7764c72896f7241a0ec9ac7e7ef38dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Nov 2022 20:39:50 +0100 Subject: ext4: fix deadlock due to mbcache entry corruption When manipulating xattr blocks, we can deadlock infinitely looping inside ext4_xattr_block_set() where we constantly keep finding xattr block for reuse in mbcache but we are unable to reuse it because its reference count is too big. This happens because cache entry for the xattr block is marked as reusable (e_reusable set) although its reference count is too big. When this inconsistency happens, this inconsistent state is kept indefinitely and so ext4_xattr_block_set() keeps retrying indefinitely. The inconsistent state is caused by non-atomic update of e_reusable bit. e_reusable is part of a bitfield and e_reusable update can race with update of e_referenced bit in the same bitfield resulting in loss of one of the updates. Fix the problem by using atomic bitops instead. This bug has been around for many years, but it became *much* easier to hit after commit 65f8b80053a1 ("ext4: fix race when reusing xattr blocks"). 
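The race class deserves a sketch: adjacent C bitfields share one machine word, so two plain read-modify-write updates can silently lose one another, while per-bit atomic operations cannot. A userspace analogue of the fix, using C11 atomics in place of the kernel's set_bit()/clear_bit()/test_bit() (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

#define MBE_REFERENCED_B	0
#define MBE_REUSABLE_B		1

static atomic_ulong e_flags;

/* atomic read-modify-write: concurrent updates of different bits in the
 * same word cannot be lost, unlike plain stores to adjacent bitfields */
static void set_bit_atomic(int nr)	{ atomic_fetch_or(&e_flags, 1UL << nr); }
static void clear_bit_atomic(int nr)	{ atomic_fetch_and(&e_flags, ~(1UL << nr)); }
static int test_bit_atomic(int nr)	{ return (atomic_load(&e_flags) >> nr) & 1; }

int main(void)
{
	set_bit_atomic(MBE_REUSABLE_B);
	set_bit_atomic(MBE_REFERENCED_B);	/* does not clobber REUSABLE */
	clear_bit_atomic(MBE_REFERENCED_B);
	printf("reusable=%d referenced=%d\n",
	       test_bit_atomic(MBE_REUSABLE_B), test_bit_atomic(MBE_REFERENCED_B));
	return 0;
}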
Cc: stable@vger.kernel.org Fixes: 6048c64b2609 ("mbcache: add reusable flag to cache entries") Fixes: 65f8b80053a1 ("ext4: fix race when reusing xattr blocks") Reported-and-tested-by: Jeremi Piotrowski Reported-by: Thilo Fromm Link: https://lore.kernel.org/r/c77bf00f-4618-7149-56f1-b8d1664b9d07@linux.microsoft.com/ Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20221123193950.16758-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 4 ++-- fs/mbcache.c | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 4d1c701f0eec..6bdd502527f8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1281,7 +1281,7 @@ retry_ref: ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { - ce->e_reusable = 1; + set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } @@ -2043,7 +2043,7 @@ inserted: } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) - ce->e_reusable = 0; + clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); diff --git a/fs/mbcache.c b/fs/mbcache.c index e272ad738faf..2a4b8b549e93 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -100,8 +100,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; - entry->e_reusable = reusable; - entry->e_referenced = 0; + entry->e_flags = 0; + if (reusable) + set_bit(MBE_REUSABLE_B, &entry->e_flags); head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { @@ -165,7 +166,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); - if (entry->e_key == key && entry->e_reusable && + if (entry->e_key == key && + test_bit(MBE_REUSABLE_B, &entry->e_flags) && atomic_inc_not_zero(&entry->e_refcnt)) goto out; node = node->next; @@ -284,7 +286,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { - entry->e_referenced = 1; + set_bit(MBE_REFERENCED_B, &entry->e_flags); } EXPORT_SYMBOL(mb_cache_entry_touch); @@ -309,9 +311,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); /* Drop initial hash reference if there is no user */ - if (entry->e_referenced || + if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { - entry->e_referenced = 0; + clear_bit(MBE_REFERENCED_B, &entry->e_flags); list_move_tail(&entry->e_list, &cache->c_list); continue; } -- cgit From d73eff68a8c0ea3fc626b08ea84e4cd327ccbe5f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 2 Dec 2022 20:04:09 +0800 Subject: ext4: make ext4_mb_initialize_context return void Change the return type to void since it always returns 0, so there is no need to do the checking in ext4_mb_new_blocks.
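In miniature, the refactor looks like this (invented names, not the mballoc code): once a function's only possible return value is 0, the int return type and the caller's error check are both dead weight.

#include <stdio.h>

/* before: static int init_context(...) { ...; return 0; }
 * every caller had to check a value that could never be nonzero */
static void init_context(int *ctx, int arg)
{
	*ctx = arg;	/* cannot fail; nothing to report */
}

int main(void)
{
	int ctx;

	init_context(&ctx, 42);		/* call site: no dead error branch */
	printf("ctx=%d\n", ctx);
	return 0;
}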
Signed-off-by: Guoqing Jiang Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20221202120409.24098-1-guoqing.jiang@linux.dev Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9dad93059945..5b2ae37a8b80 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5204,7 +5204,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) mutex_lock(&ac->ac_lg->lg_mutex); } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { @@ -5253,8 +5253,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? "" : "non-"); - return 0; - } static noinline_for_stack void @@ -5591,11 +5589,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out; } - *errp = ext4_mb_initialize_context(ac, ar); - if (*errp) { - ar->len = 0; - goto out; - } + ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); -- cgit From 5c099c4fdc438014d5893629e70a8ba934433ee8 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 6 Dec 2022 22:41:34 +0800 Subject: ext4: fix kernel BUG in 'ext4_write_inline_data_end()' Syzbot reported the following issue: ------------[ cut here ]------------ kernel BUG at fs/ext4/inline.c:227! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 3629 Comm: syz-executor212 Not tainted 6.1.0-rc5-syzkaller-00018-g59d0d52c30d4 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 RIP: 0010:ext4_write_inline_data+0x344/0x3e0 fs/ext4/inline.c:227 RSP: 0018:ffffc90003b3f368 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff8880704e16c0 RCX: 0000000000000000 RDX: ffff888021763a80 RSI: ffffffff821e31a4 RDI: 0000000000000006 RBP: 000000000006818e R08: 0000000000000006 R09: 0000000000068199 R10: 0000000000000079 R11: 0000000000000000 R12: 000000000000000b R13: 0000000000068199 R14: ffffc90003b3f408 R15: ffff8880704e1c82 FS: 000055555723e3c0(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fffe8ac9080 CR3: 0000000079f81000 CR4: 0000000000350ee0 Call Trace: ext4_write_inline_data_end+0x2a3/0x12f0 fs/ext4/inline.c:768 ext4_write_end+0x242/0xdd0 fs/ext4/inode.c:1313 ext4_da_write_end+0x3ed/0xa30 fs/ext4/inode.c:3063 generic_perform_write+0x316/0x570 mm/filemap.c:3764 ext4_buffered_write_iter+0x15b/0x460 fs/ext4/file.c:285 ext4_file_write_iter+0x8bc/0x16e0 fs/ext4/file.c:700 call_write_iter include/linux/fs.h:2191 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x1d0/0x210 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ---[ end trace 0000000000000000
]--- The above issue may happen as follows: ext4_da_write_begin ext4_da_write_inline_data_begin ext4_da_convert_inline_data_to_extent ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ext4_da_write_end ext4_run_li_request ext4_mb_prefetch ext4_read_block_bitmap_nowait ext4_validate_block_bitmap ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT) percpu_counter_sub(&sbi->s_freeclusters_counter,grp->bb_free); -> sbi->s_freeclusters_counter becomes zero ext4_da_write_begin if (ext4_nonda_switch(inode->i_sb)) -> As freeclusters_counter is zero will return true *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; ext4_write_begin ext4_da_write_end if (write_mode == FALL_BACK_TO_NONDELALLOC) ext4_write_end if (inline_data) ext4_write_inline_data_end ext4_write_inline_data BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); -> As inode is already convert to extent, so 'pos + len' > inline_size -> then trigger BUG. To solve this issue, instead of checking ext4_has_inline_data() which is only cleared after data has been written back, check the EXT4_STATE_MAY_INLINE_DATA flag in ext4_write_end(). Fixes: f19d5870cbf7 ("ext4: add normal write support for inline data") Reported-by: syzbot+4faa160fa96bfba639f8@syzkaller.appspotmail.com Reported-by: Jun Nie Signed-off-by: Ye Bin Link: https://lore.kernel.org/r/20221206144134.1919987-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 181bc161b1ac..a0f4d4197a0b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1315,7 +1315,8 @@ static int ext4_write_end(struct file *file, trace_ext4_write_end(inode, pos, len, copied); - if (ext4_has_inline_data(inode)) + if (ext4_has_inline_data(inode) && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, page); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); -- cgit From 04e568a3b31cfbd545c04c8bfc35c20e5ccfce0f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:04 +0100 Subject: ext4: handle redirtying in ext4_bio_write_page() Since we want to transition transaction commits to use ext4_writepages() for writing back ordered data, add handling of page redirtying into ext4_bio_write_page(). Also move buffer dirty bit clearing into the same place as other buffer state handling. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 97fa7b4c645f..4e68ace86f11 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -482,6 +482,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* A hole? We can safely clear the dirty bit */ if (!buffer_mapped(bh)) clear_buffer_dirty(bh); + /* + * Keeping dirty some buffer we cannot write? Make + * sure to redirty the page. This happens e.g. when + * doing writeout for transaction commit.
+ */ + if (buffer_dirty(bh) && !PageDirty(page)) + redirty_page_for_writepage(wbc, page); if (io->io_bio) ext4_io_submit(io); continue; @@ -489,6 +496,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (buffer_new(bh)) clear_buffer_new(bh); set_buffer_async_write(bh); + clear_buffer_dirty(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); @@ -532,7 +540,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); redirty_page_for_writepage(wbc, page); do { - clear_buffer_async_write(bh); + if (buffer_async_write(bh)) { + clear_buffer_async_write(bh); + set_buffer_dirty(bh); + } bh = bh->b_this_page; } while (bh != head); goto unlock; @@ -546,7 +557,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, io_submit_add_bh(io, inode, bounce_page ? bounce_page : page, bh); nr_submitted++; - clear_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); unlock: -- cgit From dff4ac75eeeefc4397fe7cf8ce559425bf46b1f7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:05 +0100 Subject: ext4: move keep_towrite handling to ext4_bio_write_page() When we are writing back page but we cannot for some reason write all its buffers (e.g. because we cannot allocate blocks in current context) we have to keep TOWRITE tag set in the mapping as otherwise racing WB_SYNC_ALL writeback that could write these buffers can skip the page and result in data loss. We will need this logic for writeback during transaction commit so move the logic from ext4_writepage() to ext4_bio_write_page(). Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +-- fs/ext4/inode.c | 6 ++---- fs/ext4/page-io.c | 36 +++++++++++++++++++++--------------- 3 files changed, 24 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4e739902dc03..0540929630a1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3757,8 +3757,7 @@ extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, - int len, - bool keep_towrite); + int len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a0f4d4197a0b..4c70cc525f10 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2016,7 +2016,6 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; - bool keep_towrite = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { folio_invalidate(folio, 0, folio_size(folio)); @@ -2074,7 +2073,6 @@ static int ext4_writepage(struct page *page, unlock_page(page); return 0; } - keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -2091,7 +2089,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return -ENOMEM; } - ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite); + ret = ext4_bio_write_page(&io_submit, page, len); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -2125,7 +2123,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) len = size & ~PAGE_MASK; else 
len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len, false); + err = ext4_bio_write_page(&mpd->io_submit, page, len); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4e68ace86f11..4f9ecacd10aa 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -430,8 +430,7 @@ submit_and_retry: int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, - int len, - bool keep_towrite) + int len) { struct page *bounce_page = NULL; struct inode *inode = page->mapping->host; @@ -441,14 +440,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, int nr_submitted = 0; int nr_to_submit = 0; struct writeback_control *wbc = io->io_wbc; + bool keep_towrite = false; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - if (keep_towrite) - set_page_writeback_keepwrite(page); - else - set_page_writeback(page); ClearPageError(page); /* @@ -483,12 +479,17 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (!buffer_mapped(bh)) clear_buffer_dirty(bh); /* - * Keeping dirty some buffer we cannot write? Make - * sure to redirty the page. This happens e.g. when - * doing writeout for transaction commit. + * Keeping dirty some buffer we cannot write? Make sure + * to redirty the page and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the page. + * This happens e.g. when doing writeout for + * transaction commit. */ - if (buffer_dirty(bh) && !PageDirty(page)) - redirty_page_for_writepage(wbc, page); + if (buffer_dirty(bh)) { + if (!PageDirty(page)) + redirty_page_for_writepage(wbc, page); + keep_towrite = true; + } if (io->io_bio) ext4_io_submit(io); continue; @@ -500,6 +501,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, nr_to_submit++; } while ((bh = bh->b_this_page) != head); + /* Nothing to submit? Just unlock the page... */ + if (!nr_to_submit) + goto unlock; + bh = head = page_buffers(page); /* @@ -550,6 +555,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } } + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) @@ -558,11 +568,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, bounce_page ? bounce_page : page, bh); nr_submitted++; } while ((bh = bh->b_this_page) != head); - unlock: unlock_page(page); - /* Nothing submitted - we have to end page writeback */ - if (!nr_submitted) - end_page_writeback(page); return ret; } -- cgit From 29b83c574b0a06f29126e4e18801ba0e5d2e089b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:06 +0100 Subject: ext4: remove nr_submitted from ext4_bio_write_page() nr_submitted is the same as nr_to_submit. Drop one of them. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4f9ecacd10aa..2bdfb8a046d9 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -437,7 +437,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, unsigned block_start; struct buffer_head *bh, *head; int ret = 0; - int nr_submitted = 0; int nr_to_submit = 0; struct writeback_control *wbc = io->io_wbc; bool keep_towrite = false; @@ -566,7 +565,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, continue; io_submit_add_bh(io, inode, bounce_page ? 
bounce_page : page, bh); - nr_submitted++; } while ((bh = bh->b_this_page) != head); unlock: unlock_page(page); -- cgit From 5c27088b3bcf82c8525ec55cde7d2ddd034283b6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:07 +0100 Subject: ext4: drop pointless IO submission from ext4_bio_write_page() We submit outstanding IO in ext4_bio_write_page() if we find a buffer we are not going to write. This is however pointless because we already handle submission of previous IO in case we detect newly added buffer head is discontiguous. So just delete the pointless IO submission call. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2bdfb8a046d9..beaec6d81074 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -489,8 +489,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, redirty_page_for_writepage(wbc, page); keep_towrite = true; } - if (io->io_bio) - ext4_io_submit(io); continue; } if (buffer_new(bh)) -- cgit From de0039f69c95692539bcb7cfc3b01f31e1c9f342 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:08 +0100 Subject: ext4: add support for writepages calls that cannot map blocks Add support for calls to ext4_writepages() that cannot map blocks. These will be issued from jbd2 transaction commit code. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4c70cc525f10..186635cb1b59 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1564,6 +1564,7 @@ struct mpage_da_data { struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; + unsigned int can_map:1; /* Can writepages call map blocks? */ unsigned int scanned_until_end:1; }; @@ -2556,18 +2557,33 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } +/* Return true if the page needs to be written as part of transaction commit */ +static bool ext4_page_nomap_can_writeout(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) + return true; + } while ((bh = bh->b_this_page) != head); + return false; +} + /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages - * and underlying extent to map + * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for - * IO immediately. When we find a page which isn't mapped we start accumulating - * extent of buffers underlying these pages that needs mapping (formed by - * either delayed or unwritten buffers). We also lock the pages containing - * these buffers. The extent found is returned in @mpd structure (starting at - * mpd->lblk with length mpd->len blocks). + * IO immediately. If we cannot map blocks, we submit just already mapped + * buffers in the page for IO and keep page dirty.
When we can map blocks and + * we find a page which isn't mapped we start accumulating extent of buffers + * underlying these pages that needs mapping (formed by either delayed or + * unwritten buffers). We also lock the pages containing these buffers. The + * extent found is returned in @mpd structure (starting at mpd->lblk with + * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an @@ -2658,14 +2674,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; - /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << - (PAGE_SHIFT - blkbits); - head = page_buffers(page); - err = mpage_process_page_bufs(mpd, head, head, lblk); - if (err <= 0) - goto out; - err = 0; + /* + * Writeout for transaction commit where we cannot + * modify metadata is simple. Just submit the page. + */ + if (!mpd->can_map) { + if (ext4_page_nomap_can_writeout(page)) { + err = mpage_submit_page(mpd, page); + if (err < 0) + goto out; + } else { + unlock_page(page); + mpd->first_page++; + } + } else { + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_SHIFT - blkbits); + head = page_buffers(page); + err = mpage_process_page_bufs(mpd, head, head, + lblk); + if (err <= 0) + goto out; + err = 0; + } left--; } pagevec_release(&pvec); @@ -2785,6 +2817,7 @@ retry: */ mpd.do_map = 0; mpd.scanned_until_end = 0; + mpd.can_map = 1; mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { ret = -ENOMEM; @@ -2808,6 +2841,7 @@ retry: break; } + WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when -- cgit From 15648d599cd1c15cc678039dcab65599276fe407 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:09 +0100 Subject: ext4: provide ext4_do_writepages() Provide ext4_do_writepages() function that takes mpage_da_data as an argument and make ext4_writepages() just a simple wrapper around it. No functional changes. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-6-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 96 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 186635cb1b59..a023cca4b6f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1550,9 +1550,12 @@ void ext4_da_release_space(struct inode *inode, int to_free) */ struct mpage_da_data { + /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; + unsigned int can_map:1; /* Can writepages call map blocks? */ + /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ @@ -1564,7 +1567,6 @@ struct mpage_da_data { struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; - unsigned int can_map:1; /* Can writepages call map blocks? 
*/ unsigned int scanned_until_end:1; }; @@ -2710,16 +2712,16 @@ out: return err; } -static int ext4_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int ext4_do_writepages(struct mpage_da_data *mpd) { + struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; - struct mpage_da_data mpd; - struct inode *inode = mapping->host; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; @@ -2794,19 +2796,18 @@ static int ext4_writepages(struct address_space *mapping, writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd.first_page = writeback_index; - mpd.last_page = -1; + mpd->first_page = writeback_index; + mpd->last_page = -1; } else { - mpd.first_page = wbc->range_start >> PAGE_SHIFT; - mpd.last_page = wbc->range_end >> PAGE_SHIFT; + mpd->first_page = wbc->range_start >> PAGE_SHIFT; + mpd->last_page = wbc->range_end >> PAGE_SHIFT; } - mpd.inode = inode; - mpd.wbc = wbc; - ext4_io_submit_init(&mpd.io_submit, wbc); + ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + tag_pages_for_writeback(mapping, mpd->first_page, + mpd->last_page); blk_start_plug(&plug); /* @@ -2815,28 +2816,27 @@ retry: * in the block layer on device congestion while having transaction * started. */ - mpd.do_map = 0; - mpd.scanned_until_end = 0; - mpd.can_map = 1; - mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!mpd.io_submit.io_end) { + mpd->do_map = 0; + mpd->scanned_until_end = 0; + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } - ret = mpage_prepare_extent_to_map(&mpd); + ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, false); + mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); - ext4_put_io_end_defer(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_io_submit(&mpd->io_submit); + ext4_put_io_end_defer(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; - while (!mpd.scanned_until_end && wbc->nr_to_write > 0) { + while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ - mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!mpd.io_submit.io_end) { + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } @@ -2861,16 +2861,16 @@ retry: "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ - ext4_put_io_end(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; break; } - mpd.do_map = 1; + mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); - ret = mpage_prepare_extent_to_map(&mpd); - if (!ret && mpd.map.m_len) - ret = mpage_map_and_submit_extent(handle, &mpd, + trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + ret = mpage_prepare_extent_to_map(mpd); + if (!ret && mpd->map.m_len) + ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); 
/* * Caution: If the handle is synchronous, @@ -2885,12 +2885,12 @@ retry: if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; - mpd.do_map = 0; + mpd->do_map = 0; } /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, give_up_on_write); + mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); + ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have @@ -2900,11 +2900,11 @@ retry: * up doing unwritten extent conversion. */ if (handle) { - ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else - ext4_put_io_end(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2924,8 +2924,8 @@ unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd.last_page = writeback_index - 1; - mpd.first_page = 0; + mpd->last_page = writeback_index - 1; + mpd->first_page = 0; goto retry; } @@ -2935,7 +2935,7 @@ unplug: * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd.first_page; + mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, @@ -2944,6 +2944,18 @@ out_writepages: return ret; } +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct mpage_da_data mpd = { + .inode = mapping->host, + .wbc = wbc, + .can_map = 1, + }; + + return ext4_do_writepages(&mpd); +} + static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { -- cgit From 29bc9cea0e13dc8043c249f5f232b9e0c441ae24 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:10 +0100 Subject: ext4: move percpu_rwsem protection into ext4_writepages() Move protection by percpu_rwsem from ext4_do_writepages() to ext4_writepages(). We will not want to grab this protection during transaction commits as that would be prone to deadlocks and the protection is not needed. Move the shutdown state checking as well since we want to be able to complete commit while the shutdown is in progress. 
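A minimal sketch of the resulting locking split (simplified signatures; demo_* names are hypothetical, the s_writepages_rwsem field is real):

/* Inner worker: no shutdown check and no rwsem, so the jbd2 commit
 * path can call it directly without risking deadlock. */
static int demo_do_writepages(struct mpage_da_data *mpd)
{
	/* ...actual writeout elided... */
	return 0;
}

/* VFS entry point: takes the protection on behalf of regular callers. */
static int demo_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
	struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc };
	int ret;

	percpu_down_read(&sbi->s_writepages_rwsem);
	ret = demo_do_writepages(&mpd);
	percpu_up_read(&sbi->s_writepages_rwsem);
	return ret;
}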
Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-7-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a023cca4b6f6..3aa01a9d4664 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2727,10 +2727,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) struct blk_plug plug; bool give_up_on_write = false; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; - - percpu_down_read(&sbi->s_writepages_rwsem); trace_ext4_writepages(inode, wbc); /* @@ -2940,20 +2936,28 @@ unplug: out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; + int ret; + + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return -EIO; - return ext4_do_writepages(&mpd); + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + ret = ext4_do_writepages(&mpd); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); + + return ret; } static int ext4_dax_writepages(struct address_space *mapping, -- cgit From 59205c8d4eaeeb85cf76eb1cb140283cf900412a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:11 +0100 Subject: ext4: switch to using ext4_do_writepages() for ordered data writeout Use the standard writepages method (ext4_do_writepages()) to perform writeout of ordered data during journal commit. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-8-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 16 ++++++++++++++++ fs/ext4/super.c | 3 +-- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0540929630a1..140e1eb300d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3000,6 +3000,7 @@ extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3aa01a9d4664..56f6ef52fe76 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2960,6 +2960,22 @@ static int ext4_writepages(struct address_space *mapping, return ret; } +int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + struct mpage_da_data mpd = { + .inode = jinode->i_vfs_inode, + .wbc = &wbc, + .can_map = 0, + }; + return ext4_do_writepages(&mpd); +} + static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 28d009151d23..72ead3b56706 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -540,8 +540,7 @@ static int 
ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else - ret = jbd2_journal_submit_inode_data_buffers(jinode); - + ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } -- cgit From f30ff35f6266993405c8659e48fddc3180692164 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:12 +0100 Subject: jbd2: switch jbd2_submit_inode_data() to use fs-provided hook for data writeout jbd2_submit_inode_data() hardcoded use of jbd2_journal_submit_inode_data_buffers() for submission of data pages. Make it use j_submit_inode_data_buffers hook instead. This effectively switches ext4 fastcommits to use ext4_writepages() for data writeout instead of generic_writepages(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 2 +- fs/jbd2/commit.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1da85f626116..4594b62f147b 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -981,7 +981,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) finish_wait(&ei->i_fc_wait, &wait); } spin_unlock(&sbi->s_fc_lock); - ret = jbd2_submit_inode_data(ei->jinode); + ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 885a7a6cc53e..4810438b7856 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -207,14 +207,13 @@ int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) } /* Send all the data buffers related to an inode */ -int jbd2_submit_inode_data(struct jbd2_inode *jinode) +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) return 0; trace_jbd2_submit_inode_data(jinode->i_vfs_inode); - return jbd2_journal_submit_inode_data_buffers(jinode); + return journal->j_submit_inode_data_buffers(jinode); } EXPORT_SYMBOL(jbd2_submit_inode_data); -- cgit From 49977f9762fe0f28ec08bec5ab992a5c9623f44c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:13 +0100 Subject: ext4: switch to using write_cache_pages() for data=journal writeout Instead of using generic_writepages(), let's use write_cache_pages() for writeout of journalled data. It will allow us to stop providing .writepage callback. Our data=journal writeback path would benefit from a larger cleanup and refactoring but that's for a separate cleanup series. 
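For reference, a hedged sketch of the write_cache_pages() callback contract this relies on (demo_* names are hypothetical; the writepage_t signature is the kernel's):

/* write_cache_pages() walks the dirty pages tagged in the mapping and
 * invokes the callback for each locked page; plugging batches the
 * resulting bios much as generic_writepages() did internally. */
static int demo_writepage_cb(struct page *page,
			     struct writeback_control *wbc, void *data)
{
	return demo_writepage(page, wbc);	/* hypothetical per-page writeout */
}

static int demo_journalled_writepages(struct address_space *mapping,
				      struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, demo_writepage_cb, NULL);
	blk_finish_plug(&plug);
	return ret;
}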
Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-10-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 56f6ef52fe76..fd9a335d1411 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2712,6 +2712,12 @@ out: return err; } +static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, + void *data) +{ + return ext4_writepage(page, wbc); +} + static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; @@ -2738,7 +2744,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) goto out_writepages; if (ext4_should_journal_data(inode)) { - ret = generic_writepages(mapping, wbc); + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); + blk_finish_plug(&plug); goto out_writepages; } -- cgit From dae999602eeb0c5482bb62df7fda903f1edc0ff3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:15 +0100 Subject: ext4: stop providing .writepage hook Now we don't need the .writepage hook for anything anymore. Reclaim is fine with relying on .writepages to clean pages and we often couldn't do much from the .writepage callback anyway. We only need to provide the .migrate_folio callback for the ext4_journalled_aops - let's use buffer_migrate_folio_norefs() there so that buffers cannot be modified under jbd2's hands as that can cause data corruption. For example when commit code does writeout of transaction buffers in jbd2_journal_write_metadata_buffer(), we don't hold the page lock, have the page writeback bit set, or have the buffer locked. So the page migration code would go and happily migrate the page elsewhere while the copy is running, thus corrupting data.
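A sketch of the resulting ops table shape (demo_* handlers are placeholders; buffer_migrate_folio_norefs is the real helper the patch wires up):

/* Reclaim and sync now reach the filesystem only through .writepages,
 * and .migrate_folio refuses to hand out extra buffer references, so
 * jbd2's unlocked copy in jbd2_journal_write_metadata_buffer() sees
 * stable buffer contents during migration. */
static const struct address_space_operations demo_journalled_aops = {
	.read_folio	= demo_read_folio,
	.writepages	= demo_writepages,
	.dirty_folio	= demo_dirty_folio,
	.migrate_folio	= buffer_migrate_folio_norefs,
};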
Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-12-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fd9a335d1411..38b7db452d37 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3725,7 +3725,6 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis, static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, @@ -3743,7 +3742,6 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3752,6 +3750,7 @@ static const struct address_space_operations ext4_journalled_aops = { .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, + .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -3760,7 +3759,6 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, -- cgit From 1485f726c6dec1a1f85438f2962feaa3d585526f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:59:27 +0100 Subject: ext4: initialize quota before expanding inode in setproject ioctl Make sure we initialize quotas before possibly expanding inode space (and thus maybe needing to allocate external xattr block) in ext4_ioctl_setproject(). This prevents not accounting the necessary block allocation. Signed-off-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221207115937.26601-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 202953b5db49..8067ccda34e4 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -732,6 +732,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) if (ext4_is_quota_file(inode)) return err; + err = dquot_initialize(inode); + if (err) + return err; + err = ext4_get_inode_loc(inode, &iloc); if (err) return err; @@ -747,10 +751,6 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) brelse(iloc.bh); } - err = dquot_initialize(inode); - if (err) - return err; - handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); -- cgit From 8994d11395f8165b3deca1971946f549f0822630 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:59:28 +0100 Subject: ext4: avoid unaccounted block allocation when expanding inode When expanding inode space in ext4_expand_extra_isize_ea() we may need to allocate external xattr block. If quota is not initialized for the inode, the block allocation will not be accounted into quota usage. 
Make sure the quota is initialized before we try to expand inode space. Reported-by: Pengfei Xu Link: https://lore.kernel.org/all/Y5BT+k6xWqthZc1P@xpf.sh.intel.com Signed-off-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221207115937.26601-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 38b7db452d37..5a92e5186313 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5945,6 +5945,14 @@ static int __ext4_expand_extra_isize(struct inode *inode, return 0; } + /* + * We may need to allocate external xattr block so we need quotas + * initialized. Here we can be called with various locks held so we + * cannot affort to initialize quotas ourselves. So just bail. + */ + if (dquot_initialize_needed(inode)) + return -EAGAIN; + /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); -- cgit From cc12a6f25e07ed05d5825a1664b67a970842b2ca Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 8 Dec 2022 10:32:31 +0800 Subject: ext4: allocate extended attribute value in vmalloc area Now, the maximum length of an extended attribute value is 64K. The memory requested here does not need contiguous physical addresses, so it is appropriate to use kvmalloc to request memory. At the same time, this also copes with extended attribute values becoming longer in the future. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221208023233.1231330-3-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6bdd502527f8..b666d3bf8b38 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2548,7 +2548,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - buffer = kmalloc(value_size, GFP_NOFS); + buffer = kvmalloc(value_size, GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !buffer || !b_entry_name) { error = -ENOMEM; @@ -2600,7 +2600,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, error = 0; out: kfree(b_entry_name); - kfree(buffer); + kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) -- cgit From e4db04f7d3dbbe16680e0ded27ea2a65b10f766a Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 8 Dec 2022 10:32:33 +0800 Subject: ext4: fix inode leak in ext4_xattr_inode_create() on an error path The following issue occurs when doing setxattr with fault injection: [localhost]# fsck.ext4 -fn /dev/sda e2fsck 1.46.6-rc1 (12-Sep-2022) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Pass 3: Checking directory connectivity Pass 4: Checking reference counts Unattached zero-length inode 15. Clear? no Unattached inode 15 Connect to /lost+found? no Pass 5: Checking group summary information /dev/sda: ********** WARNING: Filesystem still has errors ********** /dev/sda: 15/655360 files (0.0% non-contiguous), 66755/2621440 blocks This occurs in 'ext4_xattr_inode_create()'. If 'ext4_mark_inode_dirty()' fails, i_nlink of the inode needs to be dropped; otherwise it will lead to an inode leak.
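A sketch of the unwind ordering the fix restores (simplified; the demo_* helpers are hypothetical stand-ins for the ext4 ones):

struct inode *demo_new_ea_inode(struct inode *parent);	/* hypothetical */
int demo_attach_jinode(struct inode *ea_inode);		/* hypothetical */
int demo_dec_ref(handle_t *handle, struct inode *ea_inode);	/* hypothetical */

/* Once the EA inode exists with a live reference, a failure on any
 * later step must drop that reference before iput(); otherwise the
 * inode stays allocated on disk with nothing pointing at it, which is
 * exactly the "unattached inode" fsck complains about. */
static struct inode *demo_create_ea_inode(handle_t *handle,
					  struct inode *parent)
{
	struct inode *ea_inode;
	int err;

	ea_inode = demo_new_ea_inode(parent);
	if (IS_ERR(ea_inode))
		return ea_inode;

	err = demo_attach_jinode(ea_inode);
	if (err) {
		if (demo_dec_ref(handle, ea_inode))	/* undo the link */
			pr_warn("cleanup dec ref failed\n");
		iput(ea_inode);
		return ERR_PTR(err);
	}
	return ea_inode;
}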
Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221208023233.1231330-5-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/xattr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b666d3bf8b38..7decaaf27e82 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1441,6 +1441,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, + "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } -- cgit From 1da18e38cb97e9521e93d63034521a9649524f64 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 8 Dec 2022 11:34:24 +0800 Subject: ext4: fix reserved cluster accounting in __es_remove_extent() When bigalloc is enabled, reserved cluster accounting for delayed allocation is handled in extent_status.c. With a corrupted file system, it's possible for this accounting to be incorrect, as discovered by Syzbot: EXT4-fs error (device loop0): ext4_validate_block_bitmap:398: comm rep: bg 0: block 5: invalid block bitmap EXT4-fs (loop0): Delayed block allocation failed for inode 18 at logical offset 0 with max blocks 32 with error 28 EXT4-fs (loop0): This should not happen!! Data will be lost EXT4-fs (loop0): Total free blocks count 0 EXT4-fs (loop0): Free/Dirty block details EXT4-fs (loop0): free_blocks=0 EXT4-fs (loop0): dirty_blocks=32 EXT4-fs (loop0): Block reservation details EXT4-fs (loop0): i_reserved_data_blocks=2 EXT4-fs (loop0): Inode 18 (00000000845cd634): i_reserved_data_blocks (1) not cleared! The above issue happens as follows: Assume: sbi->s_cluster_ratio = 16 Step1: Insert delay block [0, 31] -> ei->i_reserved_data_blocks=2 Step2: ext4_writepages mpage_map_and_submit_extent -> return failed mpage_release_unused_pages -> to release [0, 30] ext4_es_remove_extent -> remove lblk=0 end=30 __es_remove_extent -> len1=0 len2=31-30=1 __es_remove_extent: ... if (len2 > 0) { ... if (len1 > 0) { ... } else { es->es_lblk = end + 1; es->es_len = len2; ... } if (count_reserved) count_rsvd(inode, lblk, ...); goto out; -> will return but didn't calculate 'reserved' ... Step3: ext4_destroy_inode -> trigger "i_reserved_data_blocks (1) not cleared!" To solve the above issue, call 'get_rsvd()' before the goto out when 'len2 > 0'.
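The generic shape of the bug and fix, as a self-contained sketch (demo_* names are hypothetical): an early goto skipped a cleanup step that must run on every successful path, and routing it through an intermediate label restores the invariant.

struct demo_state;
int demo_shrink_other_cases(struct demo_state *st);	/* hypothetical */
void demo_account_reserved(struct demo_state *st);	/* plays the role of get_rsvd() */

static int demo_remove_range(struct demo_state *st, bool tail_split)
{
	int err = 0;

	if (tail_split) {
		/* ...trim the tail extent in place... */
		goto out_account;	/* was: goto out, which skipped accounting */
	}
	err = demo_shrink_other_cases(st);
	if (err)
		goto out;		/* genuine failures still bypass accounting */
out_account:
	demo_account_reserved(st);
out:
	return err;
}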
Reported-by: syzbot+05a0f0ccab4a25626e38@syzkaller.appspotmail.com Fixes: 8fcc3a580651 ("ext4: rework reserved cluster accounting when invalidating pages") Signed-off-by: Ye Bin Reviewed-by: Eric Whitney Link: https://lore.kernel.org/r/20221208033426.1832460-2-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents_status.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 97eccc0028a1..7bc221038c6c 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1369,7 +1369,7 @@ retry: if (count_reserved) count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, &orig_es, &rc); - goto out; + goto out_get_reserved; } if (len1 > 0) { @@ -1411,6 +1411,7 @@ retry: } } +out_get_reserved: if (count_reserved) *reserved = get_rsvd(inode, end, es, &rc); out: -- cgit From cfe4c1b25dd6d2f056afc00b7c98bcb3dd0b1fc3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 17:25:10 +0100 Subject: udf: Fix preallocation discarding at indirect extent boundary When preallocation extent is the first one in the extent block, the code would corrupt extent tree header instead. Fix the problem and use udf_delete_aext() for deleting extent to avoid some code duplication. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/truncate.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 532cda99644e..a9790fb32f5f 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -120,60 +120,41 @@ void udf_truncate_tail_extent(struct inode *inode) void udf_discard_prealloc(struct inode *inode) { - struct extent_position epos = { NULL, 0, {0, 0} }; + struct extent_position epos = {}; + struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1, netype; - int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || inode->i_size == iinfo->i_lenExtents) return; - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(struct short_ad); - else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(struct long_ad); - else - adsize = 0; - epos.block = iinfo->i_location; /* Find the last extent in the file */ - while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { - etype = netype; + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { + brelse(prev_epos.bh); + prev_epos = epos; + if (prev_epos.bh) + get_bh(prev_epos.bh); + + etype = udf_next_aext(inode, &epos, &eloc, &elen, 1); lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { - epos.offset -= adsize; lbcount -= elen; - extent_trunc(inode, &epos, &eloc, etype, elen, 0); - if (!epos.bh) { - iinfo->i_lenAlloc = - epos.offset - - udf_file_entry_alloc_offset(inode); - mark_inode_dirty(inode); - } else { - struct allocExtDesc *aed = - (struct allocExtDesc *)(epos.bh->b_data); - aed->lengthAllocDescs = - cpu_to_le32(epos.offset - - sizeof(struct allocExtDesc)); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos.bh->b_data, epos.offset); - else - udf_update_tag(epos.bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos.bh, inode); - } + udf_delete_aext(inode, prev_epos); + udf_free_blocks(inode->i_sb, inode, &eloc, 0, + DIV_ROUND_UP(elen, 1 
<< inode->i_blkbits)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; brelse(epos.bh); brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, -- cgit From 6ad53f0f71c52871202a7bf096feb2c59db33fc5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 17:34:33 +0100 Subject: udf: Do not bother looking for prealloc extents if i_lenExtents matches i_size If the block-rounded i_lenExtents matches the block-rounded i_size, there are no preallocation extents. Do not bother walking the extent linked list. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/truncate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index a9790fb32f5f..036ebd892b85 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -127,9 +127,10 @@ void udf_discard_prealloc(struct inode *inode) uint64_t lbcount = 0; int8_t etype = -1, netype; struct udf_inode_info *iinfo = UDF_I(inode); + int bsize = 1 << inode->i_blkbits; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || - inode->i_size == iinfo->i_lenExtents) + ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; epos.block = iinfo->i_location; -- cgit From 16d0556568148bdcaa45d077cac9f8f7077cf70a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 18:17:34 +0100 Subject: udf: Discard preallocation before extending file with a hole When extending a file with a hole, we tried to preserve the existing preallocation for the file. However that is not very useful and complicates the code, because the previous extent may need to be rounded to the block boundary as well (which we forgot to do, thus causing data corruption for a sequence like: xfs_io -f -c "pwrite 0x75e63 11008" -c "truncate 0x7b24b" \ -c "truncate 0xabaa3" -c "pwrite 0xac70b 22954" \ -c "pwrite 0x93a43 11358" -c "pwrite 0xb8e65 52211" file with 512-byte block size). Just discard the preallocation before extending the file to simplify things and also fix this data corruption. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/inode.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 0246b1b86fb9..44988e4c3fb2 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -434,6 +434,12 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } + /* + * Block beyond EOF and prealloc extents? Just discard preallocation + * as it is not useful and complicates things. + */ + if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) + udf_discard_prealloc(inode); udf_clear_extent_cache(inode); phys = inode_getblk(inode, block, &err, &new); if (!phys) @@ -483,8 +489,6 @@ static int udf_do_extend_file(struct inode *inode, uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; - struct kernel_lb_addr prealloc_loc = {}; - uint32_t prealloc_len = 0; struct udf_inode_info *iinfo; int err; @@ -505,19 +509,6 @@ static int udf_do_extend_file(struct inode *inode, ~(sb->s_blocksize - 1); } - /* Last extent are just preallocated blocks?
*/ - if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == - EXT_NOT_RECORDED_ALLOCATED) { - /* Save the extent so that we can reattach it to the end */ - prealloc_loc = last_ext->extLocation; - prealloc_len = last_ext->extLength; - /* Mark the extent as a hole */ - last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); - last_ext->extLocation.logicalBlockNum = 0; - last_ext->extLocation.partitionReferenceNum = 0; - } - /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { @@ -545,7 +536,7 @@ static int udf_do_extend_file(struct inode *inode, * more extents, we may need to enter possible following * empty indirect extent. */ - if (new_block_bytes || prealloc_len) + if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } @@ -579,17 +570,6 @@ static int udf_do_extend_file(struct inode *inode, } out: - /* Do we have some preallocated blocks saved? */ - if (prealloc_len) { - err = udf_add_aext(inode, last_pos, &prealloc_loc, - prealloc_len, 1); - if (err) - return err; - last_ext->extLocation = prealloc_loc; - last_ext->extLength = prealloc_len; - count++; - } - /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); @@ -642,8 +622,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) else BUG(); + /* + * When creating hole in file, just don't bother with preserving + * preallocation. It likely won't be very useful anyway. + */ + udf_discard_prealloc(inode); + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); within_final_block = (etype != -1); + /* We don't expect extents past EOF... */ + WARN_ON_ONCE(etype != -1 && + elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { @@ -772,10 +761,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, goto out_free; } - /* Are we beyond EOF? */ + /* Are we beyond EOF and preallocated extent? */ if (etype == -1) { int ret; loff_t hole_len; + isBeyondEOF = true; if (count) { if (c) -- cgit From 1f3868f06855c97a4954c99b36f3fc9eb8f60326 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Dec 2022 13:03:30 +0100 Subject: udf: Fix extending file within last block When extending file within last block it can happen that the extent is already rounded to the blocksize and thus contains the offset we want to grow up to. In such case we would mistakenly expand the last extent and make it one block longer than it should be, exposing unallocated block in a file and causing data corruption. Fix the problem by properly detecting this case and bailing out. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/inode.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 44988e4c3fb2..1d7c2a812fc1 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -585,13 +585,17 @@ out: static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, - uint32_t final_block_len) + uint32_t new_elen) { - struct super_block *sb = inode->i_sb; uint32_t added_bytes; - added_bytes = final_block_len - - (last_ext->extLength & (sb->s_blocksize - 1)); + /* + * Extent already large enough? 
It may be already rounded up to block + * size... + */ + if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) + return; + added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen; last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; @@ -608,12 +612,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; - unsigned long partial_final_block; + loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; - int within_final_block; + bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -629,9 +633,9 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) udf_discard_prealloc(inode); etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); - within_final_block = (etype != -1); + within_last_ext = (etype != -1); /* We don't expect extents past EOF... */ - WARN_ON_ONCE(etype != -1 && + WARN_ON_ONCE(within_last_ext && elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || @@ -648,19 +652,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) extent.extLength |= etype << 30; } - partial_final_block = newsize & (sb->s_blocksize - 1); + new_elen = ((loff_t)offset << inode->i_blkbits) | + (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? */ - if (within_final_block) { + if (within_last_ext) { /* Extending file within the last file block */ - udf_do_extend_final_block(inode, &epos, &extent, - partial_final_block); + udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { - loff_t add = ((loff_t)offset << sb->s_blocksize_bits) | - partial_final_block; - err = udf_do_extend_file(inode, &epos, &extent, add); + err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) -- cgit From 22ae4c114f77b55a4c5036e8f70409a0799a08f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Nov 2022 14:44:50 -0400 Subject: nfsd: fix up the filecache laundrette scheduling We don't really care whether there are hashed entries when it comes to scheduling the laundrette. They might all be non-gc entries, after all. We only want to schedule it if there are entries on the LRU. Switch to using list_lru_count, and move the check into nfsd_file_gc_worker. The other callsite in nfsd_file_put doesn't need to count entries, since it only schedules the laundrette after adding an entry to the LRU. 
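A sketch of the resulting scheduling policy (demo_* names are placeholders; list_lru_count() and queue_delayed_work() are the real APIs): the worker re-arms itself only while the LRU is non-empty, so an idle cache lets the delayed work expire.

static struct list_lru demo_file_lru;		/* assumed initialized elsewhere */
static void demo_gc(void);			/* hypothetical: reap expired entries */
static void demo_gc_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(demo_laundrette, demo_gc_worker);
#define DEMO_LAUNDRETTE_DELAY	(2 * HZ)	/* assumed interval */

static void demo_gc_worker(struct work_struct *work)
{
	demo_gc();
	/* Re-arm only if there is still something to age out. */
	if (list_lru_count(&demo_file_lru))
		queue_delayed_work(system_wq, &demo_laundrette,
				   DEMO_LAUNDRETTE_DELAY);
}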
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 1418bc889aa0..da7c9cb12403 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -211,12 +211,9 @@ static const struct rhashtable_params nfsd_file_rhash_params = { static void nfsd_file_schedule_laundrette(void) { - if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) || - test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) - return; - - queue_delayed_work(system_wq, &nfsd_filecache_laundrette, - NFSD_LAUNDRETTE_DELAY); + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) + queue_delayed_work(system_wq, &nfsd_filecache_laundrette, + NFSD_LAUNDRETTE_DELAY); } static void @@ -666,7 +663,8 @@ static void nfsd_file_gc_worker(struct work_struct *work) { nfsd_file_gc(); - nfsd_file_schedule_laundrette(); + if (list_lru_count(&nfsd_file_lru)) + nfsd_file_schedule_laundrette(); } static unsigned long -- cgit From d7064eaf688cfe454c50db9f59298463d80d403c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Nov 2022 16:22:48 -0400 Subject: NFSD: Add an nfsd_file_fsync tracepoint Add a tracepoint to capture the number of filecache-triggered fsync calls and which files needed it. Also, record when an fsync triggers a write verifier reset. Examples: <...>-97 [007] 262.505611: nfsd_file_free: inode=0xffff888171e08140 ref=0 flags=GC may=WRITE nf_file=0xffff8881373d2400 <...>-97 [007] 262.505612: nfsd_file_fsync: inode=0xffff888171e08140 ref=0 flags=GC may=WRITE nf_file=0xffff8881373d2400 ret=0 <...>-97 [007] 262.505623: nfsd_file_free: inode=0xffff888171e08dc0 ref=0 flags=GC may=WRITE nf_file=0xffff8881373d1e00 <...>-97 [007] 262.505624: nfsd_file_fsync: inode=0xffff888171e08dc0 ref=0 flags=GC may=WRITE nf_file=0xffff8881373d1e00 ret=0 Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/filecache.c | 5 ++++- fs/nfsd/trace.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index da7c9cb12403..1998b4d5f692 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -336,10 +336,13 @@ static void nfsd_file_fsync(struct nfsd_file *nf) { struct file *file = nf->nf_file; + int ret; if (!file || !(file->f_mode & FMODE_WRITE)) return; - if (vfs_fsync(file, 1) != 0) + ret = vfs_fsync(file, 1); + trace_nfsd_file_fsync(nf, ret); + if (ret) nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 0177cfa50556..33567b87cc6b 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1238,6 +1238,37 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); +TRACE_EVENT(nfsd_file_fsync, + TP_PROTO( + const struct nfsd_file *nf, + int ret + ), + TP_ARGS(nf, ret), + TP_STRUCT__entry( + __field(void *, nf_inode) + __field(int, nf_ref) + __field(int, ret) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->ret = ret; + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d", + __entry->nf_inode, + __entry->nf_ref, + 
show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file, __entry->ret + ) +); + #include "cache.h" TRACE_DEFINE_ENUM(RC_DROPIT); -- cgit From 18ebd35b61b4693a0ddc270b6d4f18def232e770 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 6 Nov 2022 14:02:39 -0500 Subject: lockd: set other missing fields when unlocking files vfs_lock_file() expects the struct file_lock to be fully initialised by the caller. Re-exported NFSv3 has been seen to Oops if the fl_file field is NULL. Fixes: aec158242b87 ("lockd: set fl_owner when unlocking files") Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Link: https://bugzilla.kernel.org/show_bug.cgi?id=216582 Signed-off-by: Chuck Lever --- fs/lockd/svcsubs.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e1c4617de771..3515f17eaf3f 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file) } } -static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) +static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl) { struct file_lock lock; @@ -184,12 +184,15 @@ static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - lock.fl_owner = owner; - if (file->f_file[O_RDONLY] && - vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) + lock.fl_owner = fl->fl_owner; + lock.fl_pid = fl->fl_pid; + lock.fl_flags = FL_POSIX; + + lock.fl_file = file->f_file[O_RDONLY]; + if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) goto out_err; - if (file->f_file[O_WRONLY] && - vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL)) + lock.fl_file = file->f_file[O_WRONLY]; + if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) goto out_err; return 0; out_err: @@ -226,7 +229,7 @@ again: if (match(lockhost, host)) { spin_unlock(&flctx->flc_lock); - if (nlm_unlock_files(file, fl->fl_owner)) + if (nlm_unlock_files(file, fl)) return 1; goto again; } -- cgit From 01d53a88c08951f88f2a42f1f1e6568928e0590e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 7 Nov 2022 06:58:41 -0500 Subject: nfsd: return error if nfs4_setacl fails With the addition of POSIX ACLs to struct nfsd_attrs, we no longer return an error if setting the ACL fails. Ensure we return the na_aclerr error on SETATTR if there is one. Fixes: c0cbe70742f4 ("NFSD: add posix ACLs to struct nfsd_attrs") Cc: Neil Brown Reported-by: Yongcheng Yang Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6f7e0c5e62d2..a6173677b766 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1135,6 +1135,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 0, (time64_t)0); if (!status) status = nfserrno(attrs.na_labelerr); + if (!status) + status = nfserrno(attrs.na_aclerr); out: nfsd_attrs_free(&attrs); fh_drop_write(&cstate->current_fh); -- cgit From 85a0d0c9a58002ef7d1bf5e3ea630f4fbd42a4f0 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 11 Nov 2022 17:18:35 +0800 Subject: NFSD: Use struct_size() helper in alloc_session() Use struct_size() helper to simplify the code, no functional changes. 
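For readers unfamiliar with struct_size(), here is a minimal user-space sketch of the size computation relied on above, using a simplified stand-in for struct nfsd4_session. STRUCT_SIZE open-codes sizeof(header) + n * sizeof(element); the real kernel helper also saturates on arithmetic overflow, which this sketch omits.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct nfsd4_session: a fixed header plus a
 * flexible array of slot pointers. */
struct session {
	int maxreqs;
	void *se_slots[];	/* flexible array member */
};

/* Open-coded equivalent of the kernel's struct_size() helper. */
#define STRUCT_SIZE(p, member, n) \
	(sizeof(*(p)) + (size_t)(n) * sizeof((p)->member[0]))

int main(void)
{
	int numslots = 16;
	struct session *sess = calloc(1, STRUCT_SIZE(sess, se_slots, numslots));

	if (!sess)
		return 1;
	sess->maxreqs = numslots;
	printf("allocated %zu bytes for %d slots\n",
	       STRUCT_SIZE(sess, se_slots, numslots), numslots);
	free(sess);
	return 0;
}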
Signed-off-by: Xiu Jianfeng Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2ec981fd2985..97badca3135e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1833,13 +1833,12 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, int numslots = fattrs->maxreqs; int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; - int mem, i; + int i; - BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) - + sizeof(struct nfsd4_session) > PAGE_SIZE); - mem = numslots * sizeof(struct nfsd4_slot *); + BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION) + > PAGE_SIZE); - new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL); if (!new) return NULL; /* allocate each struct nfsd4_slot and data cache in one piece */ -- cgit From 75c7940d2a86d3f1b60a0a265478cb8fc887b970 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:36 -0500 Subject: lockd: set missing fl_flags field when retrieving args Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 1 + fs/lockd/svcproc.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 284b019cb652..b72023a6b4c1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -52,6 +52,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, *filp = file; /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_flags = FL_POSIX; lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_start = (loff_t)lock->lock_start; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index e35c05e27806..32784f508c81 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -77,6 +77,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Set up the missing parts of the file_lock structure */ mode = lock_to_openmode(&lock->fl); + lock->fl.fl_flags = FL_POSIX; lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; -- cgit From 69efce009f7df888e1fede3cb2913690eb829f52 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:37 -0500 Subject: lockd: ensure we use the correct file descriptor when unlocking Shared locks are set on O_RDONLY descriptors and exclusive locks are set on O_WRONLY ones. nlmsvc_unlock however calls vfs_lock_file twice, once for each descriptor, but it doesn't reset fl_file. Ensure that it does. 
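A toy model of the bug class fixed here: a request structure reused across two calls must have its per-call field re-pointed before each call, or the second call acts on the first call's descriptor. All names below are illustrative stand-ins, not the lockd code.

#include <stdio.h>

/* Minimal stand-in for the reused lock request. */
struct file_lock { const char *fl_file; };

static void vfs_lock_file_stub(const struct file_lock *fl)
{
	printf("F_UNLCK via %s descriptor\n", fl->fl_file);
}

int main(void)
{
	struct file_lock fl = { .fl_file = "O_RDONLY" };

	vfs_lock_file_stub(&fl);	/* read-side unlock */
	fl.fl_file = "O_WRONLY";	/* the fix: re-point before reuse */
	vfs_lock_file_stub(&fl);	/* write-side unlock */
	return 0;
}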
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9c1aa75441e1..9eae99e08e69 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -659,11 +659,13 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; - if (file->f_file[O_RDONLY]) - error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, + lock->fl.fl_file = file->f_file[O_RDONLY]; + if (lock->fl.fl_file) + error = vfs_lock_file(lock->fl.fl_file, F_SETLK, &lock->fl, NULL); - if (file->f_file[O_WRONLY]) - error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, + lock->fl.fl_file = file->f_file[O_WRONLY]; + if (lock->fl.fl_file) + error |= vfs_lock_file(lock->fl.fl_file, F_SETLK, &lock->fl, NULL); return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; -- cgit From 9f27783b4dd235ef3c8dbf69fc6322777450323c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:38 -0500 Subject: lockd: fix file selection in nlmsvc_cancel_blocked We currently do a lock_to_openmode call based on the arguments from the NLM_UNLOCK call, but that will always set the fl_type of the lock to F_UNLCK, and the O_RDONLY descriptor is always chosen. Fix it to use the file_lock from the block instead. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9eae99e08e69..4e30f3c50970 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -699,9 +699,10 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); if (block != NULL) { - mode = lock_to_openmode(&lock->fl); - vfs_cancel_lock(block->b_file->f_file[mode], - &block->b_call->a_args.lock.fl); + struct file_lock *fl = &block->b_call->a_args.lock.fl; + + mode = lock_to_openmode(fl); + vfs_cancel_lock(block->b_file->f_file[mode], fl); status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); } -- cgit From 79a1d88a36f77374c77fd41a4386d8c2736b8704 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 16 Nov 2022 10:28:36 -0500 Subject: NFSD: pass range end to vfs_fsync_range() instead of count _nfsd_copy_file_range() calls vfs_fsync_range() with an offset and count (bytes written), but the latter wants the start and end bytes of the range to sync. Fix it up.
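A small sketch of the offset-and-count to range-end conversion the fix below applies. range_end() is an illustrative helper, not a kernel function; vfs_fsync_range() itself takes an inclusive [start, end] byte range.

#include <stdio.h>

/* A byte count converts to an inclusive range end as pos + count - 1. */
static long long range_end(long long pos, long long count)
{
	return pos + count - 1;
}

int main(void)
{
	/* 4096 bytes written at offset 0: sync [0, 4095], not [0, 4096]. */
	printf("end = %lld\n", range_end(0, 4096));
	/* 100 bytes written at offset 8192: sync [8192, 8291]. */
	printf("end = %lld\n", range_end(8192, 100));
	return 0;
}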
Fixes: eac0b17a77fb ("NFSD add vfs_fsync after async copy is done") Signed-off-by: Brian Foster Tested-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a6173677b766..73ed32ad23a2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1642,6 +1642,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; int status; + loff_t end; /* See RFC 7862 p.67: */ if (bytes_total == 0) @@ -1661,8 +1662,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, /* for a non-zero asynchronous copy do a commit of data */ if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) { since = READ_ONCE(dst->f_wb_err); - status = vfs_fsync_range(dst, copy->cp_dst_pos, - copy->cp_res.wr_bytes_written, 0); + end = copy->cp_dst_pos + copy->cp_res.wr_bytes_written - 1; + status = vfs_fsync_range(dst, copy->cp_dst_pos, end, 0); if (!status) status = filemap_check_wb_err(dst->f_mapping, since); if (!status) -- cgit From 247c01ff5f8d66e62a404c91733be52fecb8b7f6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Nov 2022 08:57:43 -0500 Subject: trace: Relocate event helper files Steven Rostedt says: > The include/trace/events/ directory should only hold files that > are to create events, not headers that hold helper functions. > > Can you please move them out of include/trace/events/ as that > directory is "special" in the creation of events. Signed-off-by: Chuck Lever Acked-by: Leon Romanovsky Acked-by: Steven Rostedt (Google) Acked-by: Anna Schumaker --- fs/nfs/nfs4trace.h | 6 +++--- fs/nfs/nfstrace.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2cff5901c689..633cc64a04da 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -9,10 +9,10 @@ #define _TRACE_NFS4_H #include -#include +#include -#include -#include +#include +#include #define show_nfs_fattr_flags(valid) \ __print_flags((unsigned long)valid, "|", \ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 8c6cc58679ff..642f6921852f 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -11,9 +11,9 @@ #include #include -#include -#include -#include +#include +#include +#include #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ -- cgit From a1049eb47f20b9eabf9afb218578fff16b4baca6 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:45 -0800 Subject: NFSD: refactoring courtesy_client_reaper to a generic low memory shrinker Refactoring courtesy_client_reaper to generic low memory shrinker so it can be used for other purposes. 
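The shape of that refactor, reduced to a minimal user-space sketch. The function names mirror the patch, but the bodies are illustrative only; in the kernel the worker is a delayed work item, not a direct call.

#include <stdio.h>

/* The workqueue callback becomes a thin generic worker, and the
 * courtesy-client logic is just one reaper it dispatches to. */
static void courtesy_client_reaper(void)
{
	printf("reaping courtesy clients\n");
}

static void state_shrinker_worker(void)
{
	courtesy_client_reaper();
	/* deleg_reaper() is added here by a later patch */
}

int main(void)
{
	state_shrinker_worker();	/* stands in for the delayed work */
	return 0;
}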
Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 97badca3135e..00b3fa164fe8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4361,7 +4361,7 @@ out: } static unsigned long -nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc) +nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int cnt; struct nfsd_net *nn = container_of(shrink, @@ -4374,7 +4374,7 @@ nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc) } static unsigned long -nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc) +nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { return SHRINK_STOP; } @@ -4401,8 +4401,8 @@ nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); atomic_set(&nn->nfsd_courtesy_clients, 0); - nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan; - nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count; + nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"); } @@ -6151,17 +6151,24 @@ laundromat_main(struct work_struct *laundry) } static void -courtesy_client_reaper(struct work_struct *reaper) +courtesy_client_reaper(struct nfsd_net *nn) { struct list_head reaplist; - struct delayed_work *dwork = to_delayed_work(reaper); - struct nfsd_net *nn = container_of(dwork, struct nfsd_net, - nfsd_shrinker_work); nfs4_get_courtesy_client_reaplist(nn, &reaplist); nfs4_process_client_reaplist(&reaplist); } +static void +nfsd4_state_shrinker_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + nfsd_shrinker_work); + + courtesy_client_reaper(nn); +} + static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) @@ -7985,7 +7992,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); - INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, courtesy_client_reaper); + INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); return 0; -- cgit From 3959066b697b5dfbb7141124ae9665337d4bc638 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:46 -0800 Subject: NFSD: add support for sending CB_RECALL_ANY Add XDR encode and decode function for CB_RECALL_ANY. 
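A rough user-space sketch of the wire format the new encoder produces for CB_RECALLANY4args: one XDR uint32 for craa_objects_to_keep, then a bitmap4 encoded as a word count followed by that many mask words, all big-endian. xdr_u32() is a hand-rolled stand-in for the kernel's xdr_stream helpers, and the bit position assumes RCA4_TYPE_MASK_RDATA_DLG = 0 from RFC 5661.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

static unsigned char buf[64];
static size_t off;

/* Append one big-endian 32-bit word, as the xdr_stream helpers do. */
static void xdr_u32(uint32_t v)
{
	uint32_t be = htonl(v);

	memcpy(buf + off, &be, sizeof(be));
	off += sizeof(be);
}

int main(void)
{
	uint32_t bmval[1] = { 1u << 0 };	/* RCA4_TYPE_MASK_RDATA_DLG */

	xdr_u32(0);		/* ra_keep: ask the client to keep none */
	xdr_u32(1);		/* bitmap4 length in words */
	xdr_u32(bmval[0]);	/* the mask word */
	printf("encoded %zu bytes\n", off);
	return 0;
}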
Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/state.h | 1 + fs/nfsd/xdr4.h | 5 ++++ fs/nfsd/xdr4cb.h | 6 +++++ 4 files changed, 84 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f0e69edf5f0f..1b57f2c2f0bb 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -76,6 +76,17 @@ static __be32 *xdr_encode_empty_array(__be32 *p) * 1 Protocol" */ +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ + WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0); +} + +static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, + size_t len) +{ + WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); +} + /* * nfs_cb_opnum4 * @@ -328,6 +339,24 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, hdr->nops++; } +/* + * CB_RECALLANY4args + * + * struct CB_RECALLANY4args { + * uint32_t craa_objects_to_keep; + * bitmap4 craa_type_mask; + * }; + */ +static void +encode_cb_recallany4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra) +{ + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY); + encode_uint32(xdr, ra->ra_keep); + encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval)); + hdr->nops++; +} + /* * CB_SEQUENCE4args * @@ -482,6 +511,26 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, encode_cb_nops(&hdr); } +/* + * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static void +nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfsd4_cb_recall_any *ra; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb); + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recallany4args(xdr, &hdr, ra); + encode_cb_nops(&hdr); +} /* * NFSv4.0 and NFSv4.1 XDR decode functions @@ -520,6 +569,28 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } +/* + * 20.6. 
Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static int +nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status); + return status; +} + #ifdef CONFIG_NFSD_PNFS /* * CB_LAYOUTRECALL4args @@ -783,6 +854,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { #endif PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), + PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index eadd7f465bf5..e30882f8b851 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -636,6 +636,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_OFFLOAD, NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, + NFSPROC4_CLNT_CB_RECALL_ANY, }; /* Returns true iff a is later than b: */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 0eb00105d845..4fd2cf6d1d2d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -896,5 +896,10 @@ struct nfsd4_operation { union nfsd4_op_u *); }; +struct nfsd4_cb_recall_any { + struct nfsd4_callback ra_cb; + u32 ra_keep; + u32 ra_bmval[1]; +}; #endif diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 547cf07cf4e0..0d39af1b00a0 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -48,3 +48,9 @@ #define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define NFS4_enc_cb_recall_any_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 1 + 1) +#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) -- cgit From 44df6f439a1790a5f602e3842879efa88f346672 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:47 -0800 Subject: NFSD: add delegation reaper to react to low memory condition The delegation reaper is called by the nfsd memory shrinker's 'count' callback. It scans the client list and sends the courtesy CB_RECALL_ANY to the clients that hold delegations. To avoid flooding the clients with CB_RECALL_ANY requests, the delegation reaper sends only one CB_RECALL_ANY request to each client per 5 seconds.
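The per-client throttle described above, reduced to a single-client user-space sketch. may_send_recall_any() is illustrative; in the patch the timestamp lives in cl_ra_time and is checked under the client lock.

#include <stdio.h>
#include <time.h>

struct client { time_t ra_time; };	/* models cl_ra_time */

/* Allow at most one CB_RECALL_ANY per client per 5 seconds. */
static int may_send_recall_any(struct client *clp, time_t now)
{
	if (now - clp->ra_time < 5)
		return 0;		/* throttled */
	clp->ra_time = now;		/* record this send */
	return 1;
}

int main(void)
{
	struct client c = { .ra_time = 0 };
	time_t now = time(NULL);

	printf("first:  %d\n", may_send_recall_any(&c, now));	/* 1 */
	printf("again:  %d\n", may_send_recall_any(&c, now));	/* 0 */
	printf("+5 sec: %d\n", may_send_recall_any(&c, now + 5)); /* 1 */
	return 0;
}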
Signed-off-by: Dai Ngo [ cel: moved definition of RCA4_TYPE_MASK_RDATA_DLG ] Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--- fs/nfsd/state.h | 5 +++ 2 files changed, 89 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 00b3fa164fe8..c981704ec6a1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2144,6 +2144,7 @@ static void __free_client(struct kref *k) kfree(clp->cl_nii_domain.data); kfree(clp->cl_nii_name.data); idr_destroy(&clp->cl_stateids); + kfree(clp->cl_ra); kmem_cache_free(client_slab, clp); } @@ -2871,6 +2872,36 @@ static const struct tree_descr client_files[] = { [3] = {""}, }; +static int +nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); + put_client_renew_locked(clp); + spin_unlock(&nn->client_lock); +} + +static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { + .done = nfsd4_cb_recall_any_done, + .release = nfsd4_cb_recall_any_release, +}; + static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -2908,6 +2939,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name, free_client(clp); return NULL; } + clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL); + if (!clp->cl_ra) { + free_client(clp); + return NULL; + } + clp->cl_ra_time = 0; + nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops, + NFSPROC4_CLNT_CB_RECALL_ANY); return clp; } @@ -4363,14 +4402,16 @@ out: static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - int cnt; + int count; struct nfsd_net *nn = container_of(shrink, struct nfsd_net, nfsd_client_shrinker); - cnt = atomic_read(&nn->nfsd_courtesy_clients); - if (cnt > 0) + count = atomic_read(&nn->nfsd_courtesy_clients); + if (!count) + count = atomic_long_read(&num_delegations); + if (count) mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); - return (unsigned long)cnt; + return (unsigned long)count; } static unsigned long @@ -6159,6 +6200,44 @@ courtesy_client_reaper(struct nfsd_net *nn) nfs4_process_client_reaplist(&reaplist); } +static void +deleg_reaper(struct nfsd_net *nn) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + struct list_head cblist; + + INIT_LIST_HEAD(&cblist); + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state != NFSD4_ACTIVE || + list_empty(&clp->cl_delegations) || + atomic_read(&clp->cl_delegs_in_recall) || + test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || + (ktime_get_boottime_seconds() - + clp->cl_ra_time < 5)) { + continue; + } + list_add(&clp->cl_ra_cblist, &cblist); + + /* release in nfsd4_cb_recall_any_release */ + atomic_inc(&clp->cl_rpc_users); + set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); + clp->cl_ra_time = ktime_get_boottime_seconds(); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&cblist)) { + clp = list_first_entry(&cblist, struct nfs4_client, + cl_ra_cblist); + 
list_del_init(&clp->cl_ra_cblist); + clp->cl_ra->ra_keep = 0; + clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + nfsd4_run_cb(&clp->cl_ra->ra_cb); + } +} + static void nfsd4_state_shrinker_worker(struct work_struct *work) { @@ -6167,6 +6246,7 @@ nfsd4_state_shrinker_worker(struct work_struct *work) nfsd_shrinker_work); courtesy_client_reaper(nn); + deleg_reaper(nn); } static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e30882f8b851..e94634d30591 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -368,6 +368,7 @@ struct nfs4_client { #define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) +#define NFSD4_CLIENT_CB_RECALL_ANY (6) unsigned long cl_flags; const struct cred *cl_cb_cred; struct rpc_clnt *cl_cb_client; @@ -411,6 +412,10 @@ struct nfs4_client { unsigned int cl_state; atomic_t cl_delegs_in_recall; + + struct nfsd4_cb_recall_any *cl_ra; + time64_t cl_ra_time; + struct list_head cl_ra_cblist; }; /* struct nfs4_client_reset -- cgit From 638593be55c0b37a1930038460a9918215d5c24b Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:48 -0800 Subject: NFSD: add CB_RECALL_ANY tracepoints Add tracepoints to trace start and end of CB_RECALL_ANY operation. Signed-off-by: Dai Ngo [ cel: added show_rca_mask() macro ] Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 ++ fs/nfsd/trace.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c981704ec6a1..e1e85c21f12b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2876,6 +2876,7 @@ static int nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, struct rpc_task *task) { + trace_nfsd_cb_recall_any_done(cb, task); switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); @@ -6234,6 +6235,7 @@ deleg_reaper(struct nfsd_net *nn) list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 33567b87cc6b..46b8f68a2497 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,9 +9,12 @@ #define _NFSD_TRACE_H #include +#include +#include #include "export.h" #include "nfsfh.h" +#include "xdr4.h" #define NFSD_TRACE_PROC_RES_FIELDS \ __field(unsigned int, netns_ino) \ @@ -1562,6 +1565,32 @@ TRACE_EVENT(nfsd_cb_offload, __entry->fh_hash, __entry->count, __entry->status) ); +TRACE_EVENT(nfsd_cb_recall_any, + TP_PROTO( + const struct nfsd4_cb_recall_any *ra + ), + TP_ARGS(ra), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, keep) + __field(unsigned long, bmval0) + __sockaddr(addr, ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = ra->ra_cb.cb_clp->cl_clientid.cl_boot; + __entry->cl_id = ra->ra_cb.cb_clp->cl_clientid.cl_id; + __entry->keep = ra->ra_keep; + __entry->bmval0 = ra->ra_bmval[0]; + __assign_sockaddr(addr, &ra->ra_cb.cb_clp->cl_addr, + ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen); + ), + TP_printk("addr=%pISpc client %08x:%08x keep=%u bmval0=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->keep, show_rca_mask(__entry->bmval0) + ) +); + DECLARE_EVENT_CLASS(nfsd_cb_done_class, TP_PROTO( const stateid_t *stp, @@ -1601,6 +1630,27 @@ 
DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done); DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done); DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done); +TRACE_EVENT(nfsd_cb_recall_any_done, + TP_PROTO( + const struct nfsd4_callback *cb, + const struct rpc_task *task + ), + TP_ARGS(cb, task), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(int, status) + ), + TP_fast_assign( + __entry->status = task->tk_status; + __entry->cl_boot = cb->cb_clp->cl_clientid.cl_boot; + __entry->cl_id = cb->cb_clp->cl_clientid.cl_id; + ), + TP_printk("client %08x:%08x status=%d", + __entry->cl_boot, __entry->cl_id, __entry->status + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH -- cgit From 9315564747cb6a570e99196b3a4880fb817635fd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 26 Nov 2022 15:55:30 -0500 Subject: NFSD: Use only RQ_DROPME to signal the need to drop a reply Clean up: NFSv2 has the only two usages of rpc_drop_reply in the NFSD code base. Since NFSv2 is going away at some point, replace these in order to simplify the "drop this reply?" check in nfsd_dispatch(). Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/nfsproc.c | 4 ++-- fs/nfsd/nfssvc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 52fc222c34f2..a5570cf75f3f 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -211,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - return rpc_drop_reply; + __set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } @@ -246,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - return rpc_drop_reply; + __set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 62e473b0ca52..56fba1cba3af 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1060,7 +1060,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) svcxdr_init_encode(rqstp); *statp = proc->pc_func(rqstp); - if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) + if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) -- cgit From e78e274eb22d966258a3845acc71d3c5b8ee2ea8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Dec 2022 12:48:59 -0800 Subject: NFSD: Avoid clashing function prototypes When built with Control Flow Integrity, function prototypes between caller and function declaration must match. These mismatches are visible at compile time with the new -Wcast-function-type-strict in Clang[1]. There were 97 warnings produced by NFS. For example: fs/nfsd/nfs4xdr.c:2228:17: warning: cast from '__be32 (*)(struct nfsd4_compoundargs *, struct nfsd4_access *)' (aka 'unsigned int (*)(struct nfsd4_compoundargs *, struct nfsd4_access *)') to 'nfsd4_dec' (aka 'unsigned int (*)(struct nfsd4_compoundargs *, void *)') converts to incompatible function type [-Wcast-function-type-strict] [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The enc/dec callbacks were defined as passing "void *" as the second argument, but were being implicitly cast to a new type. Replace the argument with union nfsd4_op_u, and perform explicit member selection in the function body. 
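The same pattern, reduced to a self-contained sketch: a hypothetical two-member union stands in for the much larger union nfsd4_op_u, and the ops table stores each handler without any function cast, so caller and callee prototypes match exactly under CFI.

#include <stdio.h>

union op_u {
	int access;
	long close;
};

/* One exact prototype shared by every table entry. */
typedef int (*op_dec)(union op_u *u);

static int decode_access(union op_u *u)
{
	int *access = &u->access;	/* explicit member selection */

	return *access + 1;
}

static const op_dec ops[] = {
	decode_access,			/* no (op_dec) cast needed */
};

int main(void)
{
	union op_u u = { .access = 41 };

	printf("%d\n", ops[0](&u));	/* prints 42 */
	return 0;
}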
There are no resulting binary differences. Changes were made mechanically using the following Coccinelle script, with minor by-hand fixes for members that didn't already match their existing argument name: @find@ identifier func; type T, opsT; identifier ops, N; @@ opsT ops[] = { [N] = (T) func, }; @already_void@ identifier find.func; identifier name; @@ func(..., -void +union nfsd4_op_u *name) { ... } @proto depends on !already_void@ identifier find.func; type T; identifier name; position p; @@ func@p(..., T name ) { ... } @script:python get_member@ type_name << proto.T; member; @@ coccinelle.member = cocci.make_ident(type_name.split("_", 1)[1].split(' ',1)[0]) @convert@ identifier find.func; type proto.T; identifier proto.name; position proto.p; identifier get_member.member; @@ func@p(..., - T name + union nfsd4_op_u *u ) { + T name = &u->member; ... } @cast@ identifier find.func; type T, opsT; identifier ops, N; @@ opsT ops[] = { [N] = - (T) func, }; Cc: Chuck Lever Cc: Jeff Layton Cc: Gustavo A. R. Silva Cc: linux-nfs@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 632 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 377 insertions(+), 255 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1e4990fb1692..2b4ae858c89b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -770,16 +770,18 @@ nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, - struct nfsd4_access *access) + union nfsd4_op_u *u) { + struct nfsd4_access *access = &u->access; if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0) return nfserr_bad_xdr; return nfs_ok; } static __be32 -nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +nfsd4_decode_close(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_close *close = &u->close; if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0) return nfserr_bad_xdr; return nfsd4_decode_stateid4(argp, &close->cl_stateid); @@ -787,8 +789,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) static __be32 -nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_commit *commit = &u->commit; if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) @@ -798,8 +801,9 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit } static __be32 -nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; __be32 *p, status; memset(create, 0, sizeof(*create)); @@ -844,22 +848,25 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create } static inline __be32 -nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_delegreturn *dr = &u->delegreturn; return nfsd4_decode_stateid4(argp, &dr->dr_stateid); } static inline __be32 -nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + 
struct nfsd4_getattr *getattr = &u->getattr; memset(getattr, 0, sizeof(*getattr)); return nfsd4_decode_bitmap4(argp, getattr->ga_bmval, ARRAY_SIZE(getattr->ga_bmval)); } static __be32 -nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +nfsd4_decode_link(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_link *link = &u->link; memset(link, 0, sizeof(*link)); return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); } @@ -907,8 +914,9 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) } static __be32 -nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lock *lock = &u->lock; memset(lock, 0, sizeof(*lock)); if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0) return nfserr_bad_xdr; @@ -924,8 +932,9 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) } static __be32 -nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lockt *lockt = &u->lockt; memset(lockt, 0, sizeof(*lockt)); if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0) return nfserr_bad_xdr; @@ -940,8 +949,9 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) } static __be32 -nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_locku *locku = &u->locku; __be32 status; if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0) @@ -962,8 +972,9 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) } static __be32 -nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lookup *lookup = &u->lookup; return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len); } @@ -1143,8 +1154,9 @@ nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +nfsd4_decode_open(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_open *open = &u->open; __be32 status; u32 dummy; @@ -1171,8 +1183,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) } static __be32 -nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_open_confirm *open_conf = &u->open_confirm; __be32 status; if (argp->minorversion >= 1) @@ -1190,8 +1204,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con } static __be32 -nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_open_downgrade *open_down = &u->open_downgrade; __be32 status; memset(open_down, 0, sizeof(*open_down)); @@ -1209,8 +1225,9 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d } static __be32 -nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_putfh *putfh = 
&u->putfh; __be32 *p; if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0) @@ -1229,7 +1246,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) } static __be32 -nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { if (argp->minorversion == 0) return nfs_ok; @@ -1237,8 +1254,9 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) } static __be32 -nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; __be32 status; memset(read, 0, sizeof(*read)); @@ -1254,8 +1272,9 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) } static __be32 -nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_readdir *readdir = &u->readdir; __be32 status; memset(readdir, 0, sizeof(*readdir)); @@ -1276,15 +1295,17 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read } static __be32 -nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_remove *remove = &u->remove; memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo)); return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen); } static __be32 -nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_rename *rename = &u->rename; __be32 status; memset(rename, 0, sizeof(*rename)); @@ -1295,22 +1316,25 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename } static __be32 -nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + clientid_t *clientid = &u->renew; return nfsd4_decode_clientid4(argp, clientid); } static __be32 nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo *secinfo = &u->secinfo; secinfo->si_exp = NULL; return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen); } static __be32 -nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; __be32 status; memset(setattr, 0, sizeof(*setattr)); @@ -1324,8 +1348,9 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta } static __be32 -nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setclientid *setclientid = &u->setclientid; __be32 *p, status; memset(setclientid, 0, sizeof(*setclientid)); @@ -1367,8 +1392,10 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient } static __be32 -nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_setclientid_confirm *scd_c = 
&u->setclientid_confirm; __be32 status; if (argp->minorversion >= 1) @@ -1382,8 +1409,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s /* Also used for NVERIFY */ static __be32 -nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_verify *verify = &u->verify; __be32 *p, status; memset(verify, 0, sizeof(*verify)); @@ -1409,8 +1437,9 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify } static __be32 -nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +nfsd4_decode_write(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_write *write = &u->write; __be32 status; status = nfsd4_decode_stateid4(argp, &write->wr_stateid); @@ -1434,8 +1463,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) } static __be32 -nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; __be32 status; if (argp->minorversion >= 1) @@ -1452,16 +1483,20 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel return nfs_ok; } -static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl; memset(bc, 0, sizeof(*bc)); if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0) return nfserr_bad_xdr; return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); } -static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; u32 use_conn_in_rdma_mode; __be32 status; @@ -1603,8 +1638,9 @@ nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, - struct nfsd4_exchange_id *exid) + union nfsd4_op_u *u) { + struct nfsd4_exchange_id *exid = &u->exchange_id; __be32 status; memset(exid, 0, sizeof(*exid)); @@ -1656,8 +1692,9 @@ nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, - struct nfsd4_create_session *sess) + union nfsd4_op_u *u) { + struct nfsd4_create_session *sess = &u->create_session; __be32 status; memset(sess, 0, sizeof(*sess)); @@ -1681,23 +1718,26 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_session *destroy_session) + union nfsd4_op_u *u) { + struct nfsd4_destroy_session *destroy_session = &u->destroy_session; return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid); } static __be32 nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, - struct nfsd4_free_stateid *free_stateid) + union nfsd4_op_u *u) { + struct nfsd4_free_stateid *free_stateid = &u->free_stateid; return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid); } #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_decode_getdeviceinfo(struct 
nfsd4_compoundargs *argp, - struct nfsd4_getdeviceinfo *gdev) + union nfsd4_op_u *u) { + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; __be32 status; memset(gdev, 0, sizeof(*gdev)); @@ -1717,8 +1757,9 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutcommit *lcp) + union nfsd4_op_u *u) { + struct nfsd4_layoutcommit *lcp = &u->layoutcommit; __be32 *p, status; memset(lcp, 0, sizeof(*lcp)); @@ -1753,8 +1794,9 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutget *lgp) + union nfsd4_op_u *u) { + struct nfsd4_layoutget *lgp = &u->layoutget; __be32 status; memset(lgp, 0, sizeof(*lgp)); @@ -1781,8 +1823,9 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; memset(lrp, 0, sizeof(*lrp)); if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0) return nfserr_bad_xdr; @@ -1795,8 +1838,9 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, #endif /* CONFIG_NFSD_PNFS */ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo_no_name *sin) + union nfsd4_op_u *u) { + struct nfsd4_secinfo_no_name *sin = &u->secinfo_no_name; if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0) return nfserr_bad_xdr; @@ -1806,8 +1850,9 @@ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { + struct nfsd4_sequence *seq = &u->sequence; __be32 *p, status; status = nfsd4_decode_sessionid4(argp, &seq->sessionid); @@ -1826,8 +1871,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; __be32 status; u32 i; @@ -1852,14 +1899,16 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta } static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_clientid *dc) + union nfsd4_op_u *u) { + struct nfsd4_destroy_clientid *dc = &u->destroy_clientid; return nfsd4_decode_clientid4(argp, &dc->clientid); } static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, - struct nfsd4_reclaim_complete *rc) + union nfsd4_op_u *u) { + struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0) return nfserr_bad_xdr; return nfs_ok; @@ -1867,8 +1916,9 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, - struct nfsd4_fallocate *fallocate) + union nfsd4_op_u *u) { + struct nfsd4_fallocate *fallocate = &u->allocate; __be32 status; status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid); @@ -1924,8 +1974,9 @@ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +nfsd4_decode_copy(struct 
nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; u32 consecutive, i, count, sync; struct nl4_server *ns_dummy; __be32 status; @@ -1982,8 +2033,9 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) static __be32 nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, - struct nfsd4_copy_notify *cn) + union nfsd4_op_u *u) { + struct nfsd4_copy_notify *cn = &u->copy_notify; __be32 status; memset(cn, 0, sizeof(*cn)); @@ -2002,16 +2054,18 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; os->count = 0; os->status = 0; return nfsd4_decode_stateid4(argp, &os->stateid); } static __be32 -nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_seek *seek = &u->seek; __be32 status; status = nfsd4_decode_stateid4(argp, &seek->seek_stateid); @@ -2028,8 +2082,9 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) } static __be32 -nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_clone *clone = &u->clone; __be32 status; status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid); @@ -2154,8 +2209,9 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) */ static __be32 nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, - struct nfsd4_getxattr *getxattr) + union nfsd4_op_u *u) { + struct nfsd4_getxattr *getxattr = &u->getxattr; __be32 status; u32 maxcount; @@ -2173,8 +2229,9 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, - struct nfsd4_setxattr *setxattr) + union nfsd4_op_u *u) { + struct nfsd4_setxattr *setxattr = &u->setxattr; u32 flags, maxcount, size; __be32 status; @@ -2214,8 +2271,9 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, - struct nfsd4_listxattrs *listxattrs) + union nfsd4_op_u *u) { + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; u32 maxcount; memset(listxattrs, 0, sizeof(*listxattrs)); @@ -2245,113 +2303,114 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp, - struct nfsd4_removexattr *removexattr) + union nfsd4_op_u *u) { + struct nfsd4_removexattr *removexattr = &u->removexattr; memset(removexattr, 0, sizeof(*removexattr)); return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name); } static __be32 -nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfs_ok; } static __be32 -nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfserr_notsupp; } -typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u); static const nfsd4_dec nfsd4_dec_ops[] = { - [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, - [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, - [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, - [OP_CREATE] = 
(nfsd4_dec)nfsd4_decode_create, - [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, - [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, - [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, - [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, - [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, - [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, - [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, - [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, - [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, - [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, - [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh, - [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_READ] = (nfsd4_dec)nfsd4_decode_read, - [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, - [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, - [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, - [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, - [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, - [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, - [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, - [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, - [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, + [OP_ACCESS] = nfsd4_decode_access, + [OP_CLOSE] = nfsd4_decode_close, + [OP_COMMIT] = nfsd4_decode_commit, + [OP_CREATE] = nfsd4_decode_create, + [OP_DELEGPURGE] = nfsd4_decode_notsupp, + [OP_DELEGRETURN] = nfsd4_decode_delegreturn, + [OP_GETATTR] = nfsd4_decode_getattr, + [OP_GETFH] = nfsd4_decode_noop, + [OP_LINK] = nfsd4_decode_link, + [OP_LOCK] = nfsd4_decode_lock, + [OP_LOCKT] = nfsd4_decode_lockt, + [OP_LOCKU] = nfsd4_decode_locku, + [OP_LOOKUP] = nfsd4_decode_lookup, + [OP_LOOKUPP] = nfsd4_decode_noop, + [OP_NVERIFY] = nfsd4_decode_verify, + [OP_OPEN] = nfsd4_decode_open, + [OP_OPENATTR] = nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade, + [OP_PUTFH] = nfsd4_decode_putfh, + [OP_PUTPUBFH] = nfsd4_decode_putpubfh, + [OP_PUTROOTFH] = nfsd4_decode_noop, + [OP_READ] = nfsd4_decode_read, + [OP_READDIR] = nfsd4_decode_readdir, + [OP_READLINK] = nfsd4_decode_noop, + [OP_REMOVE] = nfsd4_decode_remove, + [OP_RENAME] = nfsd4_decode_rename, + [OP_RENEW] = nfsd4_decode_renew, + [OP_RESTOREFH] = nfsd4_decode_noop, + [OP_SAVEFH] = nfsd4_decode_noop, + [OP_SECINFO] = nfsd4_decode_secinfo, + [OP_SETATTR] = nfsd4_decode_setattr, + [OP_SETCLIENTID] = nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = nfsd4_decode_verify, + [OP_WRITE] = nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_decode_release_lockowner, /* new operations for NFSv4.1 */ - [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, - [OP_FREE_STATEID] = 
(nfsd4_dec)nfsd4_decode_free_stateid, - [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BACKCHANNEL_CTL] = nfsd4_decode_backchannel_ctl, + [OP_BIND_CONN_TO_SESSION] = nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = nfsd4_decode_free_stateid, + [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_decode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] = nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_notsupp, + [OP_LAYOUTGET] = nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = nfsd4_decode_notsupp, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, - [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, - [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + [OP_SECINFO_NO_NAME] = nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_decode_sequence, + [OP_SET_SSV] = nfsd4_decode_notsupp, + [OP_TEST_STATEID] = nfsd4_decode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = nfsd4_decode_destroy_clientid, + [OP_RECLAIM_COMPLETE] = nfsd4_decode_reclaim_complete, /* new operations for NFSv4.2 */ - [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, - [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify, - [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read, - [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, - [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, + [OP_ALLOCATE] = nfsd4_decode_fallocate, + [OP_COPY] = nfsd4_decode_copy, + [OP_COPY_NOTIFY] = nfsd4_decode_copy_notify, + [OP_DEALLOCATE] = nfsd4_decode_fallocate, + [OP_IO_ADVISE] = nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = nfsd4_decode_offload_status, + [OP_OFFLOAD_STATUS] = nfsd4_decode_offload_status, + [OP_READ_PLUS] = nfsd4_decode_read, + [OP_SEEK] = nfsd4_decode_seek, + [OP_WRITE_SAME] = nfsd4_decode_notsupp, + [OP_CLONE] = 
nfsd4_decode_clone, /* RFC 8276 extended atributes operations */ - [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr, - [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr, - [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs, - [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr, + [OP_GETXATTR] = nfsd4_decode_getxattr, + [OP_SETXATTR] = nfsd4_decode_setxattr, + [OP_LISTXATTRS] = nfsd4_decode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_decode_removexattr, }; static inline bool @@ -3630,8 +3689,10 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) } static __be32 -nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_access *access = &u->access; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3643,8 +3704,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ return 0; } -static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3660,8 +3723,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, } static __be32 -nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_close *close = &u->close; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &close->cl_stateid); @@ -3669,8 +3734,10 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 -nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_commit *commit = &u->commit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3683,8 +3750,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3697,8 +3766,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_getattr *getattr = &u->getattr; struct svc_fh *fhp = getattr->ga_fhp; struct xdr_stream *xdr = resp->xdr; @@ -3707,8 +3778,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } static __be32 -nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct svc_fh **fhpp = &u->getfh; struct xdr_stream *xdr = resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; @@ -3762,8 +3835,10 @@ again: } static __be32 -nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, 
struct nfsd4_lock *lock) +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_lock *lock = &u->lock; struct xdr_stream *xdr = resp->xdr; if (!nfserr) @@ -3775,8 +3850,10 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo } static __be32 -nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_lockt *lockt = &u->lockt; struct xdr_stream *xdr = resp->xdr; if (nfserr == nfserr_denied) @@ -3785,8 +3862,10 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l } static __be32 -nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_locku *locku = &u->locku; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &locku->lu_stateid); @@ -3794,8 +3873,10 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 -nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_link *link = &u->link; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3808,8 +3889,10 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 -nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open *open = &u->open; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3902,16 +3985,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } static __be32 -nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open_confirm *oc = &u->open_confirm; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); } static __be32 -nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open_downgrade *od = &u->open_downgrade; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &od->od_stateid); @@ -4010,8 +4097,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); unsigned long maxcount; struct xdr_stream *xdr = resp->xdr; @@ -4052,8 +4140,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_readlink *readlink = &u->readlink; __be32 *p, *maxcount_p, zero = xdr_zero; struct xdr_stream *xdr = resp->xdr; int length_offset = xdr->buf->len; @@ -4097,8 +4187,10 @@ out_err: } static __be32 
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_readdir *readdir = &u->readdir; int maxcount; int bytes_left; loff_t offset; @@ -4188,8 +4280,10 @@ err_no_verf: } static __be32 -nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_remove *remove = &u->remove; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4201,8 +4295,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_rename *rename = &u->rename; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4284,8 +4380,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo *secinfo = &u->secinfo; struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); @@ -4293,8 +4390,9 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo_no_name *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name; struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); @@ -4305,8 +4403,10 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, * regardless of the error status. 
*/ static __be32 -nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4329,8 +4429,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } static __be32 -nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_setclientid *scd = &u->setclientid; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4353,8 +4455,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n } static __be32 -nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_write *write = &u->write; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4370,8 +4474,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_exchange_id *exid) + union nfsd4_op_u *u) { + struct nfsd4_exchange_id *exid = &u->exchange_id; struct xdr_stream *xdr = resp->xdr; __be32 *p; char *major_id; @@ -4448,8 +4553,9 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_create_session *sess) + union nfsd4_op_u *u) { + struct nfsd4_create_session *sess = &u->create_session; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4501,8 +4607,9 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { + struct nfsd4_sequence *seq = &u->sequence; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4524,8 +4631,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_test_stateid *test_stateid) + union nfsd4_op_u *u) { + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct xdr_stream *xdr = resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; @@ -4545,8 +4653,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getdeviceinfo *gdev) + union nfsd4_op_u *u) { + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; @@ -4601,8 +4710,9 @@ toosmall: static __be32 nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutget *lgp) + union nfsd4_op_u *u) { + struct nfsd4_layoutget *lgp = &u->layoutget; struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; __be32 *p; @@ -4628,8 +4738,9 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutcommit *lcp) + union nfsd4_op_u *u) { + struct 
nfsd4_layoutcommit *lcp = &u->layoutcommit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4649,8 +4760,9 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4735,8 +4847,9 @@ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) static __be32 nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy *copy) + union nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; __be32 *p; nfserr = nfsd42_encode_write_res(resp, ©->cp_res, @@ -4752,8 +4865,9 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4803,8 +4917,9 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; int starting_len = xdr->buf->len; @@ -4840,8 +4955,9 @@ out: static __be32 nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy_notify *cn) + union nfsd4_op_u *u) { + struct nfsd4_copy_notify *cn = &u->copy_notify; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4875,8 +4991,9 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_seek *seek) + union nfsd4_op_u *u) { + struct nfsd4_seek *seek = &u->seek; __be32 *p; p = xdr_reserve_space(resp->xdr, 4 + 8); @@ -4887,7 +5004,8 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *p) { return nfserr; } @@ -4938,8 +5056,9 @@ nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen) static __be32 nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getxattr *getxattr) + union nfsd4_op_u *u) { + struct nfsd4_getxattr *getxattr = &u->getxattr; struct xdr_stream *xdr = resp->xdr; __be32 *p, err; @@ -4962,8 +5081,9 @@ nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_setxattr *setxattr) + union nfsd4_op_u *u) { + struct nfsd4_setxattr *setxattr = &u->setxattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -5003,8 +5123,9 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, static __be32 nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_listxattrs *listxattrs) + union nfsd4_op_u *u) { + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; struct xdr_stream *xdr = resp->xdr; u32 cookie_offset, count_offset, eof; u32 left, xdrleft, slen, count; @@ -5114,8 +5235,9 @@ out: static __be32 nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_removexattr 
*removexattr) + union nfsd4_op_u *u) { + struct nfsd4_removexattr *removexattr = &u->removexattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -5127,7 +5249,7 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, return 0; } -typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u); /* * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 @@ -5135,93 +5257,93 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); * done in the decoding phase. */ static const nfsd4_enc nfsd4_enc_ops[] = { - [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, - [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, - [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, - [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, - [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, - [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, - [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, - [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, - [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, - [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, - [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, - [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, - [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_READ] = (nfsd4_enc)nfsd4_encode_read, - [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, - [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, - [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, - [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, - [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, - [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, - [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, - [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ACCESS] = nfsd4_encode_access, + [OP_CLOSE] = nfsd4_encode_close, + [OP_COMMIT] = nfsd4_encode_commit, + [OP_CREATE] = nfsd4_encode_create, + [OP_DELEGPURGE] = nfsd4_encode_noop, + [OP_DELEGRETURN] = nfsd4_encode_noop, + [OP_GETATTR] = nfsd4_encode_getattr, + [OP_GETFH] = nfsd4_encode_getfh, + [OP_LINK] = nfsd4_encode_link, + [OP_LOCK] = nfsd4_encode_lock, + [OP_LOCKT] = nfsd4_encode_lockt, + [OP_LOCKU] = nfsd4_encode_locku, + [OP_LOOKUP] = nfsd4_encode_noop, + [OP_LOOKUPP] = nfsd4_encode_noop, + [OP_NVERIFY] = nfsd4_encode_noop, + [OP_OPEN] = nfsd4_encode_open, + [OP_OPENATTR] = nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_encode_open_downgrade, + [OP_PUTFH] = nfsd4_encode_noop, + [OP_PUTPUBFH] = nfsd4_encode_noop, + [OP_PUTROOTFH] = nfsd4_encode_noop, + [OP_READ] = nfsd4_encode_read, + [OP_READDIR] = nfsd4_encode_readdir, + [OP_READLINK] = nfsd4_encode_readlink, + [OP_REMOVE] = nfsd4_encode_remove, + [OP_RENAME] = nfsd4_encode_rename, + [OP_RENEW] = nfsd4_encode_noop, + [OP_RESTOREFH] = nfsd4_encode_noop, + [OP_SAVEFH] = 
nfsd4_encode_noop, + [OP_SECINFO] = nfsd4_encode_secinfo, + [OP_SETATTR] = nfsd4_encode_setattr, + [OP_SETCLIENTID] = nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_encode_noop, + [OP_VERIFY] = nfsd4_encode_noop, + [OP_WRITE] = nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_encode_noop, /* NFSv4.1 operations */ - [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BACKCHANNEL_CTL] = nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = nfsd4_encode_noop, + [OP_FREE_STATEID] = nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_encode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = nfsd4_encode_noop, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_noop, + [OP_LAYOUTGET] = nfsd4_encode_noop, + [OP_LAYOUTRETURN] = nfsd4_encode_noop, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, - [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, - [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_encode_sequence, + [OP_SET_SSV] = nfsd4_encode_noop, + [OP_TEST_STATEID] = nfsd4_encode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = nfsd4_encode_noop, /* NFSv4.2 operations */ - [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, - [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify, - [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, - [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus, - [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, - [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, - [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ALLOCATE] = nfsd4_encode_noop, + 
[OP_COPY] = nfsd4_encode_copy, + [OP_COPY_NOTIFY] = nfsd4_encode_copy_notify, + [OP_DEALLOCATE] = nfsd4_encode_noop, + [OP_IO_ADVISE] = nfsd4_encode_noop, + [OP_LAYOUTERROR] = nfsd4_encode_noop, + [OP_LAYOUTSTATS] = nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = nfsd4_encode_offload_status, + [OP_READ_PLUS] = nfsd4_encode_read_plus, + [OP_SEEK] = nfsd4_encode_seek, + [OP_WRITE_SAME] = nfsd4_encode_noop, + [OP_CLONE] = nfsd4_encode_noop, /* RFC 8276 extended atributes operations */ - [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr, - [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr, - [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs, - [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr, + [OP_GETXATTR] = nfsd4_encode_getxattr, + [OP_SETXATTR] = nfsd4_encode_setxattr, + [OP_LISTXATTRS] = nfsd4_encode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_encode_removexattr, }; /* -- cgit From 7fd461c47c6cfab4ca4d003790ec276209e52978 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 30 Nov 2022 13:15:27 -0500 Subject: NFSv4.2: Change the default KConfig value for READ_PLUS Now that we've worked out performance issues and have a server patch addressing the failed xfstests, we can safely enable this feature by default. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 14a72224b657..1ead5bd740c2 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default n + default y help - This is intended for developers only. The READ_PLUS operation has - been shown to have issues under specific conditions and should not - be used in production. + Choose Y here to enable the use of READ_PLUS over NFS v4.2. READ_PLUS + attempts to improve read performance by compressing out sparse holes + in the file contents. -- cgit From e0c49bd2b4d3cd1751491eb2d940bce968ac65e9 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Fri, 9 Dec 2022 18:04:48 +0800 Subject: fs: sysv: Fix sysv_nblocks() returns wrong value sysv_nblocks() returns 'blocks' rather than 'res', which only counting the number of triple-indirect blocks and causing sysv_getattr() gets a wrong result. [AV: this is actually a sysv counterpart of minixfs fix - 0fcd426de9d0 "[PATCH] minix block usage counting fix" in historical tree; mea culpa, should've thought to check fs/sysv back then...] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Chen Zhongjin Signed-off-by: Al Viro --- fs/sysv/itree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index d4ec9bb97de9..3b8567564e7e 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -438,7 +438,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) res += blocks; direct = 1; } - return blocks; + return res; } int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, -- cgit From 37ba7b005a7a4454046bd8659c7a9c5330552396 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 29 Oct 2022 00:01:38 +0900 Subject: ksmbd: set SMB2_SESSION_FLAG_ENCRYPT_DATA when enforcing data encryption for this share Currently, SMB2_SESSION_FLAG_ENCRYPT_DATA is always set session setup response. 
Since this forces data encryption from the client, there is a problem that data is always encrypted regardless of the use of the cifs seal mount option. SMB2_SESSION_FLAG_ENCRYPT_DATA should be set according to KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION flags, and in case of KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF, encryption mode is turned off for all connections. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/ksmbd_netlink.h | 1 + fs/ksmbd/smb2ops.c | 10 ++++++++-- fs/ksmbd/smb2pdu.c | 8 +++++--- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index ff07c67f4565..b6bd8311e6b4 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -74,6 +74,7 @@ struct ksmbd_heartbeat { #define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0) #define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1) #define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2) +#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3) /* * IPC request for ksmbd server startup diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index ab23da2120b9..e401302478c3 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -247,8 +247,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION && - conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) @@ -271,6 +272,11 @@ int init_smb3_11_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d0..56d68ddc409c 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -903,7 +903,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn, return; } - if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)) + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) return; for (i = 0; i < cph_cnt; i++) { @@ -1508,7 +1508,8 @@ static int ntlm_authenticate(struct ksmbd_work *work) return -EINVAL; } sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION) + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; /* * signing is disable if encryption is enable * on this session @@ -1599,7 +1600,8 @@ static int krb5_authenticate(struct ksmbd_work *work) return -EINVAL; } sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION) + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; sess->sign = false; } -- cgit From 7ecbe92696bb7fe32c80b6cf64736a0d157717a9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 08:11:53 -0500 Subject: ksmbd: use F_SETLK when unlocking a file ksmbd seems to 
be trying to use a cmd value of 0 when unlocking a file. That activity requires a type of F_UNLCK with a cmd of F_SETLK. For local POSIX locking, it doesn't matter much since vfs_lock_file ignores @cmd, but filesystems that define their own ->lock operation expect to see it set sanely. Cc: David Howells Signed-off-by: Jeff Layton Reviewed-by: David Howells Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 56d68ddc409c..de8e367095c9 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6753,7 +6753,7 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags) case SMB2_LOCKFLAG_UNLOCK: ksmbd_debug(SMB, "received unlock request\n"); flock->fl_type = F_UNLCK; - cmd = 0; + cmd = F_SETLK; break; } @@ -7131,7 +7131,7 @@ out: rlock->fl_start = smb_lock->start; rlock->fl_end = smb_lock->end; - rc = vfs_lock_file(filp, 0, rlock, NULL); + rc = vfs_lock_file(filp, F_SETLK, rlock, NULL); if (rc) pr_err("rollback unlock fail : %d\n", rc); -- cgit From 30429388531b120902126a39cf64f877fc5c7773 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 15 Nov 2022 09:35:10 -0600 Subject: ksmbd: replace one-element arrays with flexible-array members One-element arrays are deprecated, and we are replacing them with flexible array members instead. So, replace one-element arrays with flexible-array members in multiple structs in fs/ksmbd/smb_common.h and one in fs/ksmbd/smb2pdu.h. Important to mention is that doing a build before/after this patch results in no binary output differences. This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/242 Link: https://github.com/KSPP/linux/issues/79 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Kees Cook Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smb2pdu.h | 2 +- fs/ksmbd/smb_common.h | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index de8e367095c9..79261493a212 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3440,7 +3440,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + struct_sz = readdir_info_level_struct_sz(info_level) + conv_len; next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); d_info->last_entry_off_align = next_entry_offset - struct_sz; @@ -3692,7 +3692,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return -EOPNOTSUPP; conv_len = (d_info->name_len + 1) * 2; - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + next_entry_offset = ALIGN(struct_sz + conv_len, KSMBD_DIR_INFO_ALIGNMENT); if (next_entry_offset > d_info->out_buf_len) { diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 092fdd3f8750..aa5dbe54f5a1 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -443,7 +443,7 @@ struct smb2_posix_info { /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ u8 SidBuffer[32]; __le32 name_len; - u8 name[1]; + u8 name[]; /* * var sized owner SID * var sized group SID diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 318c16fa81da..e663ab9ea759 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -277,14 +277,14 @@ struct file_directory_info { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0x101 FF resp data */ struct file_names_info { __le32 NextEntryOffset; __u32 FileIndex; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0xc FF resp data */ struct file_full_directory_info { @@ -299,7 +299,7 @@ struct file_full_directory_info { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; - char FileName[1]; + char FileName[]; } __packed; /* level 0x102 FF resp */ struct file_both_directory_info { @@ -317,7 +317,7 @@ struct file_both_directory_info { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __packed; /* level 0x104 FFrsp data */ struct file_id_both_directory_info { @@ -337,7 +337,7 @@ struct file_id_both_directory_info { __u8 ShortName[24]; __le16 Reserved2; __le64 UniqueId; - char FileName[1]; + char FileName[]; } __packed; struct file_id_full_dir_info { @@ -354,7 +354,7 @@ struct file_id_full_dir_info { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __packed; /* level 0x105 FF rsp data */ struct smb_version_values { -- cgit From bc044414fa0326a4e5c3c509c00b1fcaf621b5f4 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 16 Nov 2022 20:22:37 +0800 Subject: ksmbd: Fix resource leak in ksmbd_session_rpc_open() When ksmbd_rpc_open() fails then it must call ksmbd_rpc_id_free() to undo the result of ksmbd_ipc_id_alloc(). 
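The fix follows the kernel's ordered-unwind idiom: each failure point jumps to a label that releases exactly what had been acquired before it, in reverse order. A minimal userspace sketch of the shape (the helper names are illustrative stand-ins, not the ksmbd APIs):

    #include <stdlib.h>

    struct entry { int id; };

    /* trivial stand-ins so the sketch compiles; not the ksmbd helpers */
    static int next_id;
    static int id_alloc(void) { return next_id++; }
    static void id_free(int id) { (void)id; }
    static void *rpc_open(int id) { return id % 2 ? NULL : malloc(1); }

    static struct entry *open_entry(void)
    {
        struct entry *entry;
        void *resp;

        entry = malloc(sizeof(*entry));
        if (!entry)
            return NULL;

        entry->id = id_alloc();
        if (entry->id < 0)
            goto free_entry;        /* only the entry exists so far */

        resp = rpc_open(entry->id);
        if (!resp)
            goto free_id;           /* the id must be released as well */

        free(resp);
        return entry;

    free_id:
        id_free(entry->id);
    free_entry:
        free(entry);
        return NULL;
    }

    int main(void)
    {
        struct entry *e = open_entry();

        free(e);    /* free(NULL) is a no-op */
        return 0;
    }

The point of the label ordering is that a failure after the id allocation must undo both steps, while a failure before it must undo only the first; the original single "error:" label could not distinguish the two.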
Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Signed-off-by: Xiu Jianfeng Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/mgmt/user_session.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c index 3fa2139a0b30..92b1603b5abe 100644 --- a/fs/ksmbd/mgmt/user_session.c +++ b/fs/ksmbd/mgmt/user_session.c @@ -108,15 +108,17 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry->method = method; entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) - goto error; + goto free_entry; resp = ksmbd_rpc_open(sess, entry->id); if (!resp) - goto error; + goto free_id; kvfree(resp); return entry->id; -error: +free_id: + ksmbd_rpc_id_free(entry->id); +free_entry: list_del(&entry->list); kfree(entry); return -EINVAL; -- cgit From 01f6c61bae3d658058ee6322af77acea26a5ee3a Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Tue, 29 Nov 2022 12:19:33 +0100 Subject: ksmbd: Fix resource leak in smb2_lock() "flock" is leaked if an error happens before smb2_lock_init(), as the lock is not added to the lock_list to be cleaned up. Signed-off-by: Marios Makassikis Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 79261493a212..88d2c23369fe 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6857,6 +6857,7 @@ int smb2_lock(struct ksmbd_work *work) if (lock_start > U64_MAX - lock_length) { pr_err("Invalid lock range requested\n"); rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + locks_free_lock(flock); goto out; } @@ -6876,6 +6877,7 @@ int smb2_lock(struct ksmbd_work *work) "the end offset(%llx) is smaller than the start offset(%llx)\n", flock->fl_end, flock->fl_start); rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + locks_free_lock(flock); goto out; } @@ -6887,6 +6889,7 @@ int smb2_lock(struct ksmbd_work *work) flock->fl_type != F_UNLCK) { pr_err("conflict two locks in one request\n"); err = -EINVAL; + locks_free_lock(flock); goto out; } } @@ -6895,6 +6898,7 @@ int smb2_lock(struct ksmbd_work *work) smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list); if (!smb_lock) { err = -EINVAL; + locks_free_lock(flock); goto out; } } -- cgit From 72ee45fd46d0d3578c4e6046f66fae3218543ce3 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Wed, 7 Dec 2022 09:29:27 +0800 Subject: ksmbd: Convert to use sysfs_emit()/sysfs_emit_at() APIs Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. 
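For reference, sysfs_emit() and sysfs_emit_at() wrap scnprintf() with the built-in knowledge that a sysfs show() buffer is exactly PAGE_SIZE, so callers no longer have to thread "PAGE_SIZE - sz" bounds by hand. A hedged kernel-style sketch of the pattern (the attribute and flag names are invented for illustration, and registration of the attribute is omitted):

    #include <linux/kernel.h>
    #include <linux/kobject.h>
    #include <linux/sysfs.h>

    static const char * const flag_names[] = { "smb", "auth", "vfs" };
    static unsigned long flag_bits;

    static ssize_t flags_show(struct kobject *kobj, struct kobj_attribute *attr,
                              char *buf)
    {
        ssize_t sz = 0;
        int i;

        /* enabled flags are bracketed, disabled ones printed bare */
        for (i = 0; i < ARRAY_SIZE(flag_names); i++)
            sz += sysfs_emit_at(buf, sz, (flag_bits >> i) & 1 ?
                                "[%s] " : "%s ", flag_names[i]);
        sz += sysfs_emit_at(buf, sz, "\n");
        return sz;
    }
    static struct kobj_attribute flags_attr = __ATTR_RO(flags);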
Signed-off-by: ye xingchen Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/server.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index a0d635304754..394b6ceac431 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -432,11 +432,9 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr, "reset", "shutdown" }; - - ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version, - state[server_conf.state], server_conf.tcp_port, - server_conf.ipc_last_active / HZ); - return sz; + return sysfs_emit(buf, "%d %s %d %lu\n", stats_version, + state[server_conf.state], server_conf.tcp_port, + server_conf.ipc_last_active / HZ); } static ssize_t kill_server_store(struct class *class, @@ -468,19 +466,13 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr, for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) { if ((ksmbd_debug_types >> i) & 1) { - pos = scnprintf(buf + sz, - PAGE_SIZE - sz, - "[%s] ", - debug_type_strings[i]); + pos = sysfs_emit_at(buf, sz, "[%s] ", debug_type_strings[i]); } else { - pos = scnprintf(buf + sz, - PAGE_SIZE - sz, - "%s ", - debug_type_strings[i]); + pos = sysfs_emit_at(buf, sz, "%s ", debug_type_strings[i]); } sz += pos; } - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + sz += sysfs_emit_at(buf, sz, "\n"); return sz; } -- cgit From ff39899be80b9d90d5e13775eb9fd150338b6e15 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 21 Jul 2022 09:59:32 +0800 Subject: exfat: simplify empty entry hint This commit adds exfat_set_empty_hint()/exfat_reset_empty_hint() to reduce code complexity and make code more readable. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 58 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 0fc08fdcba73..c0e60ff9ec7d 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -897,6 +897,29 @@ free_es: return NULL; } +static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) +{ + hint_femp->eidx = EXFAT_HINT_NONE; + hint_femp->count = 0; +} + +static inline void exfat_set_empty_hint(struct exfat_inode_info *ei, + struct exfat_hint_femp *candi_empty, struct exfat_chain *clu, + int dentry, int num_entries) +{ + if (ei->hint_femp.eidx == EXFAT_HINT_NONE || + ei->hint_femp.eidx > dentry) { + if (candi_empty->count == 0) { + candi_empty->cur = *clu; + candi_empty->eidx = dentry; + } + + candi_empty->count++; + if (candi_empty->count == num_entries) + ei->hint_femp = *candi_empty; + } +} + enum { DIRENT_STEP_FILE, DIRENT_STEP_STRM, @@ -921,7 +944,7 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, { int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; int order, step, name_len = 0; - int dentries_per_clu, num_empty = 0; + int dentries_per_clu; unsigned int entry_type; unsigned short *uniname = NULL; struct exfat_chain clu; @@ -939,10 +962,13 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, end_eidx = dentry; } - candi_empty.eidx = EXFAT_HINT_NONE; + exfat_reset_empty_hint(&ei->hint_femp); + rewind: order = 0; step = DIRENT_STEP_FILE; + exfat_reset_empty_hint(&candi_empty); + while (clu.dir != EXFAT_EOF_CLUSTER) { i = dentry & (dentries_per_clu - 
1); for (; i < dentries_per_clu; i++, dentry++) { @@ -962,26 +988,8 @@ rewind: entry_type == TYPE_DELETED) { step = DIRENT_STEP_FILE; - num_empty++; - if (candi_empty.eidx == EXFAT_HINT_NONE && num_empty == 1) { - exfat_chain_set(&candi_empty.cur, clu.dir, clu.size, clu.flags); - } - - if (candi_empty.eidx == EXFAT_HINT_NONE && num_empty >= num_entries) { - candi_empty.eidx = dentry - (num_empty - 1); - WARN_ON(candi_empty.eidx < 0); - candi_empty.count = num_empty; - - if (ei->hint_femp.eidx == EXFAT_HINT_NONE || candi_empty.eidx <= ei->hint_femp.eidx) - ei->hint_femp = candi_empty; - } + exfat_set_empty_hint(ei, &candi_empty, &clu, dentry, num_entries); brelse(bh); if (entry_type == TYPE_UNUSED) @@ -989,8 +997,7 @@ rewind: continue; } - num_empty = 0; - candi_empty.eidx = EXFAT_HINT_NONE; if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) { step = DIRENT_STEP_FILE; @@ -1090,9 +1097,6 @@ not_found: rewind = 1; dentry = 0; clu.dir = p_dir->dir; - /* reset empty hint */ - num_empty = 0; - candi_empty.eidx = EXFAT_HINT_NONE; goto rewind; } -- cgit
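The empty-entry hint that the commit above factors into exfat_set_empty_hint() boils down to tracking a candidate run of free directory slots during a scan and recording the run once it is long enough. A toy userspace sketch of that shape (illustrative names and a toy slot map, not the exfat code):

    #include <stdio.h>

    /* toy directory: '.' = empty slot, 'X' = in-use slot */
    static int find_free_run(const char *slots, int n, int need)
    {
        int run = 0;

        for (int i = 0; i < n; i++) {
            if (slots[i] != '.') {
                run = 0;                /* candidate run broken */
                continue;
            }
            if (++run == need)
                return i - need + 1;    /* first slot of the run */
        }
        return -1;  /* no hint: the caller must extend the directory */
    }

    int main(void)
    {
        /* needs 3 contiguous empty slots; the run starts at index 4 */
        printf("%d\n", find_free_run("XX.X...XX", 9, 3));
        return 0;
    }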
From e298c8a818a3e517582e60c412f4a41b3a1647c5 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 7 Nov 2022 17:22:13 +0900 Subject: exfat: hint the empty entry which at the end of cluster chain After traversing all directory entries, hint the empty directory entries whether or not enough of them were found. After this commit, the empty directory entries are hinted like this:
1. Hint the deleted directory entries if there are enough of them;
2. Hint the deleted and unused directory entries at the end of the cluster chain, whether there are enough of them or not (added by this commit);
3. If there are no empty directory entries at all, hint the empty directory entries in the new cluster (added by this commit).
This avoids repeated traversal of directory entries, reduces CPU usage, and improves the performance of creating files and directories (especially on low-performance CPUs). Test: create 5000 files on a class 4 SD card on imx6q-sabrelite with:

for ((i=0;i<5;i++)); do
    sync
    time (for ((j=1;j<=1000;j++)); do touch file$((i*1000+j)); done)
done

The more files, the bigger the performance improvement.

           Before     After      Improvement
1~1000     25.360s    22.168s    14.40%
1001~2000  38.242s    28.72s     33.15%
2001~3000  49.134s    35.037s    40.23%
3001~4000  62.042s    41.624s    49.05%
4001~5000  73.629s    46.772s    57.42%

Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 26 ++++++++++++++++++++++---- fs/exfat/namei.c | 33 +++++++++++++++++++++------------ 2 files changed, 43 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index c0e60ff9ec7d..30d0ac43b66c 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -905,17 +905,24 @@ static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) static inline void exfat_set_empty_hint(struct exfat_inode_info *ei, struct exfat_hint_femp *candi_empty, struct exfat_chain *clu, - int dentry, int num_entries) + int dentry, int num_entries, int entry_type) { if (ei->hint_femp.eidx == EXFAT_HINT_NONE || ei->hint_femp.eidx > dentry) { + int total_entries = EXFAT_B_TO_DEN(i_size_read(&ei->vfs_inode)); + if (candi_empty->count == 0) { candi_empty->cur = *clu; candi_empty->eidx = dentry; } + if (entry_type == TYPE_UNUSED) + candi_empty->count += total_entries - dentry; + else - candi_empty->count++; - if (candi_empty->count == num_entries) + candi_empty->count++; + + if (candi_empty->count == num_entries || + candi_empty->count + candi_empty->eidx == total_entries) ei->hint_femp = *candi_empty; } } @@ -989,7 +996,8 @@ rewind: step = DIRENT_STEP_FILE; exfat_set_empty_hint(ei, &candi_empty, &clu, - dentry, num_entries); + dentry, num_entries, + entry_type); brelse(bh); if (entry_type == TYPE_UNUSED) @@ -1100,6 +1108,16 @@ not_found: goto rewind; } + /* + * set the EXFAT_EOF_CLUSTER flag to avoid search + * from the beginning again when allocated a new cluster + */ + if (ei->hint_femp.eidx == EXFAT_HINT_NONE) { + ei->hint_femp.cur.dir = EXFAT_EOF_CLUSTER; + ei->hint_femp.eidx = p_dir->size * dentries_per_clu; + ei->hint_femp.count = 0; + } + /* initialized hint_stat */ hint_stat->clu = p_dir->dir; hint_stat->eidx = 0; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index b617bebc3d0f..99e00a36c029 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -224,11 +224,18 @@ static int exfat_search_empty_slot(struct super_block *sb, if (hint_femp->eidx != EXFAT_HINT_NONE) { dentry = hint_femp->eidx; - if (num_entries <= hint_femp->count) { - hint_femp->eidx = EXFAT_HINT_NONE; - return dentry; - } + /* + * If hint_femp->count is enough, it is needed to check if + * there are actual empty entries. + * Otherwise, and if "dentry + hint_femp->count" is also equal + * to "p_dir->size * dentries_per_clu", it means ENOSPC.
+ */ + if (dentry + hint_femp->count == p_dir->size * dentries_per_clu && + num_entries > hint_femp->count) + return -ENOSPC; + + hint_femp->eidx = EXFAT_HINT_NONE; exfat_chain_dup(&clu, &hint_femp->cur); } else { exfat_chain_dup(&clu, p_dir); @@ -293,6 +300,12 @@ static int exfat_search_empty_slot(struct super_block *sb, } } + hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty; + hint_femp->count = num_empty; + if (num_empty == 0) + exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0, + clu.flags); + return -ENOSPC; } @@ -369,15 +382,11 @@ static int exfat_find_empty_entry(struct inode *inode, if (exfat_ent_set(sb, last_clu, clu.dir)) return -EIO; - if (hint_femp.eidx == EXFAT_HINT_NONE) { - /* the special case that new dentry - * should be allocated from the start of new cluster - */ - hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi); - hint_femp.count = sbi->dentries_per_clu; - + if (hint_femp.cur.dir == EXFAT_EOF_CLUSTER) exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags); - } + + hint_femp.count += sbi->dentries_per_clu; + hint_femp.cur.size++; p_dir->size++; size = EXFAT_CLU_TO_B(p_dir->size, sbi); -- cgit From f83d8a3b532097276266b5e81073ea46e27b17ab Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 7 Apr 2022 15:55:56 +0800 Subject: exfat: reduce the size of exfat_entry_set_cache Normally, there are at most 19 directory entries for a file or a directory:
- a file directory entry
- a stream extension directory entry
- 1~17 file name directory entries
So the directory entries are in 3 sectors at most, and it is enough for struct exfat_entry_set_cache to pre-allocate 3 bh. This commit changes the size of struct exfat_entry_set_cache as follows:

               Before     After
32-bit system  88 bytes   32 bytes
64-bit system  168 bytes  48 bytes

Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index a8f8eee4937c..af55018ff22e 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -9,6 +9,7 @@ #include #include #include +#include #define EXFAT_ROOT_INO 1 @@ -41,6 +42,14 @@ enum { #define ES_2_ENTRIES 2 #define ES_ALL_ENTRIES 0 +#define ES_IDX_FILE 0 +#define ES_IDX_STREAM 1 +#define ES_IDX_FIRST_FILENAME 2 +#define EXFAT_FILENAME_ENTRY_NUM(name_len) \ + DIV_ROUND_UP(name_len, EXFAT_FILE_NAME_LEN) +#define ES_IDX_LAST_FILENAME(name_len) \ + (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1) + #define DIR_DELETED 0xFFFF0321 /* type values */ @@ -68,9 +77,6 @@ enum { #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ #define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE) -/* Enough size to hold 256 dentry (even 512 Byte sector) */ -#define DIR_CACHE_SIZE (256*sizeof(struct exfat_dentry)/512+1) - #define EXFAT_HINT_NONE -1 #define EXFAT_MIN_SUBDIR 2 @@ -125,6 +131,17 @@ enum { #define BITS_PER_BYTE_MASK 0x7 #define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1) +#define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1) +/* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */ +#define ES_MAX_ENTRY_NUM ES_ENTRY_NUM(MAX_NAME_LENGTH) + +/* + * 19 entries x 32 bytes/entry = 608 bytes. + * The 608 bytes are in 3 sectors at most (even 512 Byte sector). + */ +#define DIR_CACHE_SIZE \ + (DIV_ROUND_UP(EXFAT_DEN_TO_B(ES_MAX_ENTRY_NUM), SECTOR_SIZE) + 1) + struct exfat_dentry_namebuf { char *lfn; int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */ @@ -166,11 +183,11 @@ struct exfat_hint { struct exfat_entry_set_cache { struct super_block *sb; - bool modified; unsigned int start_off; int num_bh; struct buffer_head *bh[DIR_CACHE_SIZE]; unsigned int num_entries; + bool modified; }; struct exfat_dir_entry { -- cgit
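As a sanity check on the arithmetic behind ES_MAX_ENTRY_NUM and DIR_CACHE_SIZE above, a small standalone sketch; the constants are restated here for illustration, assuming EXFAT_FILE_NAME_LEN is 15 UTF-16 characters per file name entry and DENTRY_SIZE is 32 bytes:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)   (((n) + (d) - 1) / (d))

    #define EXFAT_FILE_NAME_LEN  15   /* UTF-16 chars per file name entry */
    #define MAX_NAME_LENGTH      255
    #define DENTRY_SIZE          32   /* bytes per directory entry */
    #define SECTOR_SIZE          512  /* worst case: smallest sector size */

    int main(void)
    {
        int name_entries = DIV_ROUND_UP(MAX_NAME_LENGTH, EXFAT_FILE_NAME_LEN);
        int max_entries = 2 + name_entries;   /* file + stream + name entries */
        int bytes = max_entries * DENTRY_SIZE;
        int bh = DIV_ROUND_UP(bytes, SECTOR_SIZE) + 1;  /* +1: set may straddle */

        /* prints: 17 name entries, 19 total, 608 bytes, 3 bh */
        printf("%d name entries, %d total, %d bytes, %d bh\n",
               name_entries, max_entries, bytes, bh);
        return 0;
    }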
From a3ff29a95fde16906304455aa8c0bd84eb770258 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Wed, 9 Nov 2022 13:50:22 +0800 Subject: exfat: support dynamic allocate bh for exfat_entry_set_cache In special cases, a file or a directory may occupy more than 19 directory entries, so pre-allocating 3 bh is not enough. For example:
- a vendor secondary directory entry may be supported in the future;
- if the file directory entry is damaged, the SecondaryCount field may be bigger than 18.
So this commit supports dynamic allocation of bh. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 15 +++++++++++++++ fs/exfat/exfat_fs.h | 5 ++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 30d0ac43b66c..03e9c9e3966e 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -615,6 +615,10 @@ int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync) bforget(es->bh[i]); else brelse(es->bh[i]); + + if (IS_DYNAMIC_ES(es)) + kfree(es->bh); + kfree(es); return err; } @@ -847,6 +851,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, /* byte offset in sector */ off = EXFAT_BLK_OFFSET(byte_offset, sb); es->start_off = off; + es->bh = es->__bh; /* sector offset in cluster */ sec = EXFAT_B_TO_BLK(byte_offset, sb); @@ -866,6 +871,16 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, es->num_entries = num_entries; num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb); + if (num_bh > ARRAY_SIZE(es->__bh)) { + es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL); + if (!es->bh) { + brelse(bh); + kfree(es); + return NULL; + } + es->bh[0] = bh; + } + for (i = 1; i < num_bh; i++) { /* get the next sector */ if (exfat_is_last_sector_in_cluster(sbi, sec)) { diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index af55018ff22e..82395ae80dba 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -185,11 +185,14 @@ struct exfat_entry_set_cache { struct super_block *sb; unsigned int start_off; int num_bh; - struct buffer_head *bh[DIR_CACHE_SIZE]; + struct buffer_head *__bh[DIR_CACHE_SIZE]; + struct buffer_head **bh; unsigned int num_entries; bool modified; }; +#define IS_DYNAMIC_ES(es) ((es)->__bh != (es)->bh) + struct exfat_dir_entry { struct exfat_chain dir; int entry; -- cgit From 20914ff6dd56dd6b548bf5dd90bff09ef89999e4 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Nov 2022 11:37:13 +0800 Subject: exfat: move exfat_entry_set_cache from heap to stack The size of struct exfat_entry_set_cache is only 56 bytes on a 64-bit system, and allocating it on the stack is more efficient than allocating it from the heap.
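The conversion below swaps a heap-allocated, pointer-returning API for one that fills caller-provided storage. A hedged sketch of the general shape (illustrative names, not the exact exfat signatures):

    #include <string.h>

    struct cache { int num; void *slots[3]; };

    /* after the conversion: the caller owns the storage */
    static int cache_get(struct cache *c)
    {
        memset(c, 0, sizeof(*c));
        /* ... fill c->slots from disk ... */
        return 0;
    }

    static void caller(void)
    {
        struct cache c;    /* small struct: cheap to keep on the stack */

        if (cache_get(&c))
            return;
        /* ... use c; no kfree()/free() needed on any path ... */
    }

    int main(void)
    {
        caller();
        return 0;
    }

Beyond skipping an allocation per call, this shape removes an error path entirely: there is no out-of-memory case for the object itself, only for whatever it references.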
Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 35 +++++++++++++++-------------------- fs/exfat/exfat_fs.h | 5 +++-- fs/exfat/inode.c | 13 ++++++------- fs/exfat/namei.c | 11 +++++------ 4 files changed, 29 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 03e9c9e3966e..a3fb609dd129 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -33,10 +33,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned short *uniname) { int i; - struct exfat_entry_set_cache *es; + struct exfat_entry_set_cache es; - es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES)) return; /* @@ -45,8 +44,8 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, * Third entry : first file-name entry * So, the index of first file-name dentry should start from 2. */ - for (i = 2; i < es->num_entries; i++) { - struct exfat_dentry *ep = exfat_get_dentry_cached(es, i); + for (i = 2; i < es.num_entries; i++) { + struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i); /* end of name entry */ if (exfat_get_entry_type(ep) != TYPE_EXTEND) @@ -56,7 +55,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, uniname += EXFAT_FILE_NAME_LEN; } - exfat_free_dentry_set(es, false); + exfat_free_dentry_set(&es, false); } /* read a directory entry from the opened directory */ @@ -619,7 +618,6 @@ int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync) if (IS_DYNAMIC_ES(es)) kfree(es->bh); - kfree(es); return err; } @@ -816,14 +814,14 @@ struct exfat_dentry *exfat_get_dentry_cached( * pointer of entry set on success, * NULL on failure. 
*/ -struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, - struct exfat_chain *p_dir, int entry, unsigned int type) +int exfat_get_dentry_set(struct exfat_entry_set_cache *es, + struct super_block *sb, struct exfat_chain *p_dir, int entry, + unsigned int type) { int ret, i, num_bh; unsigned int off, byte_offset, clu = 0; sector_t sec; struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct exfat_entry_set_cache *es; struct exfat_dentry *ep; int num_entries; enum exfat_validate_dentry_mode mode = ES_MODE_STARTED; @@ -831,17 +829,15 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, if (p_dir->dir == DIR_DELETED) { exfat_err(sb, "access to deleted dentry"); - return NULL; + return -EIO; } byte_offset = EXFAT_DEN_TO_B(entry); ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu); if (ret) - return NULL; + return ret; - es = kzalloc(sizeof(*es), GFP_KERNEL); - if (!es) - return NULL; + memset(es, 0, sizeof(*es)); es->sb = sb; es->modified = false; @@ -859,7 +855,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, bh = sb_bread(sb, sec); if (!bh) - goto free_es; + return -EIO; es->bh[es->num_bh++] = bh; ep = exfat_get_dentry_cached(es, 0); @@ -875,8 +871,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL); if (!es->bh) { brelse(bh); - kfree(es); - return NULL; + return -ENOMEM; } es->bh[0] = bh; } @@ -905,11 +900,11 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) goto free_es; } - return es; + return 0; free_es: exfat_free_dentry_set(es, false); - return NULL; + return -EIO; } static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 82395ae80dba..2ffe5792b1a9 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -490,8 +490,9 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh); struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int num); -struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, - struct exfat_chain *p_dir, int entry, unsigned int type); +int exfat_get_dentry_set(struct exfat_entry_set_cache *es, + struct super_block *sb, struct exfat_chain *p_dir, int entry, + unsigned int type); int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 5590a1e83126..cdcf037a304f 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -21,7 +21,7 @@ int __exfat_write_inode(struct inode *inode, int sync) { unsigned long long on_disk_size; struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es = NULL; + struct exfat_entry_set_cache es; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); @@ -42,11 +42,10 @@ int __exfat_write_inode(struct inode *inode, int sync) exfat_set_volume_dirty(sb); /* get the directory entry of given file or directory */ - es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(es, 0); - ep2 = exfat_get_dentry_cached(es, 1); + ep = 
exfat_get_dentry_cached(&es, 0); + ep2 = exfat_get_dentry_cached(&es, 1); ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); @@ -83,8 +82,8 @@ int __exfat_write_inode(struct inode *inode, int sync) ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; } - exfat_update_dir_chksum_with_entry_set(es); - return exfat_free_dentry_set(es, sync); + exfat_update_dir_chksum_with_entry_set(&es); + return exfat_free_dentry_set(&es, sync); } int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 99e00a36c029..8d72527dfb78 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -604,7 +604,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(dir); struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es; + struct exfat_entry_set_cache es; /* for optimized dir & entry to prevent long traverse of cluster chain */ struct exfat_hint hint_opt; @@ -644,11 +644,10 @@ static int exfat_find(struct inode *dir, struct qstr *qname, if (cdir.flags & ALLOC_NO_FAT_CHAIN) cdir.size -= dentry / sbi->dentries_per_clu; dentry = hint_opt.eidx; - es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(es, 0); - ep2 = exfat_get_dentry_cached(es, 1); + ep = exfat_get_dentry_cached(&es, 0); + ep2 = exfat_get_dentry_cached(&es, 1); info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); @@ -677,7 +676,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, ep->dentry.file.access_time, ep->dentry.file.access_date, 0); - exfat_free_dentry_set(es, false); + exfat_free_dentry_set(&es, false); if (ei->start_clu == EXFAT_FREE_CLUSTER) { exfat_fs_error(sb, -- cgit From 3b9681acb0ef739343d8cfd35e054aab9597f1dc Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Mar 2022 18:12:40 +0800 Subject: exfat: rename exfat_free_dentry_set() to exfat_put_dentry_set() Since struct exfat_entry_set_cache is allocated from stack, no need to free, so rename exfat_free_dentry_set() to exfat_put_dentry_set(). After renaming, the new function pair is exfat_get_dentry_set()/exfat_put_dentry_set(). Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 16 ++++++++-------- fs/exfat/exfat_fs.h | 2 +- fs/exfat/inode.c | 2 +- fs/exfat/namei.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index a3fb609dd129..a9a0b3e46af2 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -55,7 +55,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, uniname += EXFAT_FILE_NAME_LEN; } - exfat_free_dentry_set(&es, false); + exfat_put_dentry_set(&es, false); } /* read a directory entry from the opened directory */ @@ -602,7 +602,7 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es) es->modified = true; } -int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync) +int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync) { int i, err = 0; @@ -860,7 +860,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, ep = exfat_get_dentry_cached(es, 0); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) - goto free_es; + goto put_es; num_entries = type == ES_ALL_ENTRIES ? 
ep->dentry.file.num_ext + 1 : type; @@ -882,7 +882,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, if (p_dir->flags == ALLOC_NO_FAT_CHAIN) clu++; else if (exfat_get_next_cluster(sb, &clu)) - goto free_es; + goto put_es; sec = exfat_cluster_to_sector(sbi, clu); } else { sec++; @@ -890,7 +890,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, bh = sb_bread(sb, sec); if (!bh) - goto free_es; + goto put_es; es->bh[es->num_bh++] = bh; } @@ -898,12 +898,12 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, for (i = 1; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) - goto free_es; + goto put_es; } return 0; -free_es: - exfat_free_dentry_set(es, false); +put_es: + exfat_put_dentry_set(es, false); return -EIO; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 2ffe5792b1a9..324acc57d029 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -493,7 +493,7 @@ struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int exfat_get_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int type); -int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync); +int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); /* inode.c */ diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index cdcf037a304f..a84eae72556d 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -83,7 +83,7 @@ int __exfat_write_inode(struct inode *inode, int sync) } exfat_update_dir_chksum_with_entry_set(&es); - return exfat_free_dentry_set(&es, sync); + return exfat_put_dentry_set(&es, sync); } int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 8d72527dfb78..57510d7f58cf 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -676,7 +676,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, ep->dentry.file.access_time, ep->dentry.file.access_date, 0); - exfat_free_dentry_set(&es, false); + exfat_put_dentry_set(&es, false); if (ei->start_clu == EXFAT_FREE_CLUSTER) { exfat_fs_error(sb, -- cgit From f3fe3954c09f97d8227d9d2edc807796a8b228ab Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Mar 2022 19:39:20 +0800 Subject: exfat: replace magic numbers with Macros Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 12 ++++++------ fs/exfat/inode.c | 4 ++-- fs/exfat/namei.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index a9a0b3e46af2..c05493fc9124 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -44,7 +44,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, * Third entry : first file-name entry * So, the index of first file-name dentry should start from 2. 
*/ - for (i = 2; i < es.num_entries; i++) { + for (i = ES_IDX_FIRST_FILENAME; i < es.num_entries; i++) { struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i); /* end of name entry */ @@ -336,7 +336,7 @@ int exfat_calc_num_entries(struct exfat_uni_name *p_uniname) return -EINVAL; /* 1 file entry + 1 stream entry + name entries */ - return ((len - 1) / EXFAT_FILE_NAME_LEN + 3); + return ES_ENTRY_NUM(len); } unsigned int exfat_get_entry_type(struct exfat_dentry *ep) @@ -591,13 +591,13 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es) unsigned short chksum = 0; struct exfat_dentry *ep; - for (i = 0; i < es->num_entries; i++) { + for (i = ES_IDX_FILE; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum, chksum_type); chksum_type = CS_DEFAULT; } - ep = exfat_get_dentry_cached(es, 0); + ep = exfat_get_dentry_cached(es, ES_IDX_FILE); ep->dentry.file.checksum = cpu_to_le16(chksum); es->modified = true; } @@ -858,7 +858,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, return -EIO; es->bh[es->num_bh++] = bh; - ep = exfat_get_dentry_cached(es, 0); + ep = exfat_get_dentry_cached(es, ES_IDX_FILE); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) goto put_es; @@ -895,7 +895,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, } /* validate cached dentries */ - for (i = 1; i < num_entries; i++) { + for (i = ES_IDX_STREAM; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) goto put_es; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a84eae72556d..dac5001bae9e 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -44,8 +44,8 @@ int __exfat_write_inode(struct inode *inode, int sync) /* get the directory entry of given file or directory */ if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(&es, 0); - ep2 = exfat_get_dentry_cached(&es, 1); + ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); + ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM); ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 57510d7f58cf..01e4e8c60bbe 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -646,8 +646,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname, dentry = hint_opt.eidx; if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(&es, 0); - ep2 = exfat_get_dentry_cached(&es, 1); + ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); + ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM); info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); -- cgit From 088f1343d9108c16fca064951d85e6de9f5cab42 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Tue, 16 Aug 2022 16:55:06 +0800 Subject: exfat: remove call ilog2() from exfat_readdir() There is no need to call ilog2() for the conversions between cluster and dentry in exfat_readdir(), because these conversions can be replaced with EXFAT_DEN_TO_CLU()/EXFAT_CLU_TO_DEN(). Code refinement, no functional changes. 
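As a sanity check on this conversion: the shift-based helpers agree with plain division whenever the per-cluster dentry count is a power of two, which exFAT guarantees because both the cluster size and DENTRY_SIZE (32 bytes) are powers of two. A small user-space sketch with assumed values (4 KiB clusters); the helper names mirror, but are not, the kernel macros:

#include <assert.h>

#define DENTRY_SIZE_BITS 5	/* 32-byte directory entries */

static unsigned int clu_to_den(unsigned int clu, unsigned int cluster_size_bits)
{
	return clu << (cluster_size_bits - DENTRY_SIZE_BITS);
}

static unsigned int den_to_clu(unsigned int den, unsigned int cluster_size_bits)
{
	return den >> (cluster_size_bits - DENTRY_SIZE_BITS);
}

int main(void)
{
	unsigned int bits = 12;					/* assumed 4 KiB cluster */
	unsigned int per_clu = 1u << (bits - DENTRY_SIZE_BITS);	/* 128 dentries */

	assert(clu_to_den(3, bits) == 3 * per_clu);
	assert(den_to_clu(300, bits) == 300 / per_clu);		/* shift == divide */
	return 0;
}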
Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 9 ++++----- fs/exfat/exfat_fs.h | 10 ++++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index c05493fc9124..397ea2d98848 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -61,7 +61,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, /* read a directory entry from the opened directory */ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { - int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext; + int i, dentries_per_clu, num_ext; unsigned int type, clu_offset, max_dentries; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; @@ -83,11 +83,10 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); dentries_per_clu = sbi->dentries_per_clu; - dentries_per_clu_bits = ilog2(dentries_per_clu); max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES, - (u64)sbi->num_clusters << dentries_per_clu_bits); + (u64)EXFAT_CLU_TO_DEN(sbi->num_clusters, sbi)); - clu_offset = dentry >> dentries_per_clu_bits; + clu_offset = EXFAT_DEN_TO_CLU(dentry, sbi); exfat_chain_dup(&clu, &dir); if (clu.flags == ALLOC_NO_FAT_CHAIN) { @@ -162,7 +161,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent dir_entry->entry = dentry; brelse(bh); - ei->hint_bmap.off = dentry >> dentries_per_clu_bits; + ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi); ei->hint_bmap.clu = clu.dir; *cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 324acc57d029..37e8af8042aa 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -101,11 +101,17 @@ enum { /* * helpers for block size to dentry size conversion. */ -#define EXFAT_B_TO_DEN_IDX(b, sbi) \ - ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) #define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS) #define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS) +/* + * helpers for cluster size to dentry size conversion. + */ +#define EXFAT_CLU_TO_DEN(clu, sbi) \ + ((clu) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) +#define EXFAT_DEN_TO_CLU(dentry, sbi) \ + ((dentry) >> ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) + /* * helpers for fat entry. */ -- cgit From 015c0d4f6b1e65857de88279f07d7ecc5e305137 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 15 Aug 2022 10:15:16 +0800 Subject: exfat: remove unneeded codes from __exfat_rename() The code gets the dentry, but the dentry is not used, remove the code. Code refinement, no functional changes. 
Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/namei.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 01e4e8c60bbe..347c8df45bd0 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -1175,7 +1175,7 @@ static int __exfat_rename(struct inode *old_parent_inode, struct exfat_inode_info *new_ei = NULL; unsigned int new_entry_type = TYPE_UNUSED; int new_entry = 0; - struct buffer_head *old_bh, *new_bh = NULL; + struct buffer_head *new_bh = NULL; /* check the validity of pointer parameters */ if (new_path == NULL || strlen(new_path) == 0) @@ -1191,13 +1191,6 @@ static int __exfat_rename(struct inode *old_parent_inode, EXFAT_I(old_parent_inode)->flags); dentry = ei->entry; - ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); - if (!ep) { - ret = -EIO; - goto out; - } - brelse(old_bh); - /* check whether new dir is existing directory and empty */ if (new_inode) { ret = -EIO; -- cgit From 72880cb5f157514d797d5f6ab3184bbde671a18a Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Sun, 10 Apr 2022 16:12:14 +0800 Subject: exfat: remove unnecessary arguments from exfat_find_dir_entry() This commit removes argument 'num_entries' and 'type' from exfat_find_dir_entry(). Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 12 +++++++----- fs/exfat/exfat_fs.h | 3 +-- fs/exfat/namei.c | 10 ++-------- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 397ea2d98848..8121a7e073bc 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -956,7 +956,7 @@ enum { */ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, - int num_entries, unsigned int type, struct exfat_hint *hint_opt) + struct exfat_hint *hint_opt) { int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; int order, step, name_len = 0; @@ -967,6 +967,10 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_hint *hint_stat = &ei->hint_stat; struct exfat_hint_femp candi_empty; struct exfat_sb_info *sbi = EXFAT_SB(sb); + int num_entries = exfat_calc_num_entries(p_uniname); + + if (num_entries < 0) + return num_entries; dentries_per_clu = sbi->dentries_per_clu; @@ -1020,10 +1024,8 @@ rewind: step = DIRENT_STEP_FILE; hint_opt->clu = clu.dir; hint_opt->eidx = i; - if (type == TYPE_ALL || type == entry_type) { - num_ext = ep->dentry.file.num_ext; - step = DIRENT_STEP_STRM; - } + num_ext = ep->dentry.file.num_ext; + step = DIRENT_STEP_STRM; brelse(bh); continue; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 37e8af8042aa..21fec01d68ff 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -71,7 +71,6 @@ enum { #define TYPE_PADDING 0x0402 #define TYPE_ACLTAB 0x0403 #define TYPE_BENIGN_SEC 0x0800 -#define TYPE_ALL 0x0FFF #define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */ #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ @@ -490,7 +489,7 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es); int exfat_calc_num_entries(struct exfat_uni_name *p_uniname); int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct 
exfat_uni_name *p_uniname, - int num_entries, unsigned int type, struct exfat_hint *hint_opt); + struct exfat_hint *hint_opt); int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 347c8df45bd0..5f995eba5dbb 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -597,7 +597,7 @@ unlock: static int exfat_find(struct inode *dir, struct qstr *qname, struct exfat_dir_entry *info) { - int ret, dentry, num_entries, count; + int ret, dentry, count; struct exfat_chain cdir; struct exfat_uni_name uni_name; struct super_block *sb = dir->i_sb; @@ -616,10 +616,6 @@ static int exfat_find(struct inode *dir, struct qstr *qname, if (ret) return ret; - num_entries = exfat_calc_num_entries(&uni_name); - if (num_entries < 0) - return num_entries; - /* check the validation of hint_stat and initialize it if required */ if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) { ei->hint_stat.clu = cdir.dir; @@ -629,9 +625,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, } /* search the file name for directories */ - dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, - num_entries, TYPE_ALL, &hint_opt); - + dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, &hint_opt); if (dentry < 0) return dentry; /* -error value */ -- cgit From e981917b3fae689e9372647a38746444205bb905 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Nov 2022 10:36:21 +0800 Subject: exfat: remove argument 'size' from exfat_truncate() argument 'size' is not used in exfat_truncate(), remove it. Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 +- fs/exfat/file.c | 4 ++-- fs/exfat/inode.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 21fec01d68ff..ae048802f9db 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -449,7 +449,7 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); /* file.c */ extern const struct file_operations exfat_file_operations; int __exfat_truncate(struct inode *inode, loff_t new_size); -void exfat_truncate(struct inode *inode, loff_t size); +void exfat_truncate(struct inode *inode); int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr); int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 4e0793f35e8f..7c97c1df1305 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -189,7 +189,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) return 0; } -void exfat_truncate(struct inode *inode, loff_t size) +void exfat_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -310,7 +310,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * __exfat_write_inode() is called from exfat_truncate(), inode * is already written by it, so mark_inode_dirty() is unneeded. 
*/ - exfat_truncate(inode, attr->ia_size); + exfat_truncate(inode); up_write(&EXFAT_I(inode)->truncate_lock); } else mark_inode_dirty(inode); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index dac5001bae9e..0d147f8a1f7c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -362,7 +362,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); inode->i_mtime = inode->i_ctime = current_time(inode); - exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned); + exfat_truncate(inode); } } -- cgit From f7cde96710a4362dca199458d3de04f631178453 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 28 Mar 2022 16:37:58 +0800 Subject: exfat: remove i_size_write() from __exfat_truncate() The file/directory size is updated into inode by i_size_write() before __exfat_truncate() is called, so it is redundant to re-update by i_size_write() in __exfat_truncate(). Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 +- fs/exfat/file.c | 8 +++----- fs/exfat/inode.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index ae048802f9db..a1e7feb22079 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -448,7 +448,7 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); /* file.c */ extern const struct file_operations exfat_file_operations; -int __exfat_truncate(struct inode *inode, loff_t new_size); +int __exfat_truncate(struct inode *inode); void exfat_truncate(struct inode *inode); int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr); diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 7c97c1df1305..f5b29072775d 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -93,7 +93,7 @@ static int exfat_sanitize_mode(const struct exfat_sb_info *sbi, } /* resize the file length */ -int __exfat_truncate(struct inode *inode, loff_t new_size) +int __exfat_truncate(struct inode *inode) { unsigned int num_clusters_new, num_clusters_phys; unsigned int last_clu = EXFAT_FREE_CLUSTER; @@ -113,7 +113,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); - if (new_size > 0) { + if (i_size_read(inode) > 0) { /* * Truncate FAT chain num_clusters after the first cluster * num_clusters = min(new, phys); @@ -143,8 +143,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) ei->start_clu = EXFAT_EOF_CLUSTER; } - i_size_write(inode, new_size); - if (ei->type == TYPE_FILE) ei->attr |= ATTR_ARCHIVE; @@ -207,7 +205,7 @@ void exfat_truncate(struct inode *inode) goto write_size; } - err = __exfat_truncate(inode, i_size_read(inode)); + err = __exfat_truncate(inode); if (err) goto write_size; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 0d147f8a1f7c..95adc4b2e436 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -626,7 +626,7 @@ void exfat_evict_inode(struct inode *inode) if (!inode->i_nlink) { i_size_write(inode, 0); mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); - __exfat_truncate(inode, 0); + __exfat_truncate(inode); mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); } -- cgit From 3720dd6dcac38d03424d6ba38107f39af5318bcf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:22 -0700 Subject: filemap: convert 
replace_page_cache_page() to replace_page_cache_folio() Patch series "Removing the lru_cache_add() wrapper". This patchset replaces all calls of lru_cache_add() with the folio equivalent: folio_add_lru(). This is allows us to get rid of the wrapper The series passes xfstests and the userfaultfd selftests. This patch (of 5): Eliminates 7 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221101175326.13265-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..26817a2db463 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -837,7 +837,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - replace_page_cache_page(oldpage, newpage); + replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); get_page(newpage); -- cgit From 063aaad792eef49a11d7575dc9914b43c0fa3792 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:23 -0700 Subject: fuse: convert fuse_try_move_page() to use folios Converts the function to try to move folios instead of pages. Also converts fuse_check_page() to fuse_get_folio() since this is its only caller. This change removes 15 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Miklos Szeredi Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 55 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 26817a2db463..204c332cd343 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -764,11 +764,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) return ncpy; } -static int fuse_check_page(struct page *page) +static int fuse_check_folio(struct folio *folio) { - if (page_mapcount(page) || - page->mapping != NULL || - (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + if (folio_mapped(folio) || + folio->mapping != NULL || + (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_uptodate | @@ -778,7 +778,7 @@ static int fuse_check_page(struct page *page) 1 << PG_reclaim | 1 << PG_waiters | LRU_GEN_MASK | LRU_REFS_MASK))) { - dump_page(page, "fuse: trying to steal weird page"); + dump_page(&folio->page, "fuse: trying to steal weird page"); return 1; } return 0; @@ -787,11 +787,11 @@ static int fuse_check_page(struct page *page) static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; - struct page *oldpage = *pagep; - struct page *newpage; + struct folio *oldfolio = page_folio(*pagep); + struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; - get_page(oldpage); + folio_get(oldfolio); err = unlock_request(cs->req); if (err) goto out_put_old; @@ -814,35 +814,36 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (!pipe_buf_try_steal(cs->pipe, buf)) goto out_fallback; - newpage = buf->page; + newfolio = page_folio(buf->page); - if (!PageUptodate(newpage)) - SetPageUptodate(newpage); + if (!folio_test_uptodate(newfolio)) + 
folio_mark_uptodate(newfolio); - ClearPageMappedToDisk(newpage); + folio_clear_mappedtodisk(newfolio); - if (fuse_check_page(newpage) != 0) + if (fuse_check_folio(newfolio) != 0) goto out_fallback_unlock; /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it */ - if (WARN_ON(page_mapped(oldpage))) + if (WARN_ON(folio_mapped(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(page_has_private(oldpage))) + if (WARN_ON(folio_has_private(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + if (WARN_ON(folio_test_dirty(oldfolio) || + folio_test_writeback(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageMlocked(oldpage))) + if (WARN_ON(folio_test_mlocked(oldfolio))) goto out_fallback_unlock; - replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); + replace_page_cache_folio(oldfolio, newfolio); - get_page(newpage); + folio_get(newfolio); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(newpage); + folio_add_lru(newfolio); /* * Release while we have extra ref on stolen page. Otherwise @@ -855,28 +856,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = newpage; + *pagep = &newfolio->page; spin_unlock(&cs->req->waitq.lock); if (err) { - unlock_page(newpage); - put_page(newpage); + folio_unlock(newfolio); + folio_put(newfolio); goto out_put_old; } - unlock_page(oldpage); + folio_unlock(oldfolio); /* Drop ref for ap->pages[] array */ - put_page(oldpage); + folio_put(oldfolio); cs->len = 0; err = 0; out_put_old: /* Drop ref obtained in this function */ - put_page(oldpage); + folio_put(oldfolio); return err; out_fallback_unlock: - unlock_page(newpage); + folio_unlock(newfolio); out_fallback: cs->pg = buf->page; cs->offset = buf->offset; -- cgit From 169004265860327182ecf92297b25b6271e81e96 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:51 +0000 Subject: fsdax: introduce page->share for fsdax in reflink mode Patch series "fsdax,xfs: fix warning messages", v2. Many testcases failed in dax+reflink mode with warning message in dmesg. Such as generic/051,075,127. 
The warning message is like this: [ 775.509337] ------------[ cut here ]------------ [ 775.509636] WARNING: CPU: 1 PID: 16815 at fs/dax.c:386 dax_insert_entry.cold+0x2e/0x69 [ 775.510151] Modules linked in: auth_rpcgss oid_registry nfsv4 algif_hash af_alg af_packet nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink ip6table_filter ip6_tables iptable_filter ip_tables x_tables dax_pmem nd_pmem nd_btt sch_fq_codel configfs xfs libcrc32c fuse [ 775.524288] CPU: 1 PID: 16815 Comm: fsx Kdump: loaded Tainted: G W 6.1.0-rc4+ #164 eb34e4ee4200c7cbbb47de2b1892c5a3e027fd6d [ 775.524904] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.0-3-3 04/01/2014 [ 775.525460] RIP: 0010:dax_insert_entry.cold+0x2e/0x69 [ 775.525797] Code: c7 c7 18 eb e0 81 48 89 4c 24 20 48 89 54 24 10 e8 73 6d ff ff 48 83 7d 18 00 48 8b 54 24 10 48 8b 4c 24 20 0f 84 e3 e9 b9 ff <0f> 0b e9 dc e9 b9 ff 48 c7 c6 a0 20 c3 81 48 c7 c7 f0 ea e0 81 48 [ 775.526708] RSP: 0000:ffffc90001d57b30 EFLAGS: 00010082 [ 775.527042] RAX: 000000000000002a RBX: 0000000000000000 RCX: 0000000000000042 [ 775.527396] RDX: ffffea000a0f6c80 RSI: ffffffff81dfab1b RDI: 00000000ffffffff [ 775.527819] RBP: ffffea000a0f6c40 R08: 0000000000000000 R09: ffffffff820625e0 [ 775.528241] R10: ffffc90001d579d8 R11: ffffffff820d2628 R12: ffff88815fc98320 [ 775.528598] R13: ffffc90001d57c18 R14: 0000000000000000 R15: 0000000000000001 [ 775.528997] FS: 00007f39fc75d740(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 775.529474] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 775.529800] CR2: 00007f39fc772040 CR3: 0000000107eb6001 CR4: 00000000003706e0 [ 775.530214] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 775.530592] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 775.531002] Call Trace: [ 775.531230] [ 775.531444] dax_fault_iter+0x267/0x6c0 [ 775.531719] dax_iomap_pte_fault+0x198/0x3d0 [ 775.532002] __xfs_filemap_fault+0x24a/0x2d0 [xfs aa8d25411432b306d9554da38096f4ebb86bdfe7] [ 775.532603] __do_fault+0x30/0x1e0 [ 775.532903] do_fault+0x314/0x6c0 [ 775.533166] __handle_mm_fault+0x646/0x1250 [ 775.533480] handle_mm_fault+0xc1/0x230 [ 775.533810] do_user_addr_fault+0x1ac/0x610 [ 775.534110] exc_page_fault+0x63/0x140 [ 775.534389] asm_exc_page_fault+0x22/0x30 [ 775.534678] RIP: 0033:0x7f39fc55820a [ 775.534950] Code: 00 01 00 00 00 74 99 83 f9 c0 0f 87 7b fe ff ff c5 fe 6f 4e 20 48 29 fe 48 83 c7 3f 49 8d 0c 10 48 83 e7 c0 48 01 fe 48 29 f9 a4 c4 c1 7e 7f 00 c4 c1 7e 7f 48 20 c5 f8 77 c3 0f 1f 44 00 00 [ 775.535839] RSP: 002b:00007ffc66a08118 EFLAGS: 00010202 [ 775.536157] RAX: 00007f39fc772001 RBX: 0000000000042001 RCX: 00000000000063c1 [ 775.536537] RDX: 0000000000006400 RSI: 00007f39fac42050 RDI: 00007f39fc772040 [ 775.536919] RBP: 0000000000006400 R08: 00007f39fc772001 R09: 0000000000042000 [ 775.537304] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 [ 775.537694] R13: 00007f39fc772000 R14: 0000000000006401 R15: 0000000000000003 [ 775.538086] [ 775.538333] ---[ end trace 0000000000000000 ]--- This also affects dax+noreflink mode if we run the test after a dax+reflink test. So, the most urgent thing is solving the warning messages. With these fixes, most warning messages in dax_associate_entry() are gone. But honestly, generic/388 still fails randomly with the warning. That test case shuts down the xfs filesystem while fsstress is running, and repeats this many times.
I think the reason is that in-use dax pages cannot be invalidated in time when the filesystem is shut down. The next time a dax page is to be associated, it still holds the mapping value set last time. I'll keep working on it. The warning message in dax_writeback_one() can also be fixed thanks to the dax unshare. This patch (of 8): A fsdax page is used not only for CoW, but also for mapread. To make this easily understood, use 'share' to indicate that the dax page is shared by more than one extent. And add helper functions to use it. Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED. [ruansy.fnst@fujitsu.com: rename several functions] Link: https://lkml.kernel.org/r/1669972991-246-1-git-send-email-ruansy.fnst@fujitsu.com [ruansy.fnst@fujitsu.com: v2.2] Link: https://lkml.kernel.org/r/1670381359-53-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-2-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: Alistair Popple Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 1c6867810cbd..84fadea08705 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) -static inline bool dax_mapping_is_cow(struct address_space *mapping) +static inline bool dax_page_is_shared(struct page *page) { - return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; + return page->mapping == PAGE_MAPPING_DAX_SHARED; } /* - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount. + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the + * refcount. */ -static inline void dax_mapping_set_cow(struct page *page) +static inline void dax_page_share_get(struct page *page) { - if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + if (page->mapping != PAGE_MAPPING_DAX_SHARED) { /* * Reset the index if the page was already mapped * regularly before. */ if (page->mapping) - page->index = 1; - page->mapping = (void *)PAGE_MAPPING_DAX_COW; + page->share = 1; + page->mapping = PAGE_MAPPING_DAX_SHARED; } - page->index++; + page->share++; +} + +static inline unsigned long dax_page_share_put(struct page *page) +{ + return --page->share; } /* - * When it is called in dax_insert_entry(), the cow flag will indicate that + * When it is called in dax_insert_entry(), the shared flag will indicate that * whether this entry is shared by multiple files. If so, set the page->mapping - * FS_DAX_MAPPING_COW, and use page->index as refcount. + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
*/ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address, bool cow) + struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - if (cow) { - dax_mapping_set_cow(page); + if (shared) { + dax_page_share_get(page); } else { WARN_ON_ONCE(page->mapping); page->mapping = mapping; @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - if (dax_mapping_is_cow(page->mapping)) { - /* keep the CoW flag if this page is still shared */ - if (page->index-- > 0) + if (dax_page_is_shared(page)) { + /* keep the shared flag if this page is still shared */ + if (dax_page_share_put(page) > 0) continue; } else WARN_ON_ONCE(page->mapping && page->mapping != mapping); -- cgit From f80e1668888f34c0764822e74953c997daf2ccdb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:52 +0000 Subject: fsdax: invalidate pages when CoW CoW changes the share state of a dax page, but the share count of the page isn't updated. The next time this page is accessed, it should be treated as newly accessed, but the old association still exists. So, we need to clear the share state when CoW happens, in both dax_iomap_rw() and dax_zero_iter(). Link: https://lkml.kernel.org/r/1669908538-55-3-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 84fadea08705..c975d075e77b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1264,6 +1264,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return length; + /* + * invalidate the pages whose sharing state is to be changed + * because of CoW. + */ + if (iomap->flags & IOMAP_F_SHARED) + invalidate_inode_pages2_range(iter->inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + length - 1) >> PAGE_SHIFT); + do { unsigned offset = offset_in_page(pos); unsigned size = min_t(u64, PAGE_SIZE - offset, length); @@ -1324,12 +1333,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; - const struct iomap *srcmap = &iomi->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; + bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; @@ -1356,7 +1366,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap.
*/ - if (iomap->flags & IOMAP_F_NEW) { + if (iomap->flags & IOMAP_F_NEW || cow) { invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1390,8 +1400,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { + if (cow) { ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, kaddr); if (ret) -- cgit From 708dfad2eb4169324189782edd6d3763237e0489 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:53 +0000 Subject: fsdax: zero the edges if source is HOLE or UNWRITTEN If the srcmap contains invalid data, such as HOLE and UNWRITTEN, the dest page should be zeroed. Otherwise, since it's pmem, old data may remain on the dest page, and the result of CoW will be incorrect. The function name is also not easy to understand; rename it to "dax_iomap_copy_around()", which means it copies data around the range. [akpm@linux-foundation.org: update dax_iomap_copy_around() kerneldoc, per Darrick] Link: https://lkml.kernel.org/r/1669973145-318-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-4-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 79 ++++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index c975d075e77b..359b958eb835 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1092,7 +1092,8 @@ out: } /** - * dax_iomap_cow_copy - Copy the data from source to destination before write + * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page + * by copying the data before and after the range to be written. * @pos: address to do copy from. * @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) @@ -1101,35 +1102,50 @@ out: * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX - * write operation, dax_iomap_actor() might call this to do the copy of either + * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. In the latter case the rest of the copy of - * aligned ranges is taken care by dax_iomap_actor() itself. + * aligned ranges is taken care by dax_iomap_iter() itself. + * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the + * area to make sure no old data remains.
*/ -static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, +static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); + /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; + /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ + bool zero_edge = srcmap->flags & IOMAP_F_SHARED || + srcmap->type == IOMAP_UNWRITTEN; void *saddr = 0; int ret = 0; - ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); - if (ret) - return ret; + if (!zero_edge) { + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); + if (ret) + return ret; + } if (copy_all) { - ret = copy_mc_to_kernel(daddr, saddr, length); - return ret ? -EIO : 0; + if (zero_edge) + memset(daddr, 0, size); + else + ret = copy_mc_to_kernel(daddr, saddr, length); + goto out; } /* Copy the head part of the range */ if (head_off) { - ret = copy_mc_to_kernel(daddr, saddr, head_off); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr, 0, head_off); + else { + ret = copy_mc_to_kernel(daddr, saddr, head_off); + if (ret) + return -EIO; + } } /* Copy the tail part of the range */ @@ -1137,12 +1153,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; - ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, - tail_len); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr + tail_off, 0, tail_len); + else { + ret = copy_mc_to_kernel(daddr + tail_off, + saddr + tail_off, tail_len); + if (ret) + return -EIO; + } } - return 0; +out: + if (zero_edge) + dax_flush(srcmap->dax_dev, daddr, size); + return ret ? -EIO : 0; } /* @@ -1241,13 +1264,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) if (ret < 0) return ret; memset(kaddr + offset, 0, size); - if (srcmap->addr != iomap->addr) { - ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, - kaddr); - if (ret < 0) - return ret; - dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE); - } else + if (iomap->flags & IOMAP_F_SHARED) + ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, + kaddr); + else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } @@ -1401,8 +1421,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } if (cow) { - ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, - kaddr); + ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, + srcmap, kaddr); if (ret) break; } @@ -1547,7 +1567,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; - const struct iomap *srcmap = &iter->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? 
PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; @@ -1578,9 +1598,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { - err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr); + if (write && iomap->flags & IOMAP_F_SHARED) { + err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } -- cgit From c6f0b395b2110aa26a134a9a395875b1ec0a5aae Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:54 +0000 Subject: fsdax,xfs: set the shared flag when file extent is shared If a dax page is shared, mapread at different offsets can also trigger page fault on same dax page. So, change the flag from "cow" to "shared". And get the shared flag from filesystem when read. Link: https://lkml.kernel.org/r/1669908538-55-5-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 19 +++++++------------ fs/xfs/xfs_iomap.c | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 359b958eb835..fa547ce41add 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -846,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, (iter->iomap.flags & IOMAP_F_DIRTY); } -static bool dax_fault_is_cow(const struct iomap_iter *iter) -{ - return (iter->flags & IOMAP_WRITE) && - (iter->iomap.flags & IOMAP_F_SHARED); -} - /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to @@ -865,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); - bool dirty = !dax_fault_is_synchronous(iter, vmf->vma); - bool cow = dax_fault_is_cow(iter); + bool write = iter->flags & IOMAP_WRITE; + bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); + bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { + if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) @@ -883,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, xas_reset(xas); xas_lock_irq(xas); - if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, - cow); + shared); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. 
If a normal PTE or @@ -908,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - if (cow) + if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..881de99766ca 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1215,7 +1215,7 @@ xfs_read_iomap_begin( return error; error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); - if (!error && (flags & IOMAP_REPORT)) + if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode))) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); xfs_iunlock(ip, lockmode); -- cgit From 0e79e3736d54bb8efbc9fb29cc3b54a132783565 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:31:41 +0000 Subject: fsdax: dedupe: iter two files at the same time The iomap_iter() on a range of one file may loop more than once. In this case, the inner dst_iter can update its iomap but the outer src_iter can't. This may cause the wrong remapping in filesystem. Let them called at the same time. Link: https://lkml.kernel.org/r/1669908701-93-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index fa547ce41add..8fb928cd9dce 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1965,15 +1965,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret; + int ret, compared = 0; - while ((ret = iomap_iter(&src_iter, ops)) > 0) { - while ((ret = iomap_iter(&dst_iter, ops)) > 0) { - dst_iter.processed = dax_range_compare_iter(&src_iter, - &dst_iter, len, same); - } - if (ret <= 0) - src_iter.processed = ret; + while ((ret = iomap_iter(&src_iter, ops)) > 0 && + (ret = iomap_iter(&dst_iter, ops)) > 0) { + compared = dax_range_compare_iter(&src_iter, &dst_iter, len, + same); + if (compared < 0) + return ret; + src_iter.processed = dst_iter.processed = compared; } return ret; } -- cgit From 64e6edc185da7e101e867c4732c097fedb1da08e Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:10 +0000 Subject: xfs: use dax ops for zero and truncate in fsdax mode Zero and truncate on a dax file may execute CoW. So use dax ops which contains end work for CoW. Link: https://lkml.kernel.org/r/1669908730-131-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/xfs/xfs_iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 881de99766ca..d9401d0300ad 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1370,7 +1370,7 @@ xfs_zero_range( if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, &xfs_buffered_write_iomap_ops); } @@ -1385,7 +1385,7 @@ xfs_truncate_page( if (IS_DAX(inode)) return dax_truncate_page(inode, pos, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, &xfs_buffered_write_iomap_ops); } -- cgit From d984648e428bf88cbd94ebe346c73632cb92fffb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:33 +0000 Subject: fsdax,xfs: port unshare to fsdax Implement unshare in fsdax mode: copy data from srcmap to iomap. Link: https://lkml.kernel.org/r/1669908753-169-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_reflink.c | 8 ++++++-- 2 files changed, 58 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 8fb928cd9dce..c48a3a93ab29 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1245,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ +static s64 dax_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + int id = 0; + s64 ret = 0; + void *daddr = NULL, *saddr = NULL; + + /* don't bother with blocks that are not shared to start with */ + if (!(iomap->flags & IOMAP_F_SHARED)) + return length; + /* don't bother with holes or unwritten extents */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + id = dax_read_lock(); + ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = copy_mc_to_kernel(daddr, saddr, length); + if (ret) + ret = -EIO; + +out_unlock: + dax_read_unlock(id); + return ret; +} + +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_unshare_iter(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(dax_file_unshare); + static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 93bdd25680bc..fe46bce8cae6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1693,8 +1693,12 @@ xfs_reflink_unshare( inode_dio_wait(inode); - error = iomap_file_unshare(inode, offset, len, - &xfs_buffered_write_iomap_ops); + if (IS_DAX(inode)) + error = 
dax_file_unshare(inode, offset, len, + &xfs_dax_write_iomap_ops); + else + error = iomap_file_unshare(inode, offset, len, + &xfs_buffered_write_iomap_ops); if (error) goto out; -- cgit From 480017957d6380d3336a8e80ad90f70415bb86f7 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:53 +0000 Subject: xfs: remove restrictions for fsdax and reflink Since the basic functionality for fsdax and reflink has been implemented, remove the restrictions on them to allow wider testing. Link: https://lkml.kernel.org/r/1669908773-207-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/xfs/xfs_ioctl.c | 4 ---- fs/xfs/xfs_iops.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f783e979629..13f1b2add390 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags( if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; - /* Don't allow us to set DAX mode for a reflinked file for now. */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) - return -EINVAL; - /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); if (i_flags2 && !xfs_has_v3inodes(mp)) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2e10e1c66ad6..bf0495f7a5e1 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1185,10 +1185,6 @@ xfs_inode_supports_dax( if (!S_ISREG(VFS_I(ip)->i_mode)) return false; - /* Only supported on non-reflinked files. */ - if (xfs_is_reflink_inode(ip)) - return false; - /* Block size must match page size */ if (mp->m_sb.sb_blocksize != PAGE_SIZE) return false; -- cgit From a11774122180a782b327b0a9a5091d99c91a4db7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:38 +0100 Subject: exfat: remove ->writepage Patch series "start removing writepage instances v2". The VM doesn't need or want ->writepage for writeback and is fine with just having ->writepages as long as ->migrate_folio is implemented. This series removes all ->writepage instances that use block_write_full_page directly and also have a plain mpage_writepages based ->writepages. This patch (of 7): ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
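The end state of each conversion in this series is the same. As a rough sketch of the resulting pattern for exfat (the ->writepages body here is an assumption based on the series description, not quoted from the patch):

	static int exfat_writepages(struct address_space *mapping,
				    struct writeback_control *wbc)
	{
		/* writeback goes through the mpage helper; no ->writepage needed */
		return mpage_writepages(mapping, wbc, exfat_get_block);
	}

	static const struct address_space_operations exfat_aops = {
		.writepages	= exfat_writepages,
		/* migration no longer needs the ->writepage fallback */
		.migrate_folio	= buffer_migrate_folio,
		/* read_folio, write_begin, write_end, etc. as in the hunk below */
	};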
Link: https://lkml.kernel.org/r/20221202102644.770505-1-hch@lst.de Link: https://lkml.kernel.org/r/20221202102644.770505-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Namjae Jeon Acked-by: Johannes Weiner Cc: Bob Copeland Cc: Dave Kleikamp Cc: Jan Kara Cc: Mikulas Patocka Cc: OGAWA Hirofumi Cc: Sungjong Seo Signed-off-by: Andrew Morton --- fs/exfat/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 5590a1e83126..eac95bcd9a8a 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -345,11 +345,6 @@ static void exfat_readahead(struct readahead_control *rac) mpage_readahead(rac, exfat_get_block); } -static int exfat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, exfat_get_block, wbc); -} - static int exfat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -473,12 +468,12 @@ static const struct address_space_operations exfat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = exfat_read_folio, .readahead = exfat_readahead, - .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, .write_end = exfat_write_end, .direct_IO = exfat_direct_IO, - .bmap = exfat_aop_bmap + .bmap = exfat_aop_bmap, + .migrate_folio = buffer_migrate_folio, }; static inline unsigned long exfat_hash(loff_t i_pos) -- cgit From ee649af0d9a60ea61a5dad99ef5d6b4aa346f0a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:39 +0100 Subject: fat: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-3-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/fat/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1cbcc4608dc7..d99b8549ec8f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -194,11 +194,6 @@ static int fat_get_block(struct inode *inode, sector_t iblock, return 0; } -static int fat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, fat_get_block, wbc); -} - static int fat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -346,12 +341,12 @@ static const struct address_space_operations fat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = fat_read_folio, .readahead = fat_readahead, - .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, .write_end = fat_write_end, .direct_IO = fat_direct_IO, - .bmap = _fat_bmap + .bmap = _fat_bmap, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit From ba195d9f14829690b8e4f67549960d83169a314e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:40 +0100 Subject: hfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and stop wiring up ->writepage for hfs_aops.
Link: https://lkml.kernel.org/r/20221202102644.770505-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..16466a5e88b4 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -173,12 +173,12 @@ const struct address_space_operations hfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, - .writepage = hfs_writepage, .write_begin = hfs_write_begin, .write_end = generic_write_end, .bmap = hfs_bmap, .direct_IO = hfs_direct_IO, .writepages = hfs_writepages, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit From 12f9b9a73dc603e658bf24eed2777cecdaf4103e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:41 +0100 Subject: hfsplus: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and stop wiring up ->writepage for hfsplus_aops. Link: https://lkml.kernel.org/r/20221202102644.770505-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hfsplus/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..d6572ad2407a 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -170,12 +170,12 @@ const struct address_space_operations hfsplus_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, - .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .bmap = hfsplus_bmap, .direct_IO = hfsplus_direct_IO, .writepages = hfsplus_writepages, + .migrate_folio = buffer_migrate_folio, }; const struct dentry_operations hfsplus_dentry_operations = { -- cgit From cd2e6024260de27a523e0af6ee47a20a6b8b8aa8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:42 +0100 Subject: hpfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221202102644.770505-6-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hpfs/file.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index f7547a62c81f..88952d4a631e 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -163,11 +163,6 @@ static int hpfs_read_folio(struct file *file, struct folio *folio) return mpage_read_folio(folio, hpfs_get_block); } -static int hpfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, hpfs_get_block, wbc); -} - static void hpfs_readahead(struct readahead_control *rac) { mpage_readahead(rac, hpfs_get_block); @@ -248,12 +243,12 @@ const struct address_space_operations hpfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hpfs_read_folio, - .writepage = hpfs_writepage, .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, - .bmap = _hpfs_bmap + .bmap = _hpfs_bmap, + .migrate_folio = buffer_migrate_folio, }; const struct file_operations hpfs_file_ops = -- cgit From 2274c3b281bb47e6980ae42fb8dc93b7a38192d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:43 +0100 Subject: jfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Dave Kleikamp Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/jfs/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index d1ec920aa030..8ac10e396050 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -264,11 +264,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, return rc; } -static int jfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, jfs_get_block, wbc); -} - static int jfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -355,12 +350,12 @@ const struct address_space_operations jfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = jfs_read_folio, .readahead = jfs_readahead, - .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, .write_end = jfs_write_end, .bmap = jfs_bmap, .direct_IO = jfs_direct_IO, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit From 1bda9dad5aa0199c8592bac32b91afbf8ea236ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:44 +0100 Subject: omfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221202102644.770505-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Bob Copeland Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/omfs/file.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/omfs/file.c b/fs/omfs/file.c index fa7fe2393ff6..3a5b4b88a583 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -294,11 +294,6 @@ static void omfs_readahead(struct readahead_control *rac) mpage_readahead(rac, omfs_get_block); } -static int omfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, omfs_get_block, wbc); -} - static int omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -375,10 +370,10 @@ const struct address_space_operations omfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = omfs_read_folio, .readahead = omfs_readahead, - .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, .write_end = generic_write_end, .bmap = omfs_bmap, + .migrate_folio = buffer_migrate_folio, }; -- cgit From 8614d6c5eda005ad72b37afeaae2879d7c101b18 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 5 Dec 2022 18:30:07 +0100 Subject: mm: do not show fs mm pc for VM_LOCKONFAULT pages When VM_LOCKONFAULT was added, /proc/PID/smaps wasn't hooked up to it, so looking at /proc/PID/smaps, it shows '??' instead of something intelligible. This can be reached by userspace by simply calling `mlock2(..., MLOCK_ONFAULT);`. Fix this by adding "lf" to denote VM_LOCKONFAULT. Link: https://lkml.kernel.org/r/20221205173007.580210-1-Jason@zx2c4.com Fixes: de60f5f10c58 ("mm: introduce VM_LOCKONFAULT") Signed-off-by: Jason A. Donenfeld Acked-by: Vlastimil Babka Cc: Eric B Munson Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 89338950afd3..e35a0398db63 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -674,6 +674,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", -- cgit From c9a934c7d88413a35861387a11e901554810b122 Mon Sep 17 00:00:00 2001 From: Alexey Asemov Date: Sun, 27 Nov 2022 09:46:38 +0300 Subject: ocfs2: always read both high and low parts of dinode link count When the filesystem is using the indexed-dirs feature, maximum link count values can spill over to i_links_count_hi, up to OCFS2_DX_LINK_MAX links. ocfs2_read_links_count() checks for the OCFS2_INDEXED_DIR_FL flag in the dinode, but this flag is only valid for directories, so for files the check causes the high part of the link count not to be read back from file dinodes, resulting in a wrong link count value when a file has more than 65535 links. As ocfs2_set_links_count() always writes both high and low parts of the link count, the flag check on reading may be removed.
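To make the overflow concrete with a hypothetical value: for a regular file with 70000 links, the two on-disk halves hold i_links_count = 70000 & 0xffff = 4464 and i_links_count_hi = 70000 >> 16 = 1. The unconditional read in the hunk below reconstructs 4464 | (1 << OCFS2_LINKS_HI_SHIFT) = 70000 (the shift is 16), whereas the old flag-gated path returned just 4464 for files, since OCFS2_INDEXED_DIR_FL is only set on directories.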
Link: https://lkml.kernel.org/r/cbfca02b-b39f-89de-e1a8-904a6c60407e@alex-at.net Signed-off-by: Alexey Asemov Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ocfs2.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 740b64238312..a503c553bab2 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); - if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) - nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } -- cgit From 8d824e69d9f3fa3121b2dda25053bae71e2460d2 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Wed, 30 Nov 2022 06:59:59 +0000 Subject: hfs: fix OOB Read in __hfs_brec_find Syzbot reported an OOB read bug: ================================================================== BUG: KASAN: slab-out-of-bounds in hfs_strcmp+0x117/0x190 fs/hfs/string.c:84 Read of size 1 at addr ffff88807eb62c4e by task kworker/u4:1/11 CPU: 1 PID: 11 Comm: kworker/u4:1 Not tainted 6.1.0-rc6-syzkaller-00308-g644e9524388a #0 Workqueue: writeback wb_workfn (flush-7:0) Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 print_address_description+0x74/0x340 mm/kasan/report.c:284 print_report+0x107/0x1f0 mm/kasan/report.c:395 kasan_report+0xcd/0x100 mm/kasan/report.c:495 hfs_strcmp+0x117/0x190 fs/hfs/string.c:84 __hfs_brec_find+0x213/0x5c0 fs/hfs/bfind.c:75 hfs_brec_find+0x276/0x520 fs/hfs/bfind.c:138 hfs_write_inode+0x34c/0xb40 fs/hfs/inode.c:462 write_inode fs/fs-writeback.c:1440 [inline] If the input inode of hfs_write_inode() is incorrect: struct inode struct hfs_inode_info struct hfs_cat_key struct hfs_name u8 len # len is greater than HFS_NAMELEN (31), which is the maximum length of an HFS filename, an OOB read occurred: hfs_write_inode() hfs_brec_find() __hfs_brec_find() hfs_cat_keycmp() hfs_strcmp() # OOB read occurred because len is too large Fix this by adding a check on len in hfs_write_inode() before calling hfs_brec_find(). Link: https://lkml.kernel.org/r/20221130065959.2168236-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reported-by: Cc: Damien Le Moal Cc: Ira Weiny Cc: Jeff Layton Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Nanyong Sun Cc: Viacheslav Dubeyko Signed-off-by: Andrew Morton --- fs/hfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..a0746be3c1de 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -458,6 +458,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) /* panic? */ return -EIO; + if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) + return -EIO; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) /* panic? 
*/ -- cgit From c53ed55cb275344086e32a7080a6b19cb183650b Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 2 Dec 2022 03:00:38 +0000 Subject: hfs: Fix OOB Write in hfs_asc2mac Syzbot reported an OOB write bug: loop0: detected capacity change from 0 to 64 ================================================================== BUG: KASAN: slab-out-of-bounds in hfs_asc2mac+0x467/0x9a0 fs/hfs/trans.c:133 Write of size 1 at addr ffff88801848314e by task syz-executor391/3632 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 print_address_description+0x74/0x340 mm/kasan/report.c:284 print_report+0x107/0x1f0 mm/kasan/report.c:395 kasan_report+0xcd/0x100 mm/kasan/report.c:495 hfs_asc2mac+0x467/0x9a0 fs/hfs/trans.c:133 hfs_cat_build_key+0x92/0x170 fs/hfs/catalog.c:28 hfs_lookup+0x1ab/0x2c0 fs/hfs/dir.c:31 lookup_open fs/namei.c:3391 [inline] open_last_lookups fs/namei.c:3481 [inline] path_openat+0x10e6/0x2df0 fs/namei.c:3710 do_filp_open+0x264/0x4f0 fs/namei.c:3740 If in->len is much larger than HFS_NAMELEN (31), which is the maximum length of an HFS filename, an OOB write could occur in hfs_asc2mac(). In that case, when the dst reaches the boundary, the srclen is still greater than 0, which causes an OOB write. Fix this by adding a check on dstlen in the while() loop before writing to the dst address. Link: https://lkml.kernel.org/r/20221202030038.1391945-1-zhangpeng362@huawei.com Fixes: 328b92278650 ("[PATCH] hfs: NLS support") Signed-off-by: ZhangPeng Reviewed-by: Viacheslav Dubeyko Reported-by: Signed-off-by: Andrew Morton --- fs/hfs/trans.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index 39f5e343bf4d..fdb0edb8a607 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr if (nls_io) { wchar_t ch; - while (srclen > 0) { + while (srclen > 0 && dstlen > 0) { size = nls_io->char2uni(src, srclen, &ch); if (size < 0) { ch = '?'; -- cgit From 9f2b5debc07073e6dfdd774e3594d0224b991927 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Wed, 7 Dec 2022 03:05:40 +0000 Subject: hfsplus: fix bug causing custom uid and gid being unable to be assigned with mount Despite specifying UID and GID in the mount command, the specified UID and GID were not being assigned. This patch fixes this issue.
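As a usage illustration (device path and ids hypothetical): after this fix, mount -t hfsplus -o uid=1000,gid=1000 /dev/sdX /mnt makes every inode in the mount appear owned by 1000:1000 even when the on-disk owner and mode are set, whereas previously the override was only applied when the on-disk owner and mode both read back as zero.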
Link: https://lkml.kernel.org/r/C0264BF5-059C-45CF-B8DA-3A3BD2C803A2@live.com Signed-off-by: Aditya Garg Reviewed-by: Viacheslav Dubeyko Cc: Signed-off-by: Andrew Morton --- fs/hfsplus/hfsplus_fs.h | 2 ++ fs/hfsplus/inode.c | 4 ++-- fs/hfsplus/options.c | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a5db2e3b2980..6aa919e59483 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -198,6 +198,8 @@ struct hfsplus_sb_info { #define HFSPLUS_SB_HFSX 3 #define HFSPLUS_SB_CASEFOLD 4 #define HFSPLUS_SB_NOBARRIER 5 +#define HFSPLUS_SB_UID 6 +#define HFSPLUS_SB_GID 7 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..b675581aa9d0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -192,11 +192,11 @@ static void hfsplus_get_perms(struct inode *inode, mode = be16_to_cpu(perms->mode); i_uid_write(inode, be32_to_cpu(perms->owner)); - if (!i_uid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode)) inode->i_uid = sbi->uid; i_gid_write(inode, be32_to_cpu(perms->group)); - if (!i_gid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode)) inode->i_gid = sbi->gid; if (dir) { diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 047e05c57560..c94a58762ad6 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!uid_valid(sbi->uid)) { pr_err("invalid uid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_UID, &sbi->flags); } break; case opt_gid: @@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!gid_valid(sbi->gid)) { pr_err("invalid gid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_GID, &sbi->flags); } break; case opt_part: -- cgit From e4b731ccb0975fd97283e0c0d9841a89063ec31a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 18 Oct 2022 09:03:29 +0800 Subject: ceph: remove useless session parameter for check_caps() The session parameter makes no sense any more. Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- fs/ceph/caps.c | 23 +++++++++-------------- fs/ceph/file.c | 17 +++++++---------- fs/ceph/inode.c | 6 +++--- fs/ceph/ioctl.c | 2 +- fs/ceph/super.h | 3 +-- 6 files changed, 22 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dcf701b05cc1..ff6e3c279a79 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1367,7 +1367,7 @@ out: folio_put(folio); if (check_cap) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY); return copied; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e54814d0c2f7..1323fa28ab01 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1898,8 +1898,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. 
*/ -void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session) +void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); @@ -1913,15 +1912,12 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, bool queue_invalidate = false; bool tried_invalidate = false; bool queue_writeback = false; - - if (session) - ceph_get_mds_session(session); + struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); - ceph_put_mds_session(session); return; } @@ -2851,7 +2847,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) check = 1; spin_unlock(&ci->i_ceph_lock); if (check) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } static inline int get_used_fmode(int caps) @@ -3140,7 +3136,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, switch (mode) { case PUT_CAP_REFS_SYNC: if (last) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); else if (flushsnaps) ceph_flush_snaps(ci, NULL); break; @@ -3255,7 +3251,7 @@ unlock: spin_unlock(&ci->i_ceph_lock); if (last) { - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } @@ -3604,10 +3600,9 @@ static void handle_cap_grant(struct inode *inode, mutex_unlock(&session->s_mutex); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, - session); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); + ceph_check_caps(ci, CHECK_CAPS_NOINVAL); } /* @@ -4333,7 +4328,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) if (inode) { spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } @@ -4362,7 +4357,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); - ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 04fd34557de8..4e68220bc06d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -313,7 +313,7 @@ int ceph_renew_caps(struct inode *inode, int fmode) spin_unlock(&ci->i_ceph_lock); dout("renew caps %p want %s issued %s updating mds_wanted\n", inode, ceph_cap_string(wanted), ceph_cap_string(issued)); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return 0; } spin_unlock(&ci->i_ceph_lock); @@ -408,7 +408,7 @@ int ceph_open(struct inode *inode, struct file *file) if ((issued & wanted) != wanted && (mds_wanted & wanted) != wanted && ceph_snap(inode) != CEPH_SNAPDIR) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return ceph_init_file(inode, file, fmode); } else if (ceph_snap(inode) != CEPH_NOSNAP && @@ -1092,7 +1092,7 @@ static void ceph_aio_complete(struct inode *inode, loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; if (endoff > i_size_read(inode)) { if (ceph_inode_set_size(inode, endoff)) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, 
CHECK_CAPS_AUTHONLY); } spin_lock(&ci->i_ceph_lock); @@ -1421,8 +1421,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write && pos > size) { if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); + CHECK_CAPS_AUTHONLY); } } @@ -1577,8 +1576,7 @@ out: check_caps = ceph_inode_set_size(inode, pos); if (check_caps) ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); + CHECK_CAPS_AUTHONLY); } } @@ -1906,7 +1904,7 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -2521,8 +2519,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, /* Let the MDS know about dst file size change */ if (ceph_inode_set_size(dst_inode, dst_off) || ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, - NULL); + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bad9eeb6a1a5..12173c00129f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1909,7 +1909,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_unlock(&ci->i_truncate_mutex); out: if (check) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } /* @@ -1969,7 +1969,7 @@ retry: mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); wake_up_all(&ci->i_cap_wq); } @@ -1991,7 +1991,7 @@ static void ceph_inode_work(struct work_struct *work) __ceph_do_pending_vmtruncate(inode); if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) ceph_flush_snaps(ci, NULL); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 6e061bf62ad4..deac817647eb 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -253,7 +253,7 @@ static long ceph_ioctl_lazyio(struct file *file) spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } else { dout("ioctl_layzio: file %p already lazy\n", file); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 40630e6f691c..e8bc1d0d2614 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1200,8 +1200,7 @@ extern void ceph_remove_capsnap(struct inode *inode, extern void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession); extern bool __ceph_should_report_size(struct ceph_inode_info *ci); -extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags); extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern int ceph_drop_caps_for_unlink(struct inode *inode); -- cgit From 68c62bee9d081cf815310b3a96e38d94fc16007d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 17 Oct 2022 22:17:35 +0800 Subject: ceph: try to check caps immediately after async creating finishes We should call the check_caps() again immediately after the async creating finishes in case the MDS is waiting for caps revocation to 
finish. Link: https://tracker.ceph.com/issues/46904 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ fs/ceph/file.c | 9 +++++++++ fs/ceph/super.h | 2 ++ 3 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1323fa28ab01..4b159f97fe7b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1916,6 +1916,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags) spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; + /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); return; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4e68220bc06d..a6681b12e280 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -534,14 +534,23 @@ static void wake_async_create_waiters(struct inode *inode, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + bool check_cap = false; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + + if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS; + check_cap = true; + } } ceph_kick_flushing_inode_caps(session, ci); spin_unlock(&ci->i_ceph_lock); + + if (check_cap) + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } static void ceph_async_create_cb(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e8bc1d0d2614..e1bd6f487226 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -593,6 +593,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) #define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ +#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async + creating finishes */ /* * Masks of ceph inode work. -- cgit From c19204cbd65c12fdcd34fb8f5d645007238ed5cd Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 8 Dec 2022 16:11:00 -0600 Subject: cifs: minor cleanup of some headers checkpatch showed formatting problems: extra spaces, an extra semicolon, and some missing blank lines in some cifs headers.
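Problems of this kind are what the kernel's own style checker reports; a typical invocation against a whole file (rather than a patch) would be something like: ./scripts/checkpatch.pl -f fs/cifs/cifsglob.h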
Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Germano Percossi Signed-off-by: Steve French --- fs/cifs/cifs_ioctl.h | 2 +- fs/cifs/cifsfs.h | 4 ++-- fs/cifs/cifsglob.h | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index d86d78d5bfdc..332588e77c31 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -108,7 +108,7 @@ struct smb3_notify_info { #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) #define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) #define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) -#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) +#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* * Flags for going down operation diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 388b745a978e..00a573e0ad0e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -105,8 +105,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); extern int cifs_flush(struct file *, fl_owner_t id); -extern int cifs_file_mmap(struct file * , struct vm_area_struct *); -extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma); +extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, struct dir_context *ctx); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1420acf987f0..cd3a173e65b1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -785,6 +785,7 @@ static inline unsigned int in_flight(struct TCP_Server_Info *server) { unsigned int num; + spin_lock(&server->req_lock); num = server->in_flight; spin_unlock(&server->req_lock); @@ -795,6 +796,7 @@ static inline bool has_credits(struct TCP_Server_Info *server, int *credits, int num_credits) { int num; + spin_lock(&server->req_lock); num = *credits; spin_unlock(&server->req_lock); @@ -1025,7 +1027,7 @@ struct cifs_ses { struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */ - unsigned overrideSecFlg; /* if non-zero override global sec flags */ + unsigned int overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ @@ -1381,7 +1383,7 @@ struct cifsFileInfo { __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ struct list_head rlist; /* reconnect list */ - /* BB add lock scope info here if needed */ ; + /* BB add lock scope info here if needed */ /* lock scope id (0 if none) */ struct dentry *dentry; struct tcon_link *tlink; @@ -1769,6 +1771,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, int number_of_items) { int i; + if ((number_of_items == 0) || (param == NULL)) return; for (i = 0; i < number_of_items; i++) { -- cgit From 9544597b5b63ff1674d60e069f93555ab924b62b Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 9 Dec 2022 16:55:51 -0600 Subject: cifs: fix various whitespace errors in 
headers Fix some extra spaces and a few comments that were unnecessarily split over two lines. These were some trivial issues pointed out by checkpatch) Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 50 +++++++++++++++++++++----------------------------- fs/cifs/cifsproto.h | 2 +- 2 files changed, 22 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index d1abaeea974a..623caece2b10 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1429,7 +1429,7 @@ typedef struct smb_com_transaction_change_notify_req { __u8 WatchTree; /* 1 = Monitor subdirectories */ __u8 Reserved2; __le16 ByteCount; -/* __u8 Pad[3];*/ +/* __u8 Pad[3];*/ /* __u8 Data[1];*/ } __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; @@ -1752,8 +1752,7 @@ struct smb_com_transaction2_sfi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ } __attribute__((packed)); struct smb_t2_qfi_req { @@ -1768,8 +1767,7 @@ struct smb_t2_qfi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ } __attribute__((packed)); /* @@ -2146,13 +2144,11 @@ typedef struct { #define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based calls including posix open and posix unlink */ -#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up - to 0xFFFF00 */ +#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up to 0xFFFF00 */ #define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 #define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ #define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ -#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and - QFS PROXY call */ +#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX /* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send LockingX instead of posix locking call on unix sess (and we do not expect @@ -2368,8 +2364,7 @@ typedef struct { struct file_allocation_info { __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ -} __attribute__((packed)); /* size used on disk, for level 0x103 for set, - 0x105 for query */ +} __packed; /* size used on disk, for level 0x103 for set, 0x105 for query */ struct file_end_of_file_info { __le64 FileSize; /* offset to end of file */ @@ -2409,8 +2404,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ __le16 access_entry_count; /* access ACL - count of entries */ __le16 default_entry_count; /* default ACL - count of entries */ struct cifs_posix_ace ace_array[]; - /* followed by - struct cifs_posix_ace default_ace_arraay[] */ + /* followed by struct cifs_posix_ace default_ace_array[] */ } __attribute__((packed)); /* level 0x204 */ /* types of access control entries already defined in posix_acl.h */ @@ -2429,17 +2423,17 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ /* end of POSIX ACL definitions */ /* POSIX Open Flags */ -#define SMB_O_RDONLY 0x1 -#define SMB_O_WRONLY 0x2 -#define SMB_O_RDWR 0x4 -#define SMB_O_CREAT 0x10 -#define SMB_O_EXCL 0x20 -#define SMB_O_TRUNC 0x40 
-#define SMB_O_APPEND 0x80 -#define SMB_O_SYNC 0x100 -#define SMB_O_DIRECTORY 0x200 -#define SMB_O_NOFOLLOW 0x400 -#define SMB_O_DIRECT 0x800 +#define SMB_O_RDONLY 0x1 +#define SMB_O_WRONLY 0x2 +#define SMB_O_RDWR 0x4 +#define SMB_O_CREAT 0x10 +#define SMB_O_EXCL 0x20 +#define SMB_O_TRUNC 0x40 +#define SMB_O_APPEND 0x80 +#define SMB_O_SYNC 0x100 +#define SMB_O_DIRECTORY 0x200 +#define SMB_O_NOFOLLOW 0x400 +#define SMB_O_DIRECT 0x800 typedef struct { __le32 OpenFlags; /* same as NT CreateX */ @@ -2716,15 +2710,13 @@ typedef struct file_xattr_info { __u32 xattr_value_len; char xattr_name[]; /* followed by xattr_value[xattr_value_len], no pad */ -} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info - level 0x205 */ +} __packed FILE_XATTR_INFO; /* extended attribute info level 0x205 */ /* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */ typedef struct file_chattr_info { __le64 mask; /* list of all possible attribute bits */ __le64 mode; /* list of actual attribute bits on this inode */ -} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes - (chattr, chflags) level 0x206 */ -#endif /* POSIX */ +} __packed FILE_CHATTR_INFO; /* ext attributes (chattr, chflags) level 0x206 */ +#endif /* POSIX */ #endif /* _CIFSPDU_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 83e83d8beabb..f216fa269c85 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -124,7 +124,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec * /* resp vec */); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, - struct smb_hdr *in_buf , + struct smb_hdr *in_buf, struct smb_hdr *out_buf, int *bytes_returned); void -- cgit From 2bfd81043e944af0e52835ef6d9b41795af22341 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 11 Dec 2022 13:54:21 -0600 Subject: cifs: fix missing display of three mount options Three mount options: "tcpnodelay" and "noautotune" and "noblocksend" were not displayed when passed in on cifs/smb3 mounts (e.g. displayed in /proc/mounts e.g.). No change to defaults so these are not displayed if not specified on mount. Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 712a43161448..6094cb2ff099 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -678,9 +678,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); - /* Only display max_credits if it was overridden on mount */ + /* Only display the following if overridden on mount */ if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE) seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits); + if (tcon->ses->server->tcp_nodelay) + seq_puts(s, ",tcpnodelay"); + if (tcon->ses->server->noautotune) + seq_puts(s, ",noautotune"); + if (tcon->ses->server->noblocksnd) + seq_puts(s, ",noblocksend"); if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); -- cgit From 9d91f8108ebfed54284332e04d2073107df18794 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 11 Dec 2022 14:44:31 -0600 Subject: cifs: print warning when conflicting soft vs. hard mount options specified If the user specifies conflicting hard vs. soft mount options (or nosoft vs. 
nohard) print a warning to dmesg We were missing a warning when a user e.g. mounted with both "hard,soft" mount options. Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/fs_context.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 45119597c765..2c92a821e028 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -884,16 +884,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->nodfs = 1; break; case Opt_hard: - if (result.negated) + if (result.negated) { + if (ctx->retry == 1) + cifs_dbg(VFS, "conflicting hard vs. soft mount options\n"); ctx->retry = 0; - else + } else ctx->retry = 1; break; case Opt_soft: if (result.negated) ctx->retry = 1; - else + else { + if (ctx->retry == 1) + cifs_dbg(VFS, "conflicting hard vs soft mount options\n"); ctx->retry = 0; + } break; case Opt_mapposix: if (result.negated) -- cgit From f7f291e14dde32a07b1f0aa06921d28f875a7b54 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 11 Dec 2022 18:18:55 -0300 Subject: cifs: fix oops during encryption When running xfstests against Azure the following oops occurred on an arm64 system Unable to handle kernel write to read-only memory at virtual address ffff0001221cf000 Mem abort info: ESR = 0x9600004f EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x0f: level 3 permission fault Data abort info: ISV = 0, ISS = 0x0000004f CM = 0, WnR = 1 swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000294f3000 [ffff0001221cf000] pgd=18000001ffff8003, p4d=18000001ffff8003, pud=18000001ff82e003, pmd=18000001ff71d003, pte=00600001221cf787 Internal error: Oops: 9600004f [#1] PREEMPT SMP ... pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--) pc : __memcpy+0x40/0x230 lr : scatterwalk_copychunks+0xe0/0x200 sp : ffff800014e92de0 x29: ffff800014e92de0 x28: ffff000114f9de80 x27: 0000000000000008 x26: 0000000000000008 x25: ffff800014e92e78 x24: 0000000000000008 x23: 0000000000000001 x22: 0000040000000000 x21: ffff000000000000 x20: 0000000000000001 x19: ffff0001037c4488 x18: 0000000000000014 x17: 235e1c0d6efa9661 x16: a435f9576b6edd6c x15: 0000000000000058 x14: 0000000000000001 x13: 0000000000000008 x12: ffff000114f2e590 x11: ffffffffffffffff x10: 0000040000000000 x9 : ffff8000105c3580 x8 : 2e9413b10000001a x7 : 534b4410fb86b005 x6 : 534b4410fb86b005 x5 : ffff0001221cf008 x4 : ffff0001037c4490 x3 : 0000000000000001 x2 : 0000000000000008 x1 : ffff0001037c4488 x0 : ffff0001221cf000 Call trace: __memcpy+0x40/0x230 scatterwalk_map_and_copy+0x98/0x100 crypto_ccm_encrypt+0x150/0x180 crypto_aead_encrypt+0x2c/0x40 crypt_message+0x750/0x880 smb3_init_transform_rq+0x298/0x340 smb_send_rqst.part.11+0xd8/0x180 smb_send_rqst+0x3c/0x100 compound_send_recv+0x534/0xbc0 smb2_query_info_compound+0x32c/0x440 smb2_set_ea+0x438/0x4c0 cifs_xattr_set+0x5d4/0x7c0 This is because in scatterwalk_copychunks(), we attempted to write to a buffer (@sign) that was allocated in the stack (vmalloc area) by crypt_message() and thus accessing its remaining 8 (x2) bytes ended up crossing a page boundary. To simply fix it, we could just pass @sign kmalloc'd from crypt_message() and then we're done. Luckily, we don't seem to pass any other vmalloc'd buffers in smb_rqst::rq_iov... Instead, let's map the correct pages and offsets from vmalloc buffers as well in cifs_sg_set_buf() and then avoiding such oopses. 
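The heart of the fix is that scatterlist entries must be built page by page whenever a buffer may live in the vmalloc area, which with CONFIG_VMAP_STACK includes on-stack buffers such as the signature buffer (@sign) above: vmalloc memory is only virtually contiguous, so a single sg entry spanning a page boundary walks into an unrelated physical page. A condensed sketch of the per-page walk, simplified from the cifs_sg_set_buf() added below (the non-vmalloc fast path is omitted):

	unsigned long addr = (unsigned long)buf & PAGE_MASK;
	unsigned int off = offset_in_page(buf);

	while (buflen) {
		unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off);

		/* one sg entry per page of the vmalloc'd buffer */
		sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off);
		off = 0;
		addr += PAGE_SIZE;
		buflen -= len;
	}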
Signed-off-by: Paulo Alcantara (SUSE) Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 68 +++++++++++++++++++++++++ fs/cifs/cifsproto.h | 4 +- fs/cifs/misc.c | 4 +- fs/cifs/smb2ops.c | 143 +++++++++++++++++++++++++--------------------------- 4 files changed, 140 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cd3a173e65b1..703685e2db5e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -2140,4 +2142,70 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const dst->FileNameLength = src->FileNameLength; } +static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, + int num_rqst, + const u8 *sig) +{ + unsigned int len, skip; + unsigned int nents = 0; + unsigned long addr; + int i, j; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ + for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ + for (j = 0; j < rqst[i].rq_nvec; j++) { + struct kvec *iov = &rqst[i].rq_iov[j]; + + skip = (i == 0) && (j == 0) ? 20 : 0; + addr = (unsigned long)iov->iov_base + skip; + if (unlikely(is_vmalloc_addr((void *)addr))) { + len = iov->iov_len - skip; + nents += DIV_ROUND_UP(offset_in_page(addr) + len, + PAGE_SIZE); + } else { + nents++; + } + } + nents += rqst[i].rq_npages; + } + nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); + return nents; +} + +/* We can not use the normal sg_set_buf() as we will sometimes pass a + * stack object as buf. 
+ */ +static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, + const void *buf, + unsigned int buflen) +{ + unsigned long addr = (unsigned long)buf; + unsigned int off = offset_in_page(addr); + + addr &= PAGE_MASK; + if (unlikely(is_vmalloc_addr((void *)addr))) { + do { + unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); + + sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off); + + off = 0; + addr += PAGE_SIZE; + buflen -= len; + } while (buflen); + } else { + sg_set_page(sg++, virt_to_page(addr), buflen, off); + } + return sg; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f216fa269c85..9c6147ca029d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -600,8 +600,8 @@ int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset); +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3e68d8208cf5..1cbecd64d697 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1136,8 +1136,8 @@ cifs_free_hash(struct shash_desc **sdesc) * @len: Where to store the length for this page: * @offset: Where to store the offset for this page */ -void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset) +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset) { *len = rqst->rq_pagesz; *offset = (page == 0) ? rqst->rq_offset : 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 72b22d033ed5..6e772b31e02a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4204,69 +4204,82 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } -/* We can not use the normal sg_set_buf() as we will sometimes pass a - * stack object as buf. 
- */ -static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) +static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl, + unsigned int *num_sgs) { - void *addr; - /* - * VMAP_STACK (at least) puts stack into the vmalloc address space - */ - if (is_vmalloc_addr(buf)) - addr = vmalloc_to_page(buf); - else - addr = virt_to_page(buf); - sg_set_page(sg, addr, buflen, offset_in_page(buf)); + unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + unsigned int iv_size = crypto_aead_ivsize(tfm); + unsigned int len; + u8 *p; + + *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig); + + len = iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += *num_sgs * sizeof(**sgl); + + p = kmalloc(len, GFP_ATOMIC); + if (!p) + return NULL; + + *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + return p; } -/* Assumes the first rqst has a transform header as the first iov. - * I.e. - * rqst[0].rq_iov[0] is transform header - * rqst[0].rq_iov[1+] data to be encrypted/decrypted - * rqst[1+].rq_iov[0+] data to be encrypted/decrypted - */ -static struct scatterlist * -init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign) +static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl) { - unsigned int sg_len; + unsigned int off, len, skip; struct scatterlist *sg; - unsigned int i; - unsigned int j; - unsigned int idx = 0; - int skip; - - sg_len = 1; - for (i = 0; i < num_rqst; i++) - sg_len += rqst[i].rq_nvec + rqst[i].rq_npages; + unsigned int num_sgs; + unsigned long addr; + int i, j; + void *p; - sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs); + if (!p) return NULL; - sg_init_table(sg, sg_len); + sg_init_table(*sgl, num_sgs); + sg = *sgl; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ for (j = 0; j < rqst[i].rq_nvec; j++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob - */ - skip = (i == 0) && (j == 0) ? 20 : 0; - smb2_sg_set_buf(&sg[idx++], - rqst[i].rq_iov[j].iov_base + skip, - rqst[i].rq_iov[j].iov_len - skip); - } + struct kvec *iov = &rqst[i].rq_iov[j]; + skip = (i == 0) && (j == 0) ? 
20 : 0; + addr = (unsigned long)iov->iov_base + skip; + len = iov->iov_len - skip; + sg = cifs_sg_set_buf(sg, (void *)addr, len); + } for (j = 0; j < rqst[i].rq_npages; j++) { - unsigned int len, offset; - - rqst_page_get_length(&rqst[i], j, &len, &offset); - sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset); + rqst_page_get_length(&rqst[i], j, &len, &off); + sg_set_page(sg++, rqst[i].rq_pages[j], len, off); } } - smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE); - return sg; + cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE); + + return p; } static int @@ -4314,11 +4327,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, u8 sign[SMB2_SIGNATURE_SIZE] = {}; u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; - char *iv; - unsigned int iv_len; + u8 *iv; DECLARE_CRYPTO_WAIT(wait); struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + void *creq; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4352,32 +4365,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - req = aead_request_alloc(tfm, GFP_KERNEL); - if (!req) { - cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__); + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); + if (unlikely(!creq)) return -ENOMEM; - } if (!enc) { memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); crypt_len += SMB2_SIGNATURE_SIZE; } - sg = init_sg(num_rqst, rqst, sign); - if (!sg) { - cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__); - rc = -ENOMEM; - goto free_req; - } - - iv_len = crypto_aead_ivsize(tfm); - iv = kzalloc(iv_len, GFP_KERNEL); - if (!iv) { - cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__); - rc = -ENOMEM; - goto free_sg; - } - if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); @@ -4386,6 +4382,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); } + aead_request_set_tfm(req, tfm); aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); @@ -4398,11 +4395,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree_sensitive(iv); -free_sg: - kfree_sensitive(sg); -free_req: - kfree_sensitive(req); + kfree_sensitive(creq); return rc; } -- cgit From a9438b44bc7015b18931e312bbd249a25bb59a65 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Dec 2022 12:36:33 +0100 Subject: writeback: Add asserts for adding freed inode to lists In the past we had several use-after-free issues with inodes getting added to writeback lists after evict() removed them. These are painful to debug so add some asserts to catch the problem earlier. The only non-obvious change in the commit is that we need to tweak redirty_tail_locked() to avoid triggering assertion in inode_io_list_move_locked(). 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221212113633.29181-1-jack@suse.cz Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 443f83382b9b..6cd172c4cb3e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -121,6 +121,7 @@ static bool inode_io_list_move_locked(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); list_move(&inode->i_io_list, head); @@ -280,6 +281,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; if (wb != &wb->bdi->wb) @@ -1129,6 +1131,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); @@ -1294,6 +1297,17 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC_QUEUED; + /* + * When the inode is being freed just don't bother with dirty list + * tracking. Flush worker will ignore this inode anyway and it will + * trigger assertions in inode_io_list_move_locked(). + */ + if (inode->i_state & I_FREEING) { + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); + return; + } if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1302,7 +1316,6 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); - inode->i_state &= ~I_SYNC_QUEUED; } static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) -- cgit From 23e188a16423a6e65290abf39dd427ff047e6843 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 10 Dec 2022 18:10:42 +0800 Subject: writeback: remove obsolete macro EXPIRE_DIRTY_ATIME EXPIRE_DIRTY_ATIME is not used anymore. Remove it. Signed-off-by: Miaohe Lin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221210101042.2012931-1-linmiaohe@huawei.com Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6cd172c4cb3e..6c113585e782 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1358,8 +1358,6 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) return ret; } -#define EXPIRE_DIRTY_ATIME 0x0001 - /* * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. -- cgit From e480751970e84bc13ab5c288dbbe16b0638cc088 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 11:37:08 +0800 Subject: f2fs: remove F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() macro F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() have never been used since they were introduced by this commit 76f105a2dbcd("f2fs: add feature facility in superblock"). So let's remove them. BTW, convert f2fs_sb_has_##name to return bool. 
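To make the bool conversion concrete, an existing instantiation such as F2FS_FEATURE_FUNCS(readonly, RO) now expands to roughly:

	static inline bool f2fs_sb_has_readonly(struct f2fs_sb_info *sbi)
	{
		return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_RO);
	}

so a caller like "if (f2fs_sb_has_readonly(sbi))" tests a genuine bool rather than an int carrying a 0/1 value, and the write-side macros disappear with no users to update.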
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 296683648d4f..cf738f1275b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -203,10 +203,6 @@ struct f2fs_mount_info { #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) -#define F2FS_SET_FEATURE(sbi, mask) \ (sbi->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sbi, mask) \ (sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* * Default values for user and/or group using reserved blocks @@ -4387,7 +4383,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } -- cgit From ed8ac22b6b75804743f1dae6563d75f85cfd1483 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 10:48:42 +0800 Subject: f2fs: introduce f2fs_is_readonly() for readability Introduce f2fs_is_readonly() and use it to simplify code. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cf738f1275b2..eb8c27c4e5fc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4575,6 +4575,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, } } +static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5f6f632cf7c..79bf1faf4161 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1351,8 +1351,7 @@ default_check: return -EINVAL; } - if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) && - test_opt(sbi, FLUSH_MERGE)) { + if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } @@ -2083,7 +2082,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - if (!f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) + if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); -- cgit From 12607c1ba7637e750402f555b6695c50fce77a2b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:36:43 -0800 Subject: f2fs: specify extent cache for read explicitly Let's describe it as the read extent cache.
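The rename is mechanical and user-visible behavior (including the extent_cache/noextent_cache mount options) is unchanged; the mapping, taken from the diff below, is roughly:

	F2FS_MOUNT_EXTENT_CACHE      -> F2FS_MOUNT_READ_EXTENT_CACHE
	EXTENT_CACHE (enum mem_type) -> READ_EXTENT_CACHE
	EXTENT_CACHE_SHRINK_NUMBER   -> READ_EXTENT_CACHE_SHRINK_NUMBER
	get_extent_info()            -> get_read_extent_info()
	set_raw_extent()             -> set_raw_read_extent()

so, for example, test_opt(sbi, EXTENT_CACHE) becomes test_opt(sbi, READ_EXTENT_CACHE) at every call site.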
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 4 ++-- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/node.h | 2 +- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 12 ++++++------ 7 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 932c070173b9..8cd87aee0292 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -383,7 +383,7 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (!i_ext || !i_ext->len) return; - get_extent_info(&ei, i_ext); + get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) @@ -710,7 +710,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, EXTENT_CACHE)) + if (!test_opt(sbi, READ_EXTENT_CACHE)) return 0; if (!atomic_read(&sbi->total_zombie_tree)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eb8c27c4e5fc..1c39f8145b61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -92,7 +92,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_USRQUOTA 0x00080000 @@ -600,7 +600,7 @@ enum { #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ -#define EXTENT_CACHE_SHRINK_NUMBER 128 +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 #define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS #define RECOVERY_MIN_RA_BLOCKS 1 @@ -830,7 +830,7 @@ struct f2fs_inode_info { loff_t original_i_size; /* original i_size before atomic write */ }; -static inline void get_extent_info(struct extent_info *ext, +static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); @@ -838,7 +838,7 @@ static inline void get_extent_info(struct extent_info *ext, ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); @@ -4407,7 +4407,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (!test_opt(sbi, EXTENT_CACHE) || + if (!test_opt(sbi, READ_EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT) || (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && !f2fs_sb_has_readonly(sbi))) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 577f109b4e1d..2c705c60019b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -629,7 +629,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (et) { read_lock(&et->lock); - set_raw_extent(&et->largest, &ri->i_ext); + set_raw_read_extent(&et->largest, &ri->i_ext); read_unlock(&et->lock); } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b9ee5a1176a0..84b147966080 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -85,7 +85,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if 
(type == EXTENT_CACHE) { + } else if (type == READ_EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3c09cae058b0..0aa48704c77a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -146,7 +146,7 @@ enum mem_type { NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ - EXTENT_CACHE, /* indicates extent cache */ + READ_EXTENT_CACHE, /* indicates read extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9486ca49ecb1..51de358bc452 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -449,8 +449,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) return; /* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 79bf1faf4161..412c2e7352c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -814,10 +814,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - clear_opt(sbi, EXTENT_CACHE); + clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); @@ -1954,7 +1954,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); - if (test_opt(sbi, EXTENT_CACHE)) + if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); @@ -2076,7 +2076,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); @@ -2218,7 +2218,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; - bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2308,7 +2308,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } /* disallow enable/disable extent_cache dynamically */ - if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; -- cgit From 3bac20a8f011b8ed4012b43f4f33010432b3c647 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 
09:44:58 -0800 Subject: f2fs: move internal functions into extent_cache.c No functional change. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 88 ++++++++++++++++++++++++++++++++++++++++++++------ fs/f2fs/f2fs.h | 69 ++------------------------------------- 2 files changed, 81 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 8cd87aee0292..2a8e31e6d518 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -15,6 +15,77 @@ #include "node.h" #include +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; + + if (keep_clen) + return; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif +} + +static bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + + if (!test_opt(sbi, READ_EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || + (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + return S_ISREG(inode->i_mode); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); +} + static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, unsigned int ofs) { @@ -591,16 +662,16 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { if (parts) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - org_end - end); + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true); next_en = en; } parts++; @@ -632,8 +703,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, /* 3. 
update extent in extent cache */ if (blkaddr) { - - set_extent_info(&ei, fofs, blkaddr, len); + __set_extent_info(&ei, fofs, len, blkaddr, false); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -692,7 +762,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - set_extent_info(&ei, fofs, blkaddr, llen); + __set_extent_info(&ei, fofs, llen, blkaddr, true); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1c39f8145b61..04fdf010bb77 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -621,7 +621,7 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - u32 blk; /* start block address of the extent */ + block_t blk; /* start block address of the extent */ #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned int c_len; /* physical extent length of compressed blocks */ #endif @@ -846,17 +846,6 @@ static inline void set_raw_read_extent(struct extent_info *ext, i_ext->len = cpu_to_le32(ext->len); } -static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, - u32 blk, unsigned int len) -{ - ei->fofs = fofs; - ei->blk = blk; - ei->len = len; -#ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; -#endif -} - static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { @@ -876,41 +865,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, return __is_discard_mergeable(cur, front, max_len); } -static inline bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) -{ -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; -#endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); -} - -static inline bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) -{ - return __is_extent_mergeable(back, cur); -} - -static inline bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) -{ - return __is_extent_mergeable(cur, front); -} - -extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) -{ - if (en->ei.len > et->largest.len) { - et->largest = en->ei; - et->largest_updated = true; - } -} - /* * For free nid management */ @@ -2581,6 +2535,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { @@ -4403,26 +4358,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. 
- */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) -- cgit From 749d543c0d451fff31e8f7a3e0a031ffcbf1ebb1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 10:01:18 -0800 Subject: f2fs: remove unnecessary __init_extent_tree Added into the caller. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2a8e31e6d518..c6810347e205 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -386,21 +386,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei) -{ - struct rb_node **p = &et->root.rb_root.rb_node; - struct extent_node *en; - - en = __attach_extent_node(sbi, et, ei, NULL, p, true); - if (!en) - return NULL; - - et->largest = en->ei; - et->cached_en = en; - return en; -} - static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et) { @@ -460,8 +445,12 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (atomic_read(&et->node_cnt)) goto out; - en = __init_extent_tree(sbi, et, &ei); + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); if (en) { + et->largest = en->ei; + et->cached_en = en; + spin_lock(&sbi->extent_lock); list_add_tail(&en->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); -- cgit From e7547daccd6a37522f0af74ec4b5a3036f3dd328 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:26:29 -0800 Subject: f2fs: refactor extent_cache to support for read and more This patch prepares extent_cache to be ready for addition. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +-- fs/f2fs/debug.c | 65 ++++--- fs/f2fs/extent_cache.c | 463 +++++++++++++++++++++++++++++-------------------- fs/f2fs/f2fs.h | 119 ++++++++----- fs/f2fs/file.c | 8 +- fs/f2fs/gc.c | 4 +- fs/f2fs/inode.c | 6 +- fs/f2fs/node.c | 8 +- fs/f2fs/segment.c | 3 +- fs/f2fs/shrinker.c | 19 +- 10 files changed, 434 insertions(+), 281 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 35c19248b1e2..75abd450730b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1126,7 +1126,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; f2fs_set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); + f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ @@ -1195,7 +1195,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) struct extent_info ei = {0, }; struct inode *inode = dn->inode; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn->data_blkaddr = ei.blk + index - ei.fofs; return 0; } @@ -1217,7 +1217,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { @@ -1485,7 +1485,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1695,7 +1695,7 @@ skip: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1740,7 +1740,7 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -2201,7 +2201,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; - if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) @@ -2635,7 +2635,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_extent_cache(inode, page->index, &ei)) { + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, @@ -3359,7 +3359,7 @@ restart: } else if (locked) { err = f2fs_get_block(&dn, index); } else { - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3400,7 +3400,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if 
(f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a216dcdf6941..a9baa121d829 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -72,15 +72,23 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); - /* validation check of the segment numbers */ + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ si->hit_largest = atomic64_read(&sbi->read_hit_largest); - si->hit_cached = atomic64_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); - si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = atomic_read(&sbi->total_ext_tree); - si->zombie_tree = atomic_read(&sbi->total_zombie_tree); - si->ext_node = atomic_read(&sbi->total_ext_node); + si->hit_total[EX_READ] += si->hit_largest; + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); @@ -294,10 +302,16 @@ get_cache: sizeof(struct nat_entry_set); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += atomic_read(&sbi->total_ext_tree) * + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree); - si->cache_mem += atomic_read(&sbi->total_ext_node) * + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } si->page_mem = 0; if (sbi->node_inode) { @@ -490,16 +504,18 @@ static int stat_show(struct seq_file *s, void *v) si->bg_node_blks); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); - seq_puts(s, "\nExtent Cache:\n"); + seq_puts(s, "\nExtent Cache (Read):\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", - si->hit_largest, si->hit_cached, - si->hit_rbtree); + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", - !si->total_ext ? 0 : - div64_u64(si->hit_total * 100, si->total_ext), - si->hit_total, si->total_ext); + !si->total_ext[EX_READ] ? 
0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", - si->ext_tree, si->zombie_tree, si->ext_node); + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -566,8 +582,10 @@ static int stat_show(struct seq_file *s, void *v) (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %llu KB\n", + seq_printf(s, " - cached all: %llu KB\n", si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } @@ -600,10 +618,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic64_set(&sbi->total_hit_ext, 0); - atomic64_set(&sbi->read_hit_rbtree, 0); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ atomic64_set(&sbi->read_hit_largest, 0); - atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6810347e205..654a14ab8977 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -17,21 +17,37 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, - block_t blk, bool keep_clen) + block_t blk, bool keep_clen, + enum extent_type type) { ei->fofs = fofs; - ei->blk = blk; ei->len = len; - if (keep_clen) - return; - + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; #ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; + ei->c_len = 0; #endif + } +} + +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); } -static bool f2fs_may_extent_tree(struct inode *inode) +static bool __may_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -42,18 +58,16 @@ static bool f2fs_may_extent_tree(struct inode *inode) if (list_empty(&sbi->s_list)) return false; - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - return S_ISREG(inode->i_mode); + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; } static void __try_update_largest_extent(struct extent_tree *et, struct extent_node *en) { + if (et->type != EX_READ) + return; if (en->ei.len <= et->largest.len) return; @@ -62,28 +76,31 @@ static void __try_update_largest_extent(struct extent_tree *et, } static bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { + if (type == EX_READ) { #ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && 
front->len != front->c_len) - return false; + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; #endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } + return false; } static bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) + struct extent_info *back, enum extent_type type) { - return __is_extent_mergeable(back, cur); + return __is_extent_mergeable(back, cur, type); } static bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { - return __is_extent_mergeable(cur, front); + return __is_extent_mergeable(cur, front, type); } static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, @@ -308,6 +325,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct rb_node *parent, struct rb_node **p, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en; en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); @@ -321,16 +339,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); - atomic_inc(&sbi->total_ext_node); + atomic_inc(&eti->total_ext_node); return en; } static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); - atomic_dec(&sbi->total_ext_node); + atomic_dec(&eti->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; @@ -346,42 +366,47 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, static void __release_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - spin_lock(&sbi->extent_lock); + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); f2fs_bug_on(sbi, list_empty(&en->list)); list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); } -static struct extent_tree *__grab_extent_tree(struct inode *inode) +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et; nid_t ino = inode->i_ino; - mutex_lock(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS, true, NULL); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; + et->type = type; et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); atomic_set(&et->node_cnt, 0); - atomic_inc(&sbi->total_ext_tree); + atomic_inc(&eti->total_ext_tree); } else { - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_zombie_tree); list_del_init(&et->list); } - mutex_unlock(&sbi->extent_tree_lock); + 
mutex_unlock(&eti->extent_tree_lock); /* never died until evict_inode */ - F2FS_I(inode)->extent_tree = et; + F2FS_I(inode)->extent_tree[type] = et; return et; } @@ -415,35 +440,38 @@ static void __drop_largest_extent(struct extent_tree *et, } /* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) { - /* drop largest extent */ - if (i_ext && i_ext->len) { + if (!__may_extent_tree(inode, type)) { + /* drop largest read extent */ + if (type == EX_READ && i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); - return; } - return; + goto out; } - et = __grab_extent_tree(inode); + et = __grab_extent_tree(inode, type); if (!i_ext || !i_ext->len) - return; + goto out; + + BUG_ON(type != EX_READ); get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) - goto out; + goto unlock_out; en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); @@ -451,37 +479,40 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) et->largest = en->ei; et->cached_en = en; - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); } -out: +unlock_out: write_unlock(&et->lock); +out: + if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + set_inode_flag(inode, FI_NO_EXTENT); } void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { - __f2fs_init_extent_tree(inode, ipage); - - if (!F2FS_I(inode)->extent_tree) - set_inode_flag(inode, FI_NO_EXTENT); + /* initialize read cache */ + __f2fs_init_extent_tree(inode, ipage, EX_READ); } -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en; bool ret = false; f2fs_bug_on(sbi, !et); - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); read_lock(&et->lock); - if (et->largest.fofs <= pgofs && + if (type == EX_READ && + et->largest.fofs <= pgofs && et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; @@ -495,23 +526,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; if (en == et->cached_en) - stat_inc_cached_node_hit(sbi); + stat_inc_cached_node_hit(sbi, type); else - stat_inc_rbtree_node_hit(sbi); + stat_inc_rbtree_node_hit(sbi, type); *ei = en->ei; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + 
spin_unlock(&eti->extent_lock); ret = true; out: - stat_inc_total_hit(sbi); + stat_inc_total_hit(sbi, type); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); return ret; } @@ -520,18 +552,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en = NULL; - if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } - if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { next_ex->ei.fofs = ei->fofs; - next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; if (en) __release_extent_node(sbi, et, prev_ex); @@ -543,12 +577,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } @@ -558,6 +592,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -580,47 +615,50 @@ do_insert: __try_update_largest_extent(et, en); /* update in global extent list */ - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); et->cached_en = en; - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } -static void f2fs_update_extent_tree_range(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int len) +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; - unsigned int pos = (unsigned int)fofs; bool updated = false; bool leftmost = false; if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); - + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); write_lock(&et->lock); - if (is_inode_flag_set(inode, FI_NO_EXTENT)) { - write_unlock(&et->lock); - return; - } + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } - prev = et->largest; - dei.len = 0; + prev = et->largest; + dei.len = 0; - /* - * drop largest extent before lookup, in case it's already - * been shrunk from extent tree - */ - __drop_largest_extent(et, fofs, len); + /* + * drop largest extent before lookup, in case it's 
already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); + } /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, @@ -641,26 +679,30 @@ static void f2fs_update_extent_tree_range(struct inode *inode, dei = en->ei; org_end = dei.fofs + dei.len; - f2fs_bug_on(sbi, pos >= org_end); + f2fs_bug_on(sbi, fofs >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - en->ei.len = pos - en->ei.fofs; + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; prev_en = en; parts = 1; } - if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { if (parts) { __set_extent_info(&ei, end, org_end - end, - end - dei.fofs + dei.blk, false); + end - dei.fofs + dei.blk, false, + type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), - en->ei.blk + (end - dei.fofs), true); + en->ei.blk + (end - dei.fofs), true, + type); next_en = en; } parts++; @@ -690,9 +732,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, en = next_en; } - /* 3. update extent in extent cache */ - if (blkaddr) { - __set_extent_info(&ei, fofs, len, blkaddr, false); + /* 3. update extent in read extent cache */ + BUG_ON(type != EX_READ); + + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -722,19 +766,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION -void f2fs_update_extent_tree_range_compressed(struct inode *inode, +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_node *en = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei; struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -751,7 +796,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true); + __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -762,24 +807,43 @@ unlock_out: } #endif -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { + struct extent_info ei; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } + __update_extent_tree_range(dn->inode, &ei, type); 
+} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et, *next; struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, READ_EXTENT_CACHE)) - return 0; - - if (!atomic_read(&sbi->total_zombie_tree)) + if (!atomic_read(&eti->total_zombie_tree)) goto free_node; - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et); @@ -787,61 +851,100 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); list_del_init(&et->list); - radix_tree_delete(&sbi->extent_tree_root, et->ino); + radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); tree_cnt++; if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; cond_resched(); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); for (; remained > 0; remained--) { - if (list_empty(&sbi->extent_list)) + if (list_empty(&eti->extent_list)) break; - en = list_first_entry(&sbi->extent_list, + en = list_first_entry(&eti->extent_list, struct extent_node, list); et = en->et; if (!write_trylock(&et->lock)) { /* refresh this extent node's position in extent list */ - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); continue; } list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); write_unlock(&et->lock); node_cnt++; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); unlock_out: - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); return node_cnt + tree_cnt; } -unsigned int f2fs_destroy_extent_node(struct inode *inode) +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if 
(!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et || !atomic_read(&et->node_cnt)) @@ -854,31 +957,44 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } -void f2fs_drop_extent_tree(struct inode *inode) +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; - if (!f2fs_may_extent_tree(inode)) + if (!__may_extent_tree(inode, type)) return; write_lock(&et->lock); - set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); - if (et->largest.len) { - et->largest.len = 0; - updated = true; + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } } write_unlock(&et->lock); if (updated) f2fs_mark_inode_dirty_sync(inode, true); } -void f2fs_destroy_extent_tree(struct inode *inode) +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et) @@ -886,76 +1002,49 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - mutex_lock(&sbi->extent_tree_lock); - list_add_tail(&et->list, &sbi->zombie_list); - atomic_inc(&sbi->total_zombie_tree); - mutex_unlock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); return; } /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); + node_cnt = __destroy_extent_node(inode, type); /* delete extent tree entry in radix tree */ - mutex_lock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - mutex_unlock(&sbi->extent_tree_lock); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); - F2FS_I(inode)->extent_tree = NULL; + F2FS_I(inode)->extent_tree[type] = NULL; - trace_f2fs_destroy_extent_tree(inode, node_cnt); + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); } -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - if (!f2fs_may_extent_tree(inode)) - return false; - - return f2fs_lookup_extent_tree(inode, pgofs, ei); -} - -void f2fs_update_extent_cache(struct dnode_of_data *dn) +void f2fs_destroy_extent_tree(struct inode *inode) { - pgoff_t fofs; - block_t blkaddr; - - if 
(!f2fs_may_extent_tree(dn->inode)) - return; - - if (dn->data_blkaddr == NEW_ADDR) - blkaddr = NULL_ADDR; - else - blkaddr = dn->data_blkaddr; - - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); + __destroy_extent_tree(inode, EX_READ); } -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len) - +static void __init_extent_tree_info(struct extent_tree_info *eti) { - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); } void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - mutex_init(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - atomic_set(&sbi->total_ext_tree, 0); - INIT_LIST_HEAD(&sbi->zombie_list); - atomic_set(&sbi->total_zombie_tree, 0); - atomic_set(&sbi->total_ext_node, 0); + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04fdf010bb77..7c68bedee649 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -596,16 +596,22 @@ enum { /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 -#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS -#define RECOVERY_MIN_RA_BLOCKS 1 - -#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ +/* extent cache type */ +enum extent_type { + EX_READ, + NR_EXTENT_CACHES, +}; struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ @@ -621,10 +627,17 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - block_t blk; /* start block address of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int c_len; /* physical extent length of compressed blocks */ + /* physical extent length of compressed blocks */ + unsigned int c_len; #endif + }; + }; }; struct extent_node { @@ -636,13 +649,25 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ - struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root 
extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ }; /* @@ -805,7 +830,8 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct task_struct *atomic_write_task; /* store atomic write task */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ @@ -1626,14 +1652,7 @@ struct f2fs_sb_info { struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ - struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct mutex extent_tree_lock; /* locking extent radix tree */ - struct list_head extent_list; /* lru list for shrinker */ - spinlock_t extent_lock; /* locking extent lru list */ - atomic_t total_ext_tree; /* extent tree count */ - struct list_head zombie_list; /* extent zombie tree list */ - atomic_t total_zombie_tree; /* extent zombie tree count */ - atomic_t total_ext_node; /* extent info count */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1718,10 +1737,14 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic64_t total_hit_ext; /* # of lookup extent cache */ - atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic64_t read_hit_largest; /* # of hit largest extent node */ - atomic64_t read_hit_cached; /* # of hit cached extent node */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -3823,9 +3846,17 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - unsigned long long hit_largest, hit_cached, hit_rbtree; - unsigned long long hit_total, total_ext; - int ext_tree, zombie_tree, ext_node; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, 
ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -3884,10 +3915,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) -#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -4010,10 +4041,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sbi) do { } while (0) -#define stat_inc_rbtree_node_hit(sbi) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) -#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) @@ -4119,20 +4150,23 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); -unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei); -void f2fs_update_extent_cache(struct dnode_of_data *dn); -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); +/* read extent cache ops */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ @@ -4202,9 +4236,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); -void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, 
block_t blkaddr, unsigned int llen, - unsigned int c_len); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); @@ -4285,9 +4319,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) -static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len) { } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ab0a0d3730f6..cbe7c24065c7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -618,7 +618,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) */ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; - f2fs_update_extent_cache_range(dn, fofs, 0, len); + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -1496,7 +1496,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn); } - f2fs_update_extent_cache_range(dn, start, 0, index - start); + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -2558,7 +2558,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2590,7 +2590,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * lookup mapping info in extent cache, skip defragmenting if physical * block addresses are continuous. 
*/ - if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { if (ei.fofs + ei.len >= pg_end) goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d19e26b2e875..f0c6506d8975 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1146,7 +1146,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1164,7 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2c705c60019b..086f201f15a0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -262,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (fi->extent_tree) { - struct extent_info *ei = &fi->extent_tree->largest; + if (fi->extent_tree[EX_READ]) { + struct extent_info *ei = &fi->extent_tree[EX_READ]->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -607,7 +607,7 @@ retry: void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 84b147966080..07419c3e42a5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -86,9 +86,11 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == READ_EXTENT_CACHE) { - mem_size = (atomic_read(&sbi->total_ext_tree) * + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + + mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + - atomic_read(&sbi->total_ext_node) * + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == DISCARD_CACHE) { @@ -859,7 +861,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + 1); - f2fs_update_extent_tree_range_compressed(dn->inode, + f2fs_update_read_extent_tree_range_compressed(dn->inode, index, blkaddr, F2FS_I(dn->inode)->i_cluster_size, c_len); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 51de358bc452..8722d1a13c17 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -450,7 +450,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index dd3c3c7a90ec..33c490e69ae3 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ 
-28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) return count > 0 ? count : 0; } -static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) { - return atomic_read(&sbi->total_zombie_tree) + - atomic_read(&sbi->total_ext_node); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, @@ -53,8 +56,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* count extent cache entries */ - count += __count_extent_cache(sbi); + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -99,8 +102,8 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; - /* shrink extent cache entries */ - freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); /* shrink clean nat cache entries */ if (freed < nr) @@ -130,7 +133,7 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { - f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); -- cgit From 72840cccc0a1a0a0dc1bb27b669a9111be6d0f6a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2022 13:51:09 -0800 Subject: f2fs: allocate the extent_cache by default Let's allocate it to remove the runtime complexity. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 38 +++++++++++++++++++------------------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/inode.c | 6 ++++-- fs/f2fs/namei.c | 4 ++-- 4 files changed, 27 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 654a14ab8977..305f969e3ad1 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -47,20 +47,23 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } -static bool __may_extent_tree(struct inode *inode, enum extent_type type) +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; +} +static bool __may_extent_tree(struct inode *inode, enum extent_type type) +{ /* * for recovered files during mount do not create extents * if shrinker is not registered. */ - if (list_empty(&sbi->s_list)) + if (list_empty(&F2FS_I_SB(inode)->s_list)) return false; - if (type == EX_READ) - return __may_read_extent_tree(inode); - return false; + return __init_may_extent_tree(inode, type); } static void __try_update_largest_extent(struct extent_tree *et, @@ -439,20 +442,18 @@ static void __drop_largest_extent(struct extent_tree *et, } } -/* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, - enum extent_type type) +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree_info *eti = &sbi->extent_tree[type]; - struct f2fs_extent *i_ext = ipage ? 
&F2FS_INODE(ipage)->i_ext : NULL; + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!__may_extent_tree(inode, type)) { + if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ - if (type == EX_READ && i_ext && i_ext->len) { + if (i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); @@ -460,13 +461,11 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, goto out; } - et = __grab_extent_tree(inode, type); + et = __grab_extent_tree(inode, EX_READ); if (!i_ext || !i_ext->len) goto out; - BUG_ON(type != EX_READ); - get_read_extent_info(&ei, i_ext); write_lock(&et->lock); @@ -486,14 +485,15 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, unlock_out: write_unlock(&et->lock); out: - if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + if (!F2FS_I(inode)->extent_tree[EX_READ]) set_inode_flag(inode, FI_NO_EXTENT); } -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ - __f2fs_init_extent_tree(inode, ipage, EX_READ); + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7c68bedee649..ec52e06f8e61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4150,7 +4150,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); +void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); @@ -4159,6 +4159,7 @@ int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); /* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 086f201f15a0..c845c16f97d0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -392,8 +392,6 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, node_page); - get_inline_info(inode, ri); fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
@@ -479,6 +477,10 @@ static int do_read_inode(struct inode *inode) } init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 58a91ce8fe08..46de782c2baa 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -284,8 +284,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, } F2FS_I(inode)->i_inline_xattr_size = xattr_size; - f2fs_init_extent_tree(inode, NULL); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -311,6 +309,8 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_set_inode_flags(inode); + f2fs_init_extent_tree(inode); + trace_f2fs_new_inode(inode, 0); return inode; -- cgit From 71644dff481180ba024ac4f5cb1f068756357adf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2022 17:37:15 -0800 Subject: f2fs: add block_age-based extent cache This patch introduces a runtime hot/cold data separation method for f2fs, in order to improve the accuracy of data temperature classification and reduce the garbage collection overhead after long-term data updates. Enhanced hot/cold data separation records the data block update frequency as the "age" of each extent per inode, and makes use of the age info to indicate a better temperature type for data block allocation: - It records the total data blocks allocated since mount; - When a file extent has been updated, it calculates the count of data blocks allocated since the last update as the age of the extent; - Before a data block is allocated, it searches for the age info and chooses the suitable segment for allocation. Test and result: - Prepare: create about 30000 files * 3% for cold files (with cold file extension like .apk, from 3M to 10M) * 50% for warm files (with random file extension like .FcDxq, from 1K to 4M) * 47% for hot files (with hot file extension like .db, from 1K to 256K) - create(5%)/random update(90%)/delete(5%) the files * total write amount is about 70G * fsync will be called for .db files, and buffered write will be used for other files The storage of the test device is large enough (128G) so that it will not switch to SSR mode during the test.
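To make the aging rule above concrete, here is a minimal userspace sketch. It is an illustration only, not the committed kernel code: the helper name blend_block_age and the plain weighted average built on LAST_AGE_WEIGHT (30 in this patch) are assumptions; the authoritative arithmetic is __calculate_block_age() in the diff below.

#include <stdio.h>

#define LAST_AGE_WEIGHT 30ULL	/* weight kept for the previously recorded age */

/* Hypothetical helper: blend a fresh age sample with the stored one. */
static unsigned long long blend_block_age(unsigned long long new_age,
					  unsigned long long old_age)
{
	return (old_age * LAST_AGE_WEIGHT +
		new_age * (100ULL - LAST_AGE_WEIGHT)) / 100ULL;
}

int main(void)
{
	/* assumed sample values, mirroring the counters the patch introduces */
	unsigned long long allocated_since_mount = 1000000;	/* per-sb allocation counter */
	unsigned long long last_blocks = 900000;	/* snapshot taken at this extent's last update */
	unsigned long long old_age = 50000;	/* age currently recorded in the extent */

	/* blocks allocated since this extent was last touched */
	unsigned long long cur_age = allocated_since_mount - last_blocks;

	printf("blended age = %llu\n", blend_block_age(cur_age, old_age));
	return 0;
}

A small blended age means the extent keeps being rewritten (hot data); a large one means it has survived many allocations untouched (cold data).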
Benefit: dirty segment count increment reduce about 14% - before: Dirty +21110 - after: Dirty +18286 Signed-off-by: qixiaoyu1 Signed-off-by: xiongping1 Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 21 ++++++ fs/f2fs/extent_cache.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/f2fs/f2fs.h | 38 ++++++++++ fs/f2fs/file.c | 1 + fs/f2fs/inode.c | 1 + fs/f2fs/node.c | 10 +-- fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 33 +++++++++ fs/f2fs/shrinker.c | 10 ++- fs/f2fs/super.c | 14 ++++ fs/f2fs/sysfs.c | 24 +++++++ 11 files changed, 329 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a9baa121d829..8f1ef742551f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -88,6 +88,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_largest = atomic64_read(&sbi->read_hit_largest); si->hit_total[EX_READ] += si->hit_largest; + /* block age extent_cache only */ + si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks); + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); @@ -516,6 +519,22 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree[EX_READ], si->zombie_tree[EX_READ], si->ext_node[EX_READ]); + seq_puts(s, "\nExtent Cache (Block Age):\n"); + seq_printf(s, " - Allocated Data Blocks: %llu\n", + si->allocated_data_blocks); + seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n", + si->hit_cached[EX_BLOCK_AGE], + si->hit_rbtree[EX_BLOCK_AGE]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_BLOCK_AGE] ? 0 : + div64_u64(si->hit_total[EX_BLOCK_AGE] * 100, + si->total_ext[EX_BLOCK_AGE]), + si->hit_total[EX_BLOCK_AGE], + si->total_ext[EX_BLOCK_AGE]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_BLOCK_AGE], + si->zombie_tree[EX_BLOCK_AGE], + si->ext_node[EX_BLOCK_AGE]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -586,6 +605,8 @@ static int stat_show(struct seq_file *s, void *v) si->cache_mem >> 10); seq_printf(s, " - read extent cache: %llu KB\n", si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - block age extent cache: %llu KB\n", + si->ext_mem[EX_BLOCK_AGE] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 305f969e3ad1..1bd38a78ebba 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -6,6 +6,10 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim * Chao Yu + * + * block_age-based extent cache added by: + * Copyright (c) 2022 xiaomi Co., Ltd. 
+ * http://www.xiaomi.com/ */ #include @@ -18,6 +22,7 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, block_t blk, bool keep_clen, + unsigned long age, unsigned long last_blocks, enum extent_type type) { ei->fofs = fofs; @@ -30,6 +35,9 @@ static void __set_extent_info(struct extent_info *ei, #ifdef CONFIG_F2FS_FS_COMPRESSION ei->c_len = 0; #endif + } else if (type == EX_BLOCK_AGE) { + ei->age = age; + ei->last_blocks = last_blocks; } } @@ -47,10 +55,27 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static bool __may_age_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return false; + /* don't cache block age info for cold file */ + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { if (type == EX_READ) return __may_read_extent_tree(inode); + else if (type == EX_BLOCK_AGE) + return __may_age_extent_tree(inode); return false; } @@ -90,6 +115,11 @@ static bool __is_extent_mergeable(struct extent_info *back, #endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); + } else if (type == EX_BLOCK_AGE) { + return (back->fofs + back->len == front->fofs && + abs(back->age - front->age) <= SAME_AGE_REGION && + abs(back->last_blocks - front->last_blocks) <= + SAME_AGE_REGION); } return false; } @@ -489,11 +519,22 @@ out: set_inode_flag(inode, FI_NO_EXTENT); } +void f2fs_init_age_extent_tree(struct inode *inode) +{ + if (!__init_may_extent_tree(inode, EX_BLOCK_AGE)) + return; + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ if (__init_may_extent_tree(inode, EX_READ)) __grab_extent_tree(inode, EX_READ); + + /* initialize block age cache */ + if (__init_may_extent_tree(inode, EX_BLOCK_AGE)) + __grab_extent_tree(inode, EX_BLOCK_AGE); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -544,6 +585,8 @@ out: if (type == EX_READ) trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + else if (type == EX_BLOCK_AGE) + trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei); return ret; } @@ -642,6 +685,10 @@ static void __update_extent_tree_range(struct inode *inode, if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); + else if (type == EX_BLOCK_AGE) + trace_f2fs_update_age_extent_tree_range(inode, fofs, len, + tei->age, tei->last_blocks); + write_lock(&et->lock); if (type == EX_READ) { @@ -694,6 +741,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&ei, end, org_end - end, end - dei.fofs + dei.blk, false, + dei.age, dei.last_blocks, type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); @@ -702,6 +750,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), en->ei.blk + (end - dei.fofs), true, + dei.age, dei.last_blocks, type); next_en = en; } @@ -732,11 +781,15 @@ static void __update_extent_tree_range(struct inode *inode, en = next_en; } + if (type == EX_BLOCK_AGE) + goto update_age_extent_cache; + /* 3. 
update extent in read extent cache */ BUG_ON(type != EX_READ); if (tei->blk) { - __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); + __set_extent_info(&ei, fofs, len, tei->blk, false, + 0, 0, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -758,7 +811,17 @@ static void __update_extent_tree_range(struct inode *inode, et->largest_updated = false; updated = true; } + goto out_read_extent_cache; +update_age_extent_cache: + if (!tei->last_blocks) + goto out_read_extent_cache; + __set_extent_info(&ei, fofs, len, 0, false, + tei->age, tei->last_blocks, EX_BLOCK_AGE); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +out_read_extent_cache: write_unlock(&et->lock); if (updated) @@ -796,7 +859,7 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); + __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -807,6 +870,72 @@ unlock_out: } #endif +static unsigned long long __calculate_block_age(unsigned long long new, + unsigned long long old) +{ + unsigned long long diff; + + diff = (new >= old) ? new - (new - old) : new + (old - new); + + return div_u64(diff * LAST_AGE_WEIGHT, 100); +} + +/* This returns a new age and allocated blocks in ei */ +static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t f_size = i_size_read(inode); + unsigned long long cur_blocks = + atomic64_read(&sbi->allocated_data_blocks); + + /* + * When I/O is not aligned to a PAGE_SIZE, update will happen to the last + * file block even in seq write. So don't record age for newly last file + * block here. 
+ */ + if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && + ei->blk == NEW_ADDR) + return -EINVAL; + + if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + unsigned long long cur_age; + + if (cur_blocks >= ei->last_blocks) + cur_age = cur_blocks - ei->last_blocks; + else + /* allocated_data_blocks overflow */ + cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + + if (ei->age) + ei->age = __calculate_block_age(cur_age, ei->age); + else + ei->age = cur_age; + ei->last_blocks = cur_blocks; + WARN_ON(ei->age > cur_blocks); + return 0; + } + + f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + + /* the data block was allocated for the first time */ + if (ei->blk == NEW_ADDR) + goto out; + + if (__is_valid_data_blkaddr(ei->blk) && + !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + f2fs_bug_on(sbi, 1); + return -EINVAL; + } +out: + /* + * init block age with zero, this can happen when the block age extent + * was reclaimed due to memory constraint or system reboot + */ + ei->age = 0; + ei->last_blocks = cur_blocks; + return 0; +} + static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { struct extent_info ei; @@ -823,6 +952,10 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ ei.blk = NULL_ADDR; else ei.blk = dn->data_blkaddr; + } else if (type == EX_BLOCK_AGE) { + ei.blk = dn->data_blkaddr; + if (__get_new_block_age(dn->inode, &ei)) + return; } __update_extent_tree_range(dn->inode, &ei, type); } @@ -940,6 +1073,43 @@ unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrin return __shrink_extent_tree(sbi, nr_shrink, EX_READ); } +/* block age extent cache operations */ +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_BLOCK_AGE)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + }; + + if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE); +} + +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); +} + static unsigned int __destroy_extent_node(struct inode *inode, enum extent_type type) { @@ -960,6 +1130,7 @@ static unsigned int __destroy_extent_node(struct inode *inode, void f2fs_destroy_extent_node(struct inode *inode) { __destroy_extent_node(inode, EX_READ); + __destroy_extent_node(inode, EX_BLOCK_AGE); } static void __drop_extent_tree(struct inode *inode, enum extent_type type) @@ -988,6 +1159,7 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) void f2fs_drop_extent_tree(struct inode *inode) { __drop_extent_tree(inode, EX_READ); + __drop_extent_tree(inode, EX_BLOCK_AGE); } static void __destroy_extent_tree(struct inode *inode, enum extent_type type) @@ -1028,6 +1200,7 @@ static void __destroy_extent_tree(struct inode *inode, enum extent_type type) void f2fs_destroy_extent_tree(struct inode *inode) { __destroy_extent_tree(inode, EX_READ); + __destroy_extent_tree(inode, EX_BLOCK_AGE); } static void __init_extent_tree_info(struct 
extent_tree_info *eti) @@ -1045,6 +1218,12 @@ static void __init_extent_tree_info(struct extent_tree_info *eti) void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { __init_extent_tree_info(&sbi->extent_tree[EX_READ]); + __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]); + + /* initialize for block age extents */ + atomic64_set(&sbi->allocated_data_blocks, 0); + sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; + sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec52e06f8e61..e8953c3dc81a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -107,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -607,9 +608,22 @@ enum { /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 +/* number of age extent info in extent cache we try to shrink */ +#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 +#define LAST_AGE_WEIGHT 30 +#define SAME_AGE_REGION 1024 + +/* + * Define data block with age less than 1GB as hot data + * define data block with age less than 10GB but more than 1GB as warm data + */ +#define DEF_HOT_DATA_AGE_THRESHOLD 262144 +#define DEF_WARM_DATA_AGE_THRESHOLD 2621440 + /* extent cache type */ enum extent_type { EX_READ, + EX_BLOCK_AGE, NR_EXTENT_CACHES, }; @@ -637,6 +651,13 @@ struct extent_info { unsigned int c_len; #endif }; + /* block age extent_cache */ + struct { + /* block age of the extent */ + unsigned long long age; + /* last total blocks allocated */ + unsigned long long last_blocks; + }; }; }; @@ -1653,6 +1674,11 @@ struct f2fs_sb_info { /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + atomic64_t allocated_data_blocks; /* for block age extent_cache */ + + /* The threshold used for hot and warm data seperation*/ + unsigned int hot_data_age_threshold; + unsigned int warm_data_age_threshold; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -3857,6 +3883,8 @@ struct f2fs_stat_info { unsigned long long ext_mem[NR_EXTENT_CACHES]; /* for read extent cache */ unsigned long long hit_largest; + /* for block age extent cache */ + unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -4168,6 +4196,16 @@ void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +/* block age extent cache ops */ +void f2fs_init_age_extent_tree(struct inode *inode); +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_age_extent_cache(struct dnode_of_data *dn); +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len); +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cbe7c24065c7..56c23b5e9d65 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -619,6 +619,7 @@ void f2fs_truncate_data_blocks_range(struct 
dnode_of_data *dn, int count) fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); + f2fs_update_age_extent_cache_range(dn, fofs, nr_free); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c845c16f97d0..ff6cf66ed46b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -480,6 +480,7 @@ static int do_read_inode(struct inode *inode) /* Need all the flag bits */ f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_age_extent_tree(inode); f2fs_put_page(node_page, 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 07419c3e42a5..dde4c0458704 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -60,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) avail_ram = val.totalram - val.totalhigh; /* - * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively */ if (type == FREE_NIDS) { mem_size = (nm_i->nid_cnt[FREE_NID] * @@ -85,14 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == READ_EXTENT_CACHE) { - struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) { + enum extent_type etype = type == READ_EXTENT_CACHE ? + EX_READ : EX_BLOCK_AGE; + struct extent_tree_info *eti = &sbi->extent_tree[etype]; mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; - res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0aa48704c77a..99454d46a939 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ READ_EXTENT_CACHE, /* indicates read extent cache */ + AGE_EXTENT_CACHE, /* indicates age extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8722d1a13c17..dee712f7225f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -453,6 +453,11 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER); + /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); @@ -3151,10 +3156,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) } } +static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_info ei; + + if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { + if (!ei.age) + return NO_CHECK_TYPE; + if (ei.age <= 
sbi->hot_data_age_threshold) + return CURSEG_HOT_DATA; + if (ei.age <= sbi->warm_data_age_threshold) + return CURSEG_WARM_DATA; + return CURSEG_COLD_DATA; + } + return NO_CHECK_TYPE; +} + static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; + int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; @@ -3169,6 +3192,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->page->index); + if (type != NO_CHECK_TYPE) + return type; + if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) @@ -3287,6 +3315,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + if (IS_DATASEG(type)) + atomic64_inc(&sbi->allocated_data_blocks); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { @@ -3414,6 +3445,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 33c490e69ae3..83d6fb97dcae 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -59,6 +59,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* count read extent cache entries */ count += __count_extent_cache(sbi, EX_READ); + /* count block age extent cache entries */ + count += __count_extent_cache(sbi, EX_BLOCK_AGE); + /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -102,8 +105,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; + /* shrink extent cache entries */ + freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2); + /* shrink read extent cache entries */ - freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2); /* shrink clean nat cache entries */ if (freed < nr) @@ -134,6 +140,8 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + f2fs_shrink_age_extent_tree(sbi, + __count_extent_cache(sbi, EX_BLOCK_AGE)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 412c2e7352c0..180d8b804d13 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -163,6 +163,7 @@ enum { Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, + Opt_age_extent_cache, Opt_err, }; @@ -241,6 +242,7 @@ static match_table_t f2fs_tokens = { {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, + {Opt_age_extent_cache, "age_extent_cache"}, {Opt_err, NULL}, }; @@ -1257,6 +1259,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_age_extent_cache: + set_opt(sbi, AGE_EXTENT_CACHE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1958,6 +1963,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) 
seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, AGE_EXTENT_CACHE)) + seq_puts(seq, ",age_extent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); @@ -2219,6 +2226,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2313,6 +2321,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; } + /* disallow enable/disable age extent_cache dynamically */ + if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); + goto restore_opts; + } if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { err = -EINVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a4745d596310..2ab215110596 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -668,6 +668,24 @@ out: return count; } + if (!strcmp(a->attr.name, "hot_data_age_threshold")) { + if (t == 0 || t >= sbi->warm_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "warm_data_age_threshold")) { + if (t == 0 || t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -923,6 +941,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); +/* For block age extent cache */ +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -1018,6 +1040,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(peak_atomic_write), ATTR_LIST(committed_atomic_block), ATTR_LIST(revoked_atomic_block), + ATTR_LIST(hot_data_age_threshold), + ATTR_LIST(warm_data_age_threshold), NULL, }; ATTRIBUTE_GROUPS(f2fs); -- cgit From db8dcd25ec84120d4e57a7f17a566825cec17ae8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Dec 2022 13:42:17 +0000 Subject: f2fs: Fix spelling mistake in label: free_bio_enrty_cache -> free_bio_entry_cache There is a spelling mistake in a label name. Fix it. 
Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 180d8b804d13..c02a717cf880 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4723,7 +4723,7 @@ static int __init init_f2fs_fs(void) goto free_iostat; err = f2fs_init_bioset(); if (err) - goto free_bio_enrty_cache; + goto free_bio_entry_cache; err = f2fs_init_compress_mempool(); if (err) goto free_bioset; @@ -4740,7 +4740,7 @@ free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); -free_bio_enrty_cache: +free_bio_entry_cache: f2fs_destroy_bio_entry_cache(); free_iostat: f2fs_destroy_iostat_processing(); -- cgit From 15e38ee44d50cad264da80ef75626b9224ddc4a3 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 5 Dec 2022 22:56:03 +0800 Subject: f2fs: fix iostat parameter for discard Like the other data we count, iostat uses the number of bytes as its basic unit, but discard has been counted in number of commands. Since each discard command carries its number of blocks, let's switch to the number of bytes as the base unit. Fixes: b0af6d491a6b ("f2fs: add app/fs io stat") Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dee712f7225f..f1845a032885 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1187,7 +1187,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); lstart += len; start += len; -- cgit From 25547439f1dcc3def6062bd3e69165cd806a594e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 2 Dec 2022 12:58:41 +0800 Subject: f2fs: don't call f2fs_issue_discard_timeout() when discard_cmd_cnt is 0 in f2fs_put_super() There is no need to call f2fs_issue_discard_timeout() in f2fs_put_super() when no discard command is pending. Since callers of f2fs_issue_discard_timeout() usually check the number of discard commands before using it, let's move that check into f2fs_issue_discard_timeout() itself. While at it, use f2fs_realtime_discard_enable() to simplify the code. Reported-by: kernel test robot Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- fs/f2fs/super.c | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f1845a032885..a9099a754dd2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1661,6 +1661,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; + if (!atomic_read(&dcc->discard_cmd_cnt)) + return false; + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); @@ -2116,8 +2119,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them.
*/ - if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c02a717cf880..1f812b9ce985 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1581,8 +1581,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_issue_discard_timeout(sbi); - if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && - !sbi->discard_blks && !dropped) { + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -2233,7 +2232,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); - struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2420,10 +2418,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_flush; need_stop_discard = true; } else { - dcc = SM_I(sbi)->dcc_info; f2fs_stop_discard_thread(sbi); - if (atomic_read(&dcc->discard_cmd_cnt)) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); need_restart_discard = true; } } -- cgit From 7411143f2021530d7641fbb40daaada4ee63f7e6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 29 Nov 2022 12:15:23 +0800 Subject: f2fs: fix some format WARNING in debug.c and sysfs.c To fix: WARNING: function definition argument 'struct f2fs_attr *' should also have an identifier name + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); WARNING: return sysfs_emit(...) 
formats should include a terminating newline + return sysfs_emit(buf, "(none)"); WARNING: Prefer 'unsigned int' to bare use of 'unsigned' + unsigned npages = NODE_MAPPING(sbi)->nrpages; WARNING: Missing a blank line after declarations + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; WARNING: quoted string split across lines + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 45 +++++++++++++++++++++++---------------------- fs/f2fs/sysfs.c | 10 +++++----- 2 files changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8f1ef742551f..32af4f0c5735 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -318,18 +318,19 @@ get_cache: si->page_mem = 0; if (sbi->node_inode) { - unsigned npages = NODE_MAPPING(sbi)->nrpages; + unsigned long npages = NODE_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { - unsigned npages = META_MAPPING(sbi)->nrpages; + unsigned long npages = META_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (sbi->compress_inode) { - unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #endif @@ -477,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); - seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " - "Cur time: %4d(ms), Peak time: %4d(ms))\n", - si->nr_queued_ckpt, si->nr_issued_ckpt, - si->nr_total_ckpt, si->cur_ckpt_time, - si->peak_ckpt_time); + seq_puts(s, "CP merge:\n"); + seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt); + seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt); + seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); + seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); + seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); - seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " - "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Mid (%d), " - "Urgent Low (%d)\n", - si->sbi->gc_reclaimed_segs[GC_NORMAL], - si->sbi->gc_reclaimed_segs[GC_IDLE_CB], - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], - si->sbi->gc_reclaimed_segs[GC_IDLE_AT], - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], - si->sbi->gc_reclaimed_segs[GC_URGENT_MID], - si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_puts(s, " - Reclaimed segs :\n"); + seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Idle Greedy : %d\n", + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + seq_printf(s, " - Urgent High : %d\n", + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); 
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -540,11 +541,11 @@ static int stat_show(struct seq_file *s, void *v) si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); - seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " - "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->flush_list_empty, + si->flush_list_empty); + seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2ab215110596..83a366f3ee80 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = { struct f2fs_attr { struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); + ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); + ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, + const char *buf, size_t len); int struct_type; int offset; int id; @@ -232,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sysfs_emit(buf, "(none)"); + return sysfs_emit(buf, "(none)\n"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sysfs_emit(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS -- cgit From 26a8057a1ada97b528b93fdf3ac4fd03170f1900 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Sun, 11 Dec 2022 21:08:41 +0800 Subject: f2fs: reset wait_ms to default if any of the victims have been selected In non-foreground gc mode, if no victim is selected, the gc process will wait for no_gc_sleep_time before waking up again. In this subsequent time, even though a victim will be selected, the gc process still waits for no_gc_sleep_time before waking up. The configuration of wait_ms is not reasonable. After any of the victims have been selected, we need to reset wait_ms to default sleep time from no_gc_sleep_time. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f0c6506d8975..d7a9d84ba57c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -141,6 +141,10 @@ do_gc: /* don't bother wait_ms by foreground gc */ if (!foreground) wait_ms = gc_th->no_gc_sleep_time; + } else { + /* reset wait_ms to default sleep time */ + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->min_sleep_time; } if (foreground) -- cgit From 40306b4d1ba25970dafd53432e8daa5d591ebd99 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Tue, 13 Dec 2022 09:40:32 +0800 Subject: exfat: fix overflow in sector and cluster conversion According to the exFAT specification, there are at most 2^32-11 clusters in a volume. so using 'int' is not enough for cluster index, the return value type of exfat_sector_to_cluster() should be 'unsigned int'. 
Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index a1e7feb22079..bc6d21d7c5ad 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -400,7 +400,7 @@ static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi, sbi->data_start_sector; } -static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi, +static inline unsigned int exfat_sector_to_cluster(struct exfat_sb_info *sbi, sector_t sec) { return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) + -- cgit From 36955d368dc101be885ad2c71618e3c3a93cd8ee Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Nov 2022 11:31:30 +0800 Subject: exfat: reuse exfat_find_location() to simplify exfat_get_dentry_set() In exfat_get_dentry_set(), part of the code is the same as exfat_find_location(), reuse exfat_find_location() to simplify exfat_get_dentry_set(). Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 8121a7e073bc..1dfa67f307f1 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -818,7 +818,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, unsigned int type) { int ret, i, num_bh; - unsigned int off, byte_offset, clu = 0; + unsigned int off; sector_t sec; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_dentry *ep; @@ -831,27 +831,16 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, return -EIO; } - byte_offset = EXFAT_DEN_TO_B(entry); - ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu); + ret = exfat_find_location(sb, p_dir, entry, &sec, &off); if (ret) return ret; memset(es, 0, sizeof(*es)); es->sb = sb; es->modified = false; - - /* byte offset in cluster */ - byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi); - - /* byte offset in sector */ - off = EXFAT_BLK_OFFSET(byte_offset, sb); es->start_off = off; es->bh = es->__bh; - /* sector offset in cluster */ - sec = EXFAT_B_TO_BLK(byte_offset, sb); - sec += exfat_cluster_to_sector(sbi, clu); - bh = sb_bread(sb, sec); if (!bh) return -EIO; @@ -878,6 +867,8 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, for (i = 1; i < num_bh; i++) { /* get the next sector */ if (exfat_is_last_sector_in_cluster(sbi, sec)) { + unsigned int clu = exfat_sector_to_cluster(sbi, sec); + if (p_dir->flags == ALLOC_NO_FAT_CHAIN) clu++; else if (exfat_get_next_cluster(sb, &clu)) -- cgit From d74f4a3f6d88a2416564bc6bf937e423a4ae8f8e Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 14 Dec 2022 10:39:11 +0800 Subject: cifs: Remove duplicated include in cifsglob.h ./fs/cifs/cifsglob.h: linux/scatterlist.h is included more than once. 
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3459 Fixes: f7f291e14dde ("cifs: fix oops during encryption") Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 703685e2db5e..82f2d3070c26 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -23,7 +23,6 @@ #include "cifs_fs_sb.h" #include "cifsacl.h" #include <crypto/internal/hash.h> -#include <linux/scatterlist.h> #include <uapi/linux/cifs/cifs_mount.h> #include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" -- cgit