Diffstat (limited to 'fs'): 797 files changed, 37332 insertions, 27288 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 4dac4a0dc5f4..c397c51f80d9 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -17,34 +17,64 @@ #include "v9fs_vfs.h" #include "fid.h" -static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) +static struct posix_acl *v9fs_fid_get_acl(struct p9_fid *fid, const char *name) { ssize_t size; void *value = NULL; struct posix_acl *acl = NULL; size = v9fs_fid_xattr_get(fid, name, NULL, 0); - if (size > 0) { - value = kzalloc(size, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - size = v9fs_fid_xattr_get(fid, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - goto err_out; - } - } else if (size == -ENODATA || size == 0 || - size == -ENOSYS || size == -EOPNOTSUPP) { - acl = NULL; - } else - acl = ERR_PTR(-EIO); - -err_out: + if (size < 0) + return ERR_PTR(size); + if (size == 0) + return ERR_PTR(-ENODATA); + + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + + size = v9fs_fid_xattr_get(fid, name, value, size); + if (size < 0) + acl = ERR_PTR(size); + else if (size == 0) + acl = ERR_PTR(-ENODATA); + else + acl = posix_acl_from_xattr(&init_user_ns, value, size); kfree(value); return acl; } +static struct posix_acl *v9fs_acl_get(struct dentry *dentry, const char *name) +{ + struct p9_fid *fid; + struct posix_acl *acl = NULL; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return ERR_CAST(fid); + + acl = v9fs_fid_get_acl(fid, name); + p9_fid_put(fid); + return acl; +} + +static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, const char *name) +{ + int retval; + struct posix_acl *acl = NULL; + + acl = v9fs_fid_get_acl(fid, name); + if (!IS_ERR(acl)) + return acl; + + retval = PTR_ERR(acl); + if (retval == -ENODATA || retval == -ENOSYS || retval == -EOPNOTSUPP) + return NULL; + + /* map everything else to -EIO */ + return ERR_PTR(-EIO); +} + int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { int retval = 0; @@ -89,7 +119,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) +struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu) { struct v9fs_session_info *v9ses; @@ -109,6 +139,112 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) } +struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + struct v9fs_session_info *v9ses; + + v9ses = v9fs_dentry2v9ses(dentry); + /* We allow set/get/list of acl when access=client is not specified. */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_acl_get(dentry, posix_acl_xattr_name(type)); + return v9fs_get_cached_acl(d_inode(dentry), type); +} + +int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + int retval; + size_t size = 0; + void *value = NULL; + const char *acl_name; + struct v9fs_session_info *v9ses; + struct inode *inode = d_inode(dentry); + + if (acl) { + retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); + if (retval) + goto err_out; + + size = posix_acl_xattr_size(acl->a_count); + + value = kzalloc(size, GFP_NOFS); + if (!value) { + retval = -ENOMEM; + goto err_out; + } + + retval = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (retval < 0) + goto err_out; + } + + /* + * set the attribute on the remote. Without even looking at the + * xattr value. 
We leave it to the server to validate + */ + acl_name = posix_acl_xattr_name(type); + v9ses = v9fs_dentry2v9ses(dentry); + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); + goto err_out; + } + + if (S_ISLNK(inode->i_mode)) { + retval = -EOPNOTSUPP; + goto err_out; + } + + if (!inode_owner_or_capable(&init_user_ns, inode)) { + retval = -EPERM; + goto err_out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + if (acl) { + struct iattr iattr = {}; + struct posix_acl *acl_mode = acl; + + retval = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, + &acl_mode); + if (retval) + goto err_out; + if (!acl_mode) { + /* + * ACL can be represented by the mode bits. + * So don't update ACL below. + */ + kfree(value); + value = NULL; + size = 0; + } + iattr.ia_valid = ATTR_MODE; + /* + * FIXME should we update ctime ? + * What is the following setxattr update the mode ? + */ + v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + retval = acl ? -EINVAL : 0; + goto err_out; + } + break; + } + + retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); + if (!retval) + set_cached_acl(inode, type, acl); + +err_out: + kfree(value); + return retval; +} + static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) { int retval; @@ -207,124 +343,3 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, *modep = mode; return 0; } - -static int v9fs_xattr_get_acl(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - struct v9fs_session_info *v9ses; - struct posix_acl *acl; - int error; - - v9ses = v9fs_dentry2v9ses(dentry); - /* - * We allow set/get/list of acl when access=client is not specified - */ - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_get(dentry, handler->name, buffer, size); - - acl = v9fs_get_cached_acl(inode, handler->flags); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int v9fs_xattr_set_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - int retval; - struct posix_acl *acl; - struct v9fs_session_info *v9ses; - - v9ses = v9fs_dentry2v9ses(dentry); - /* - * set the attribute on the remote. Without even looking at the - * xattr value. We leave it to the server to validate - */ - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_set(dentry, handler->name, value, size, - flags); - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - if (value) { - /* update the cached acl value */ - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); - if (retval) - goto err_out; - } - } else - acl = NULL; - - switch (handler->flags) { - case ACL_TYPE_ACCESS: - if (acl) { - struct iattr iattr = { 0 }; - struct posix_acl *old_acl = acl; - - retval = posix_acl_update_mode(&init_user_ns, inode, - &iattr.ia_mode, &acl); - if (retval) - goto err_out; - if (!acl) { - /* - * ACL can be represented - * by the mode bits. 
So don't - * update ACL. - */ - posix_acl_release(old_acl); - value = NULL; - size = 0; - } - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ? - * What is the following setxattr update the - * mode ? - */ - v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); - } - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - retval = acl ? -EINVAL : 0; - goto err_out; - } - break; - default: - BUG(); - } - retval = v9fs_xattr_set(dentry, handler->name, value, size, flags); - if (!retval) - set_cached_acl(inode, handler->flags, acl); -err_out: - posix_acl_release(acl); - return retval; -} - -const struct xattr_handler v9fs_xattr_acl_access_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = v9fs_xattr_get_acl, - .set = v9fs_xattr_set_acl, -}; - -const struct xattr_handler v9fs_xattr_acl_default_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = v9fs_xattr_get_acl, - .set = v9fs_xattr_set_acl, -}; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index ce5175d463dd..4c60a2bce5de 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -8,8 +8,12 @@ #ifdef CONFIG_9P_FS_POSIX_ACL int v9fs_get_acl(struct inode *inode, struct p9_fid *fid); -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, +struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type); +int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, struct posix_acl *dacl, struct posix_acl *acl); @@ -17,7 +21,9 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); #else +#define v9fs_iop_get_inode_acl NULL #define v9fs_iop_get_acl NULL +#define v9fs_iop_set_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { return 0; diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 47b9a1122f34..a19891015f19 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -40,7 +40,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) size_t len = subreq->len - subreq->transferred; int total, err; - iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len); + iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len); total = p9_client_read(fid, pos, &to, &err); @@ -172,7 +172,7 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) len = min_t(loff_t, i_size - start, len); - iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len); + iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 000fbaae9b18..3bb95adc9619 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -109,7 +109,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) struct iov_iter to; int n; - iov_iter_kvec(&to, READ, &kvec, 1, buflen); + iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); if (err) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5cfa4b4f070f..03c1743c4aff 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ 
-983,14 +983,18 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, + .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, + .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, + .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, + .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 1f9298a4bd42..b6984311e00a 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/uio.h> +#include <linux/posix_acl_xattr.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -24,7 +25,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, struct iov_iter to; int err; - iov_iter_kvec(&to, READ, &kvec, 1, buffer_size); + iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buffer_size); attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { @@ -109,7 +110,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, struct iov_iter from; int retval, err; - iov_iter_kvec(&from, WRITE, &kvec, 1, value_len); + iov_iter_kvec(&from, ITER_SOURCE, &kvec, 1, value_len); p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); @@ -182,9 +183,9 @@ static struct xattr_handler v9fs_xattr_security_handler = { const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, -#ifdef CONFIG_9P_FS_POSIX_ACL - &v9fs_xattr_acl_access_handler, - &v9fs_xattr_acl_default_handler, +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_9P_FS_SECURITY &v9fs_xattr_security_handler, diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index 3e11fc3331eb..b5636e544c8a 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -11,8 +11,6 @@ #include <net/9p/client.h> extern const struct xattr_handler *v9fs_xattr_handlers[]; -extern const struct xattr_handler v9fs_xattr_acl_access_handler; -extern const struct xattr_handler v9fs_xattr_acl_default_handler; ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, void *buffer, size_t buffer_size); diff --git a/fs/Kconfig b/fs/Kconfig index a547307c1ae8..2685a4d0d353 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -235,6 +235,7 @@ config ARCH_SUPPORTS_HUGETLBFS config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN + depends on (SYSFS || SYSCTL) help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. 
For architectures that support it, say Y here and read diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 21e154516bf2..93539aac0e5b 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on ARM || ((M68K || SUPERH) && !MMU) + depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load @@ -142,39 +142,6 @@ config BINFMT_ZFLAT help Support FLAT format compressed binaries -config HAVE_AOUT - def_bool n - -config BINFMT_AOUT - tristate "Kernel support for a.out and ECOFF binaries" - depends on HAVE_AOUT - help - A.out (Assembler.OUTput) is a set of formats for libraries and - executables used in the earliest versions of UNIX. Linux used - the a.out formats QMAGIC and ZMAGIC until they were replaced - with the ELF format. - - The conversion to ELF started in 1995. This option is primarily - provided for historical interest and for the benefit of those - who need to run binaries from that era. - - Most people should answer N here. If you think you may have - occasional use for this format, enable module support above - and answer M here to compile this support as a module called - binfmt_aout. - - If any crucial components of your system (such as /sbin/init - or /lib/ld.so) are still in a.out format, you will have to - say Y here. - -config OSF4_COMPAT - bool "OSF/1 v4 readv/writev compatibility" - depends on ALPHA && BINFMT_AOUT - help - Say Y if you are using OSF/1 binaries (like Netscape and Acrobat) - with v4 shared libraries freely available from Compaq. If you're - going to use shared libraries from Tru64 version 5.0 or later, say N. 
- config BINFMT_MISC tristate "Kernel support for MISC binaries" help diff --git a/fs/Makefile b/fs/Makefile index 93b80529f8e8..4dea17840761 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o diff --git a/fs/affs/super.c b/fs/affs/super.c index 4c5f30a83336..58b391446ae1 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -276,7 +276,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, char *vol = match_strdup(&args[0]); if (!vol) return 0; - strlcpy(volume, vol, 32); + strscpy(volume, vol, 32); kfree(vol); break; } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 0a090d614e76..7dcd59693a0c 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -298,7 +298,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (call->count2 != call->count && call->count2 != 0) return afs_protocol_error(call, afs_eproto_cb_count); call->iter = &call->def_iter; - iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4); + iov_iter_discard(&call->def_iter, ITER_DEST, call->count2 * 3 * 4); call->unmarshall++; fallthrough; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 56ae5cd5184f..104df2964225 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -24,9 +24,9 @@ static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_iput(struct dentry *dentry, struct inode *inode); -static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, +static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, +static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); @@ -305,7 +305,7 @@ expand: req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages, + iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages, 0, i_size); req->iter = &req->def_iter; @@ -568,7 +568,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, +static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype) { struct afs_lookup_one_cookie *cookie = @@ -584,16 +584,16 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, if (cookie->name.len != nlen || memcmp(cookie->name.name, name, nlen) != 0) { - _leave(" = 0 [no]"); - return 0; + _leave(" = true [keep looking]"); + return true; } cookie->fid.vnode = ino; cookie->fid.unique = dtype; cookie->found = 1; - 
_leave(" = -1 [found]"); - return -1; + _leave(" = false [found]"); + return false; } /* @@ -636,12 +636,11 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, +static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype) { struct afs_lookup_cookie *cookie = container_of(ctx, struct afs_lookup_cookie, ctx); - int ret; _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, @@ -663,12 +662,10 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name, cookie->fids[1].unique = dtype; cookie->found = 1; if (cookie->one_only) - return -1; + return false; } - ret = cookie->nr_fids >= 50 ? -1 : 0; - _leave(" = %d", ret); - return ret; + return cookie->nr_fids < 50; } /* diff --git a/fs/afs/file.c b/fs/afs/file.c index d1cfb235c4b9..2eeab57df133 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -324,7 +324,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, + iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &fsreq->vnode->netfs.inode.i_mapping->i_pages, fsreq->pos, fsreq->len); @@ -346,7 +346,7 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) fsreq->len = folio_size(folio); fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages, + iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages, fsreq->pos, fsreq->len); ret = afs_fetch_data(fsreq->vnode, fsreq); diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index c0031a3ab42f..3ac5fcf98d0d 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -167,8 +167,8 @@ responded: clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); } - if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && - rtt_us < server->probe.rtt) { + rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); + if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 723d162078a3..9ba7b68375c9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1301,7 +1301,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; - iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size); + iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) @@ -1319,7 +1319,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call) static inline void afs_extract_discard(struct afs_call *call, size_t size) { call->iov_len = size; - iov_iter_discard(&call->def_iter, READ, size); + iov_iter_discard(&call->def_iter, ITER_DEST, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index eccc3cd0cb70..c62939e5ea1f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -359,7 +359,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 
1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0); @@ -400,7 +400,7 @@ error_do_abort: RX_USER_ABORT, ret, "KSD"); } else { len = 0; - iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); @@ -485,7 +485,7 @@ static void afs_deliver_to_call(struct afs_call *call) ) { if (state == AFS_CALL_SV_AWAIT_ACK) { len = 0; - iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); + iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, &len, false, &remote_abort, @@ -822,7 +822,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -862,7 +862,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/fs/afs/server.c b/fs/afs/server.c index 4981baf97835..b5237206eac3 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -406,7 +406,7 @@ void afs_put_server(struct afs_net *net, struct afs_server *server, if (!server) return; - a = atomic_inc_return(&server->active); + a = atomic_read(&server->active); zero = __refcount_dec_and_test(&server->ref, &r); trace_afs_server(debug_id, r - 1, a, reason); if (unlikely(zero)) diff --git a/fs/afs/write.c b/fs/afs/write.c index 9ebdd36eaf2f..08fd456dde67 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -609,7 +609,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, */ afs_write_to_cache(vnode, start, len, i_size, caching); - iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); ret = afs_store_data(vnode, &iter, start, false); } else { _debug("write discard %x @%llx [%llx]", len, start, i_size); @@ -1000,7 +1000,7 @@ int afs_launder_folio(struct folio *folio) bv[0].bv_page = &folio->page; bv[0].bv_offset = f; bv[0].bv_len = t - f; - iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); + iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len); trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); @@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx) local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { - int old, avail = atomic_read(&ctx->reqs_available); + int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; - - old = avail; - avail = atomic_cmpxchg(&ctx->reqs_available, - avail, avail - ctx->req_batch); - } while (avail != old); + } while (!atomic_try_cmpxchg(&ctx->reqs_available, + &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } @@ -1555,7 +1552,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, if (unlikely(!file->f_op->read_iter)) return -EINVAL; - ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); + ret = 
aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); @@ -1583,7 +1580,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, if (unlikely(!file->f_op->write_iter)) return -EINVAL; - ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); + ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e0c3e33c4177..24192a7667ed 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -32,7 +32,7 @@ static struct inode *anon_inode_inode; */ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + return dynamic_dname(buffer, buflen, "anon_inode:%s", dentry->d_name.name); } diff --git a/fs/attr.c b/fs/attr.c index 1552a5f23d6b..b45f30e516fa 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,6 +18,70 @@ #include <linux/evm.h> #include <linux/ima.h> +#include "internal.h" + +/** + * setattr_should_drop_sgid - determine whether the setgid bit needs to be + * removed + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * + * This function determines whether the setgid bit needs to be removed. + * We retain backwards compatibility and require setgid bit to be removed + * unconditionally if S_IXGRP is set. Otherwise we have the exact same + * requirements as setattr_prepare() and setattr_copy(). + * + * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. + */ +int setattr_should_drop_sgid(struct user_namespace *mnt_userns, + const struct inode *inode) +{ + umode_t mode = inode->i_mode; + + if (!(mode & S_ISGID)) + return 0; + if (mode & S_IXGRP) + return ATTR_KILL_SGID; + if (!in_group_or_capable(mnt_userns, inode, + i_gid_into_vfsgid(mnt_userns, inode))) + return ATTR_KILL_SGID; + return 0; +} + +/** + * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to + * be dropped + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * + * This function determines whether the set{g,u}id bits need to be removed. + * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the + * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both + * set{g,u}id bits need to be removed the corresponding mask of both flags is + * returned. + * + * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits + * to remove, 0 otherwise. + */ +int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, + struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + kill |= setattr_should_drop_sgid(mnt_userns, inode); + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; +} +EXPORT_SYMBOL(setattr_should_drop_suidgid); + /** * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from @@ -140,8 +204,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* Also check the setgid bit! 
*/ - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_or_capable(mnt_userns, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } @@ -251,9 +314,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_or_capable(mnt_userns, inode, + i_gid_into_vfsgid(mnt_userns, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } @@ -375,7 +437,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, } } if (ia_valid & ATTR_KILL_SGID) { - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 12b8fdcc445b..92737166203f 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -147,14 +147,14 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, } static int bad_inode_tmpfile(struct user_namespace *mnt_userns, - struct inode *inode, struct dentry *dentry, + struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, + struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; @@ -177,7 +177,7 @@ static const struct inode_operations bad_inode_ops = .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, - .get_acl = bad_inode_get_acl, + .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c deleted file mode 100644 index 0dcfc691e7e2..000000000000 --- a/fs/binfmt_aout.c +++ /dev/null @@ -1,342 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/binfmt_aout.c - * - * Copyright (C) 1991, 1992, 1996 Linus Torvalds - */ - -#include <linux/module.h> - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/a.out.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/binfmts.h> -#include <linux/personality.h> -#include <linux/init.h> -#include <linux/coredump.h> -#include <linux/slab.h> -#include <linux/sched/task_stack.h> - -#include <linux/uaccess.h> -#include <asm/cacheflush.h> - -static int load_aout_binary(struct linux_binprm *); -static int load_aout_library(struct file*); - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, -}; - -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) - -static int set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end > start) - return vm_brk(start, end - start); - return 0; -} - -/* - * create_aout_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. 
- */ -static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm) -{ - char __user * __user *argv; - char __user * __user *envp; - unsigned long __user *sp; - int argc = bprm->argc; - int envc = bprm->envc; - - sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); -#ifdef __alpha__ -/* whee.. test-programs are so much fun. */ - put_user(0, --sp); - put_user(0, --sp); - if (bprm->loader) { - put_user(0, --sp); - put_user(1003, --sp); - put_user(bprm->loader, --sp); - put_user(1002, --sp); - } - put_user(bprm->exec, --sp); - put_user(1001, --sp); -#endif - sp -= envc+1; - envp = (char __user * __user *) sp; - sp -= argc+1; - argv = (char __user * __user *) sp; -#ifndef __alpha__ - put_user((unsigned long) envp,--sp); - put_user((unsigned long) argv,--sp); -#endif - put_user(argc,--sp); - current->mm->arg_start = (unsigned long) p; - while (argc-->0) { - char c; - put_user(p,argv++); - do { - get_user(c,p++); - } while (c); - } - put_user(NULL,argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { - char c; - put_user(p,envp++); - do { - get_user(c,p++); - } while (c); - } - put_user(NULL,envp); - current->mm->env_end = (unsigned long) p; - return sp; -} - -/* - * These are the functions used to load a.out style executables and shared - * libraries. There is no binary dependent code anywhere else. - */ - -static int load_aout_binary(struct linux_binprm * bprm) -{ - struct pt_regs *regs = current_pt_regs(); - struct exec ex; - unsigned long error; - unsigned long fd_offset; - unsigned long rlim; - int retval; - - ex = *((struct exec *) bprm->buf); /* exec-header */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && - N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || - N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - return -ENOEXEC; - } - - /* - * Requires a mmap handler. This prevents people from using a.out - * as part of an exploit attack against /proc-related vulnerabilities. - */ - if (!bprm->file->f_op->mmap) - return -ENOEXEC; - - fd_offset = N_TXTOFF(ex); - - /* Check initial limits. This avoids letting people circumvent - * size limits imposed on them by creating programs with large - * arrays in the data or bss. 
- */ - rlim = rlimit(RLIMIT_DATA); - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (ex.a_data + ex.a_bss > rlim) - return -ENOMEM; - - /* Flush all traces of the currently running executable */ - retval = begin_new_exec(bprm); - if (retval) - return retval; - - /* OK, This is the point of no return */ -#ifdef __alpha__ - SET_AOUT_PERSONALITY(bprm, ex); -#else - set_personality(PER_LINUX); -#endif - setup_new_exec(bprm); - - current->mm->end_code = ex.a_text + - (current->mm->start_code = N_TXTADDR(ex)); - current->mm->end_data = ex.a_data + - (current->mm->start_data = N_DATADDR(ex)); - current->mm->brk = ex.a_bss + - (current->mm->start_brk = N_BSSADDR(ex)); - - retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) - return retval; - - - if (N_MAGIC(ex) == OMAGIC) { - unsigned long text_addr, map_size; - loff_t pos; - - text_addr = N_TXTADDR(ex); - -#ifdef __alpha__ - pos = fd_offset; - map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; -#else - pos = 32; - map_size = ex.a_text+ex.a_data; -#endif - error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error) - return error; - - error = read_code(bprm->file, text_addr, pos, - ex.a_text+ex.a_data); - if ((signed long)error < 0) - return error; - } else { - if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) - { - printk(KERN_NOTICE "executable not page aligned\n"); - } - - if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) - { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert program: %pD\n", - bprm->file); - } - - if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { - error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (error) - return error; - - read_code(bprm->file, N_TXTADDR(ex), fd_offset, - ex.a_text + ex.a_data); - goto beyond_if; - } - - error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, - fd_offset); - - if (error != N_TXTADDR(ex)) - return error; - - error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE, - fd_offset + ex.a_text); - if (error != N_DATADDR(ex)) - return error; - } -beyond_if: - set_binfmt(&aout_format); - - retval = set_brk(current->mm->start_brk, current->mm->brk); - if (retval < 0) - return retval; - - current->mm->start_stack = - (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); -#ifdef __alpha__ - regs->gp = ex.a_gpvalue; -#endif - finalize_exec(bprm); - start_thread(regs, ex.a_entry, current->mm->start_stack); - return 0; -} - -static int load_aout_library(struct file *file) -{ - struct inode * inode; - unsigned long bss, start_addr, len; - unsigned long error; - int retval; - struct exec ex; - loff_t pos = 0; - - inode = file_inode(file); - - retval = -ENOEXEC; - error = kernel_read(file, &ex, sizeof(ex), &pos); - if (error != sizeof(ex)) - goto out; - - /* We come in here for the regular a.out style of shared libraries */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || - N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - goto out; - } - - /* - * Requires a mmap handler. This prevents people from using a.out - * as part of an exploit attack against /proc-related vulnerabilities. - */ - if (!file->f_op->mmap) - goto out; - - if (N_FLAGS(ex)) - goto out; - - /* For QMAGIC, the starting address is 0x20 into the page. 
We mask - this off to get the starting address for the page */ - - start_addr = ex.a_entry & 0xfffff000; - - if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - if (printk_ratelimit()) - { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert library: %pD\n", - file); - } - retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (retval) - goto out; - - read_code(file, start_addr, N_TXTOFF(ex), - ex.a_text + ex.a_data); - retval = 0; - goto out; - } - /* Now use mmap to map the library into memory. */ - error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE, - N_TXTOFF(ex)); - retval = error; - if (error != start_addr) - goto out; - - len = PAGE_ALIGN(ex.a_text + ex.a_data); - bss = ex.a_text + ex.a_data + ex.a_bss; - if (bss > len) { - retval = vm_brk(start_addr + len, bss - len); - if (retval) - goto out; - } - retval = 0; -out: - return retval; -} - -static int __init init_aout_binfmt(void) -{ - register_binfmt(&aout_format); - return 0; -} - -static void __exit exit_aout_binfmt(void) -{ - unregister_binfmt(&aout_format); -} - -core_initcall(init_aout_binfmt); -module_exit(exit_aout_binfmt); -MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 63c7ebb0da89..de63572a9404 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -248,7 +248,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, } while (0) #ifdef ARCH_DLINFO - /* + /* * ARCH_DLINFO must come first so PPC can do its special alignment of * AUXV. * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in @@ -456,13 +456,13 @@ static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) * * Loads ELF program headers from the binary file elf_file, which has the ELF * header pointed to by elf_ex, into a newly allocated array. The caller is - * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. + * responsible for freeing the allocated data. Returns NULL upon failure. */ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; - int retval, err = -1; + int retval = -1; unsigned int size; /* @@ -484,15 +484,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, /* Read in the program headers */ retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff); - if (retval < 0) { - err = retval; - goto out; - } - /* Success! */ - err = 0; out: - if (err) { + if (retval) { kfree(elf_phdata); elf_phdata = NULL; } @@ -911,7 +905,7 @@ static int load_elf_binary(struct linux_binprm *bprm) interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL); if (!interp_elf_ex) { retval = -ENOMEM; - goto out_free_ph; + goto out_free_file; } /* Get the exec headers */ @@ -1020,7 +1014,7 @@ out_free_interp: executable_stack); if (retval < 0) goto out_free_dentry; - + elf_bss = 0; elf_brk = 0; @@ -1043,7 +1037,7 @@ out_free_interp: if (unlikely (elf_brk > elf_bss)) { unsigned long nbyte; - + /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. Map anonymous pages, if needed, and clear the area. */ @@ -1166,7 +1160,7 @@ out_free_interp: error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { - retval = IS_ERR((void *)error) ? + retval = IS_ERR_VALUE(error) ? 
PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } @@ -1251,7 +1245,7 @@ out_free_interp: interpreter, load_bias, interp_elf_phdata, &arch_state); - if (!IS_ERR((void *)elf_entry)) { + if (!IS_ERR_VALUE(elf_entry)) { /* * load_elf_interp() returns relocation * adjustment @@ -1260,7 +1254,7 @@ out_free_interp: elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { - retval = IS_ERR((void *)elf_entry) ? + retval = IS_ERR_VALUE(elf_entry) ? (int)elf_entry : -EINVAL; goto out_free_dentry; } @@ -1354,6 +1348,7 @@ out: out_free_dentry: kfree(interp_elf_ex); kfree(interp_elf_phdata); +out_free_file: allow_write_access(interpreter); if (interpreter) fput(interpreter); @@ -1520,7 +1515,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_align = 0; } -static void fill_note(struct memelfnote *note, const char *name, int type, +static void fill_note(struct memelfnote *note, const char *name, int type, unsigned int sz, void *data) { note->name = name; @@ -1723,7 +1718,6 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm return 0; } -#ifdef CORE_DUMP_USE_REGSET #include <linux/regset.h> struct elf_thread_core_info { @@ -1744,6 +1738,7 @@ struct elf_note_info { int thread_notes; }; +#ifdef CORE_DUMP_USE_REGSET /* * When a regset has a writeback hook, we call it on each thread before * dumping user memory. On register window machines, this makes sure the @@ -1823,34 +1818,58 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, return 1; } +#else +static int fill_thread_core_info(struct elf_thread_core_info *t, + const struct user_regset_view *view, + long signr, struct elf_note_info *info) +{ + struct task_struct *p = t->task; + elf_fpregset_t *fpu; + + fill_prstatus(&t->prstatus.common, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &(t->prstatus)); + info->size += notesize(&t->notes[0]); + + fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL); + if (!fpu || !elf_core_copy_task_fpregs(p, fpu)) { + kfree(fpu); + return 1; + } + + t->prstatus.pr_fpvalid = 1; + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + info->size += notesize(&t->notes[1]); + + return 1; +} +#endif static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, struct coredump_params *cprm) { struct task_struct *dump_task = current; - const struct user_regset_view *view = task_user_regset_view(dump_task); + const struct user_regset_view *view; struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; - unsigned int i; - - info->size = 0; - info->thread = NULL; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); - if (psinfo == NULL) { - info->psinfo.data = NULL; /* So we don't free this wrongly */ + if (!psinfo) return 0; - } - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); +#ifdef CORE_DUMP_USE_REGSET + view = task_user_regset_view(dump_task); + /* * Figure out how many notes we're going to need for each thread. */ info->thread_notes = 0; - for (i = 0; i < view->n; ++i) + for (int i = 0; i < view->n; ++i) if (view->regsets[i].core_note_type != 0) ++info->thread_notes; @@ -1869,11 +1888,23 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, */ fill_elf_header(elf, phdrs, view->e_machine, view->e_flags); +#else + view = NULL; + info->thread_notes = 2; + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); +#endif /* * Allocate a structure for each thread. 
*/ - for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) { + info->thread = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!info->thread)) + return 0; + + info->thread->task = dump_task; + for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) { t = kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), GFP_KERNEL); @@ -1881,17 +1912,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; t->task = ct->task; - if (ct->task == dump_task || !info->thread) { - t->next = info->thread; - info->thread = t; - } else { - /* - * Make sure to keep the original task at - * the head of the list. - */ - t->next = info->thread->next; - info->thread->next = t; - } + t->next = info->thread->next; + info->thread->next = t; } /* @@ -1919,11 +1941,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 1; } -static size_t get_note_info_size(struct elf_note_info *info) -{ - return info->size; -} - /* * Write all the notes for each thread. When writing the first thread, the * process-wide notes are interleaved after the first thread-specific note. @@ -1978,197 +1995,6 @@ static void free_note_info(struct elf_note_info *info) kvfree(info->files.data); } -#else - -/* Here is the structure in which status of each thread is captured. */ -struct elf_thread_status -{ - struct list_head list; - struct elf_prstatus prstatus; /* NT_PRSTATUS */ - elf_fpregset_t fpu; /* NT_PRFPREG */ - struct task_struct *thread; - struct memelfnote notes[3]; - int num_notes; -}; - -/* - * In order to add the specific thread information for the elf file format, - * we need to keep a linked list of every threads pr_status and then create - * a single section for them in the final core file. 
- */ -static int elf_dump_thread_status(long signr, struct elf_thread_status *t) -{ - int sz = 0; - struct task_struct *p = t->thread; - t->num_notes = 0; - - fill_prstatus(&t->prstatus.common, p, signr); - elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), - &(t->prstatus)); - t->num_notes++; - sz += notesize(&t->notes[0]); - - if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, - &t->fpu))) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), - &(t->fpu)); - t->num_notes++; - sz += notesize(&t->notes[1]); - } - return sz; -} - -struct elf_note_info { - struct memelfnote *notes; - struct memelfnote *notes_files; - struct elf_prstatus *prstatus; /* NT_PRSTATUS */ - struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */ - struct list_head thread_list; - elf_fpregset_t *fpu; - user_siginfo_t csigdata; - int thread_status_size; - int numnote; -}; - -static int elf_note_info_init(struct elf_note_info *info) -{ - memset(info, 0, sizeof(*info)); - INIT_LIST_HEAD(&info->thread_list); - - /* Allocate space for ELF notes */ - info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL); - if (!info->notes) - return 0; - info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); - if (!info->psinfo) - return 0; - info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); - if (!info->prstatus) - return 0; - info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); - if (!info->fpu) - return 0; - return 1; -} - -static int fill_note_info(struct elfhdr *elf, int phdrs, - struct elf_note_info *info, - struct coredump_params *cprm) -{ - struct core_thread *ct; - struct elf_thread_status *ets; - - if (!elf_note_info_init(info)) - return 0; - - for (ct = current->signal->core_state->dumper.next; - ct; ct = ct->next) { - ets = kzalloc(sizeof(*ets), GFP_KERNEL); - if (!ets) - return 0; - - ets->thread = ct->task; - list_add(&ets->list, &info->thread_list); - } - - list_for_each_entry(ets, &info->thread_list, list) { - int sz; - - sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); - info->thread_status_size += sz; - } - /* now collect the dump for the current */ - memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); - - /* Set up header */ - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); - - /* - * Set up the notes in similar form to SVR4 core dumps made - * with info from their /proc. - */ - - fill_note(info->notes + 0, "CORE", NT_PRSTATUS, - sizeof(*info->prstatus), info->prstatus); - fill_psinfo(info->psinfo, current->group_leader, current->mm); - fill_note(info->notes + 1, "CORE", NT_PRPSINFO, - sizeof(*info->psinfo), info->psinfo); - - fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); - fill_auxv_note(info->notes + 3, current->mm); - info->numnote = 4; - - if (fill_files_note(info->notes + info->numnote, cprm) == 0) { - info->notes_files = info->notes + info->numnote; - info->numnote++; - } - - /* Try to dump the FPU. 
*/ - info->prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); - if (info->prstatus->pr_fpvalid) - fill_note(info->notes + info->numnote++, - "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); - return 1; -} - -static size_t get_note_info_size(struct elf_note_info *info) -{ - int sz = 0; - int i; - - for (i = 0; i < info->numnote; i++) - sz += notesize(info->notes + i); - - sz += info->thread_status_size; - - return sz; -} - -static int write_note_info(struct elf_note_info *info, - struct coredump_params *cprm) -{ - struct elf_thread_status *ets; - int i; - - for (i = 0; i < info->numnote; i++) - if (!writenote(info->notes + i, cprm)) - return 0; - - /* write out the thread status notes section */ - list_for_each_entry(ets, &info->thread_list, list) { - for (i = 0; i < ets->num_notes; i++) - if (!writenote(&ets->notes[i], cprm)) - return 0; - } - - return 1; -} - -static void free_note_info(struct elf_note_info *info) -{ - while (!list_empty(&info->thread_list)) { - struct list_head *tmp = info->thread_list.next; - list_del(tmp); - kfree(list_entry(tmp, struct elf_thread_status, list)); - } - - /* Free data possibly allocated by fill_files_note(): */ - if (info->notes_files) - kvfree(info->notes_files->data); - - kfree(info->prstatus); - kfree(info->psinfo); - kfree(info->notes); - kfree(info->fpu); -} - -#endif - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { @@ -2232,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm) /* Write notes phdr entry */ { - size_t sz = get_note_info_size(&info); + size_t sz = info.size; /* For cell spufs */ sz += elf_coredump_extra_notes_size(); @@ -2294,7 +2120,7 @@ static int elf_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!write_note_info(&info, cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d0c8797828..096e3520a0b1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -434,8 +434,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) current->mm->start_stack = current->mm->start_brk + stack_size; #endif - if (create_elf_fdpic_tables(bprm, current->mm, - &exec_params, &interp_params) < 0) + retval = create_elf_fdpic_tables(bprm, current->mm, &exec_params, + &interp_params); + if (retval < 0) goto error; kdebug("- start_code %lx", current->mm->start_code); @@ -1603,7 +1604,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!writenote(thread_list->notes, cprm)) goto end_coredump; if (!writenote(&psinfo_note, cprm)) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1eae7ea823a..bb202ad369d5 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -44,10 +44,10 @@ static LIST_HEAD(entries); static int enabled = 1; enum {Enabled, Magic}; -#define MISC_FMT_PRESERVE_ARGV0 (1 << 31) -#define MISC_FMT_OPEN_BINARY (1 << 30) -#define MISC_FMT_CREDENTIALS (1 << 29) -#define MISC_FMT_OPEN_FILE (1 << 28) +#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) +#define MISC_FMT_OPEN_BINARY (1UL << 30) +#define MISC_FMT_CREDENTIALS (1UL << 29) +#define MISC_FMT_OPEN_FILE (1UL << 28) typedef struct { struct list_head list; diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 99f9995670ea..555c962fdad6 100644 
--- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -23,15 +23,15 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o disk-io.o \ - transaction.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + transaction.o inode.o file.o defrag.o \ + extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/accessors.c index 12455b2b41de..206cf1612c1d 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/accessors.c @@ -4,8 +4,9 @@ */ #include <asm/unaligned.h> - +#include "messages.h" #include "ctree.h" +#include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) @@ -23,6 +24,13 @@ static bool check_setget_bounds(const struct extent_buffer *eb, return true; } +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +{ + token->eb = eb; + token->kaddr = page_address(eb->pages[0]); + token->offset = 0; +} + /* * Macro templates that define helpers to read/write extent buffer data of a * given size, that are also used via ctree.h for access to item members by @@ -160,7 +168,7 @@ DEFINE_BTRFS_SETGET_BITS(64) void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - unsigned long ptr = btrfs_node_key_ptr_offset(nr); + unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr); read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h new file mode 100644 index 000000000000..ceadfc5d6c66 --- /dev/null +++ b/fs/btrfs/accessors.h @@ -0,0 +1,1073 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACCESSORS_H +#define BTRFS_ACCESSORS_H + +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); + +/* + * Some macros to generate set/get functions for the struct fields. 
This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple one + * for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +#define read_eb_member(eb, ptr, type, member, result) (\ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) (\ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off); \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val); \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ +} \ +static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ +} + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ +{ \ + const type *p = page_address(eb->pages[0]) + \ + offset_in_page(eb->start); \ + return get_unaligned_le##bits(&p->member); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + put_unaligned_le##bits(val, &p->member); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const type *s) \ +{ \ + return get_unaligned_le##bits(&s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + put_unaligned_le##bits(val, &s->member); \ +} + +static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item 
*)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); 
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) +{ + unsigned long offset = (unsigned long)c; + + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); 
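The stripe helpers above (btrfs_stripe_nr() and the *_nr() accessors) treat the chunk pointer as an offset inside the extent buffer rather than a dereferenceable address. A hypothetical usage sketch, assuming "chunk" came from btrfs_item_ptr() on a chunk item (this helper is not part of the patch):

static void walk_chunk_stripes(const struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk)
{
	int nr_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	for (int i = 0; i < nr_stripes; i++) {
		u64 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		u64 physical = btrfs_stripe_offset_nr(leaf, chunk, i);

		/* e.g. feed devid/physical into logical->physical mapping */
		pr_debug("stripe %d: devid %llu physical %llu\n",
			 i, devid, physical);
	}
}
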
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64); + +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct 
btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + return 0; +} + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); + +static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + + ptr = btrfs_node_key_ptr_offset(eb, nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr) +{ + return (struct btrfs_item 
*)btrfs_item_nr_offset(eb, nr); +} + +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ +} + +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); + +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); +} + +static inline void btrfs_item_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* struct btrfs_root_ref */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); + +static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb, + const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item)); +} + +static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item)); +} + +static inline void btrfs_dir_item_key(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + 
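A non-obvious detail behind the dir item helpers above: the entry name (and, for xattrs, the value, whose length is data_len) is stored inline in the leaf immediately after the fixed-size struct btrfs_dir_item. A sketch of copying the name out (hypothetical helper, not part of this patch):

static int copy_dir_item_name(const struct extent_buffer *leaf,
			      const struct btrfs_dir_item *di,
			      char *buf, u32 buf_len)
{
	u32 name_len = btrfs_dir_name_len(leaf, di);

	if (name_len > buf_len)
		return -ENAMETOOLONG;
	/* The name bytes start right past the fixed-size header. */
	read_extent_buffer(leaf, buf, (unsigned long)(di + 1), name_len);
	return name_len;
}
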
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(const struct extent_buffer *eb, + const struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +#ifdef __LITTLE_ENDIAN + +/* + * Optimized helpers for little-endian architectures where CPU and on-disk + * structures have the same endianness and we can skip conversions. + */ + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, + const struct btrfs_disk_key *disk_key) +{ + memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, + const struct btrfs_key *cpu_key) +{ + memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_node_key(eb, disk_key, nr); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_item_key(eb, disk_key, nr); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *cpu_key) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_dir_item_key(eb, item, disk_key); +} + +#else + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + const struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + const struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +#endif + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); 
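The little-endian fast paths a few lines up convert keys with memcpy() or a plain cast, which is only sound because struct btrfs_key and struct btrfs_disk_key share size and field layout. A compile-time guard expressing that assumption might look like the following (a sketch; the real tree may assert this elsewhere):

static_assert(sizeof(struct btrfs_key) == sizeof(struct btrfs_disk_key),
	      "LE key conversions reinterpret one key type as the other");
static_assert(offsetof(struct btrfs_key, objectid) ==
	      offsetof(struct btrfs_disk_key, objectid));
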
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); + +static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags | flag); +} + +static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags & ~flag); +} + +static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) +{ + u64 flags = btrfs_header_flags(eb); + + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline int btrfs_is_leaf(const struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); + +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct 
btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = 
le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static inline void btrfs_cpu_balance_args_to_disk( + struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + +/* struct btrfs_super_block */ +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, + nr_global_roots, 64); + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, + type, 8); 
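Since the superblock accessors above are all *stack* variants, they apply to a plain struct btrfs_super_block copied out of the device (and presumably already checksum-verified), not to an extent buffer. A hypothetical sanity-check sketch using them:

static bool super_geometry_ok(const struct btrfs_super_block *sb)
{
	u32 sectorsize = btrfs_super_sectorsize(sb);
	u32 nodesize = btrfs_super_nodesize(sb);

	return is_power_of_2(sectorsize) && is_power_of_2(nodesize) &&
	       nodesize >= sectorsize &&
	       nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
}
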
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, + struct btrfs_file_extent_item, ram_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); + + +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); 
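For the file extent accessors above, note that a regular (non-inline) extent maps file data to disk at disk_bytenr plus the item's offset field, i.e. the item may reference a window inside a larger on-disk extent. A hypothetical helper sketch (holes, where disk_bytenr is 0, are assumed handled by the caller):

static u64 file_extent_data_start(const struct extent_buffer *leaf,
				  const struct btrfs_file_extent_item *fi)
{
	return btrfs_file_extent_disk_bytenr(leaf, fi) +
	       btrfs_file_extent_offset(leaf, fi);
}
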
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags, + struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer, + struct btrfs_qgroup_limit_item, max_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl, + struct btrfs_qgroup_limit_item, max_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer, + struct btrfs_qgroup_limit_item, rsv_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl, + struct btrfs_qgroup_limit_item, rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +/* Cast into the data area of the leaf. 
*/ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) + +#endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 548d6a5477b4..3da1779e8b79 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -11,10 +11,10 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> - #include "ctree.h" #include "btrfs_inode.h" #include "xattr.h" +#include "acl.h" struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { @@ -110,10 +110,11 @@ out: return ret; } -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int ret; + struct inode *inode = d_inode(dentry); umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h new file mode 100644 index 000000000000..39bd36e6eeb7 --- /dev/null +++ b/fs/btrfs/acl.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACL_H +#define BTRFS_ACL_H + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); + +#else + +#define btrfs_get_acl NULL +#define btrfs_set_acl NULL +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) +{ + return -EOPNOTSUPP; +} + +#endif + +#endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index d385357e19b6..21c92c74bf71 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -15,49 +15,76 @@ #include "locking.h" #include "misc.h" #include "tree-mod-log.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "relocation.h" +#include "tree-checker.h" -/* Just an arbitrary number so we can be sure this happened */ -#define BACKREF_FOUND_SHARED 6 +/* Just arbitrary numbers so we can be sure one of these happened. 
*/ +#define BACKREF_FOUND_SHARED 6 +#define BACKREF_FOUND_NOT_SHARED 7 struct extent_inode_elem { u64 inum; u64 offset; + u64 num_bytes; struct extent_inode_elem *next; }; -static int check_extent_in_eb(const struct btrfs_key *key, +static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, - u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) + struct extent_inode_elem **eie) { - u64 offset = 0; + const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); + u64 offset = key->offset; struct extent_inode_elem *e; + const u64 *root_ids; + int root_count; + bool cached; - if (!ignore_offset && - !btrfs_file_extent_compression(eb, fi) && + if (!btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; - u64 data_len; data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) + if (ctx->extent_item_pos < data_offset || + ctx->extent_item_pos >= data_offset + data_len) return 1; - offset = extent_item_pos - data_offset; + offset += ctx->extent_item_pos - data_offset; + } + + if (!ctx->indirect_ref_iterator || !ctx->cache_lookup) + goto add_inode_elem; + + cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids, + &root_count); + if (!cached) + goto add_inode_elem; + + for (int i = 0; i < root_count; i++) { + int ret; + + ret = ctx->indirect_ref_iterator(key->objectid, offset, + data_len, root_ids[i], + ctx->user_ctx); + if (ret) + return ret; } +add_inode_elem: e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) return -ENOMEM; e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + offset; + e->offset = offset; + e->num_bytes = data_len; *eie = e; return 0; @@ -73,10 +100,9 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) } } -static int find_extent_in_eb(const struct extent_buffer *eb, - u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) +static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct extent_buffer *eb, + struct extent_inode_elem **eie) { u64 disk_byte; struct btrfs_key key; @@ -102,11 +128,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb, continue; /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != wanted_disk_byte) + if (disk_byte != ctx->bytenr) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); - if (ret < 0) + ret = check_extent_in_eb(ctx, &key, eb, fi, eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) return ret; } @@ -135,9 +161,30 @@ struct preftrees { * - decremented when a ref->count transitions to <1 */ struct share_check { - u64 root_objectid; + struct btrfs_backref_share_check_ctx *ctx; + struct btrfs_root *root; u64 inum; + u64 data_bytenr; + u64 data_extent_gen; + /* + * Counts number of inodes that refer to an extent (different inodes in + * the same root or different roots) that we could find. The sharedness + * check typically stops once this counter gets greater than 1, so it + * may not reflect the total number of inodes. + */ int share_count; + /* + * The number of times we found our inode refers to the data extent we + * are determining the sharedness. 
In other words, how many file extent + * items we could find for our inode that point to our target data + * extent. The value we get here after finishing the extent sharedness + * check may be smaller than reality, but if it ends up being greater + * than 1, then we know for sure the inode has multiple file extent + * items that point to our inode, and we can safely assume it's useful + * to cache the sharedness check result. + */ + int self_ref_count; + bool have_delayed_delete_refs; }; static inline int extent_is_shared(struct share_check *sc) @@ -206,7 +253,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount) + int newcount, struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; @@ -215,6 +262,11 @@ static void update_share_count(struct share_check *sc, int oldcount, sc->share_count--; else if (oldcount < 1 && newcount > 0) sc->share_count++; + + if (newref->root_id == sc->root->root_key.objectid && + newref->wanted_disk_byte == sc->data_bytenr && + newref->key_for_search.objectid == sc->inum) + sc->self_ref_count += newref->count; } /* @@ -265,14 +317,14 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, * BTRFS_[ADD|DROP]_DELAYED_REF actions. */ update_share_count(sc, ref->count, - ref->count + newref->count); + ref->count + newref->count, newref); ref->count += newref->count; free_pref(newref); return; } } - update_share_count(sc, 0, newref->count); + update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); @@ -288,8 +340,10 @@ static void prelim_release(struct preftree *preftree) struct prelim_ref *ref, *next_ref; rbtree_postorder_for_each_entry_safe(ref, next_ref, - &preftree->root.rb_root, rbnode) + &preftree->root.rb_root, rbnode) { + free_inode_elem_list(ref->inode_list); free_pref(ref); + } preftree->root = RB_ROOT_CACHED; preftree->count = 0; @@ -413,11 +467,11 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) return 0; } -static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, +static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, - int level, u64 time_seq, const u64 *extent_item_pos, - bool ignore_offset) + int level) { int ret = 0; int slot; @@ -453,10 +507,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); } while (!ret && count < ref->count) { @@ -477,10 +531,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); @@ 
-494,11 +548,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, count++; else goto next; - if (extent_item_pos) { - ret = check_extent_in_eb(&key, eb, fi, - *extent_item_pos, - &eie, ignore_offset); - if (ret < 0) + if (!ctx->ignore_extent_item_pos) { + ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) break; } if (ret > 0) @@ -507,7 +560,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && extent_item_pos) { + if (!ret && !ctx->ignore_extent_item_pos) { while (old->next) old = old->next; old->next = eie; @@ -515,16 +568,17 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else - ret = btrfs_next_old_item(root, path, time_seq); + ret = btrfs_next_old_item(root, path, ctx->time_seq); } - if (ret > 0) - ret = 0; - else if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); + else if (ret > 0) + ret = 0; + return ret; } @@ -532,11 +586,10 @@ next: * resolve an indirect backref in the form (root_id, key, level) * to a logical address */ -static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, bool ignore_offset) + struct prelim_ref *ref, struct ulist *parents) { struct btrfs_root *root; struct extent_buffer *eb; @@ -554,9 +607,9 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, * here. 
*/ if (path->search_commit_root) - root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id); + root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id); else - root = btrfs_get_fs_root(fs_info, ref->root_id, false); + root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; @@ -568,17 +621,17 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_is_testing(fs_info)) { + if (btrfs_is_testing(ctx->fs_info)) { ret = -ENOENT; goto out; } if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == BTRFS_SEQ_LAST) + else if (ctx->time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else - root_level = btrfs_old_root_level(root, time_seq); + root_level = btrfs_old_root_level(root, ctx->time_seq); if (root_level + 1 == level) goto out; @@ -606,12 +659,12 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &search_key, path, time_seq); + ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); - btrfs_debug(fs_info, + btrfs_debug(ctx->fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", ref->root_id, level, ref->count, ret, ref->key_for_search.objectid, ref->key_for_search.type, @@ -629,8 +682,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos, ignore_offset); + ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level); out: btrfs_put_root(root); out_free: @@ -647,6 +699,18 @@ unode_aux_to_inode_list(struct ulist_node *node) return (struct extent_inode_elem *)(uintptr_t)node->aux; } +static void free_leaf_list(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(ulist, &uiter))) + free_inode_elem_list(unode_aux_to_inode_list(node)); + + ulist_free(ulist); +} + /* * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not @@ -663,11 +727,10 @@ unode_aux_to_inode_list(struct ulist_node *node) * rbtree as they are encountered. The new backrefs are subsequently * resolved as above. */ -static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - const u64 *extent_item_pos, - struct share_check *sc, bool ignore_offset) + struct share_check *sc) { int err; int ret = 0; @@ -704,21 +767,18 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; } - if (sc && sc->root_objectid && - ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root->root_key.objectid) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, - ref, parents, extent_item_pos, - ignore_offset); + err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. 
*/ if (err == -ENOENT) { - prelim_ref_insert(fs_info, &preftrees->direct, ref, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; } else if (err) { @@ -747,7 +807,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; new_ref->inode_list = unode_aux_to_inode_list(node); - prelim_ref_insert(fs_info, &preftrees->direct, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, new_ref, NULL); } @@ -755,13 +815,17 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. */ - prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); ulist_reinit(parents); cond_resched(); } out: - ulist_free(parents); + /* + * We may have inode lists attached to refs in the parents ulist, so we + * must free them before freeing the ulist and its refs. + */ + free_leaf_list(parents); return ret; } @@ -777,6 +841,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct rb_node *node; while ((node = rb_first_cached(&tree->root))) { + struct btrfs_tree_parent_check check = { 0 }; + ref = rb_entry(node, struct prelim_ref, rbnode); rb_erase_cached(node, &tree->root); @@ -784,8 +850,10 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, - ref->root_id, 0, ref->level - 1, NULL); + check.level = ref->level - 1; + check.owner_root = ref->root_id; + + eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); @@ -820,16 +888,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; - struct btrfs_key tmp_op_key; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - spin_lock(&head->lock); for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, @@ -855,10 +918,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; + struct btrfs_key *key_ptr = NULL; + + if (head->extent_op && head->extent_op->update_key) { + btrfs_disk_key_to_cpu(&key, &head->extent_op->key); + key_ptr = &key; + } ref = btrfs_delayed_node_to_tree_ref(node); ret = add_indirect_ref(fs_info, preftrees, ref->root, - &tmp_op_key, ref->level + 1, + key_ptr, ref->level + 1, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -884,13 +953,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, key.offset = ref->offset; /* - * Found a inum that doesn't match our known inum, we - * know it's shared. + * If we have a share check context and a reference for + * another inode, we can't exit immediately. This is + * because even if this is a BTRFS_ADD_DELAYED_REF + * reference we may find next a BTRFS_DROP_DELAYED_REF + * which cancels out this ADD reference. 
+ * + * If this is a DROP reference and there was no previous + * ADD reference, then we need to signal that when we + * process references from the extent tree (through + * add_inline_refs() and add_keyed_refs()), we should + * not exit early if we find a reference for another + * inode, because one of the delayed DROP references + * may cancel that reference in the extent tree. */ - if (sc && sc->inum && ref->objectid != sc->inum) { - ret = BACKREF_FOUND_SHARED; - goto out; - } + if (sc && count < 0) + sc->have_delayed_delete_refs = true; ret = add_indirect_ref(fs_info, preftrees, ref->root, &key, 0, node->bytenr, count, sc, @@ -920,7 +998,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } if (!ret) ret = extent_is_shared(sc); -out: + spin_unlock(&head->lock); return ret; } @@ -930,8 +1008,8 @@ out: * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_inline_refs(const struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, +static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, int *info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -956,6 +1034,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + + if (ctx->check_extent_item) { + ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx); + if (ret) + return ret; + } + flags = btrfs_extent_flags(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -991,9 +1076,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = add_direct_ref(fs_info, preftrees, + ret = add_direct_ref(ctx->fs_info, preftrees, *info_level + 1, offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -1002,14 +1087,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(fs_info, preftrees, 0, offset, - bytenr, count, sc, GFP_NOFS); + ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset, + ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = add_indirect_ref(fs_info, preftrees, offset, + ret = add_indirect_ref(ctx->fs_info, preftrees, offset, NULL, *info_level + 1, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -1023,16 +1108,20 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, - sc, GFP_NOFS); + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(ctx->fs_info, preftrees, + root, &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1051,8 +1140,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. 
*/ -static int add_keyed_refs(struct btrfs_root *extent_root, - struct btrfs_path *path, u64 bytenr, +static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *extent_root, + struct btrfs_path *path, int info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -1075,7 +1165,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != bytenr) + if (key.objectid != ctx->bytenr) break; if (key.type < BTRFS_TREE_BLOCK_REF_KEY) continue; @@ -1087,7 +1177,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, /* SHARED DIRECT METADATA backref */ ret = add_direct_ref(fs_info, preftrees, info_level + 1, key.offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ @@ -1098,14 +1188,14 @@ static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); ret = add_direct_ref(fs_info, preftrees, 0, - key.offset, bytenr, count, + key.offset, ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: /* NORMAL INDIRECT METADATA backref */ ret = add_indirect_ref(fs_info, preftrees, key.offset, - NULL, info_level + 1, bytenr, + NULL, info_level + 1, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { @@ -1122,15 +1212,20 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, - sc, GFP_NOFS); + + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1145,34 +1240,141 @@ static int add_keyed_refs(struct btrfs_root *extent_root, } /* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + + if (!ctx->use_path_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &ctx->path_cache_entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. 
+ */
+	if (!entry->is_shared &&
+	    entry->gen != btrfs_root_last_snapshot(&root->root_item))
+		return false;
+
+	/*
+	 * If we cached a true result and the last generation used for dropping
+	 * a root changed, we can not trust the result, because the dropped root
+	 * could be a snapshot sharing this extent buffer.
+	 */
+	if (entry->is_shared &&
+	    entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+		return false;
+
+	*is_shared = entry->is_shared;
+	/*
+	 * If the node at this level is shared, then all nodes below are also
+	 * shared. Currently some of the nodes below may be marked as not shared
+	 * because we have just switched from one leaf to another, and also
+	 * switched other nodes above the leaf and below the current level, so
+	 * mark them as shared.
+	 */
+	if (*is_shared) {
+		for (int i = 0; i < level; i++) {
+			ctx->path_cache_entries[i].is_shared = true;
+			ctx->path_cache_entries[i].gen = entry->gen;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
+				       struct btrfs_root *root,
+				       u64 bytenr, int level, bool is_shared)
+{
+	struct btrfs_backref_shared_cache_entry *entry;
+	u64 gen;
+
+	if (!ctx->use_path_cache)
+		return;
+
+	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+		return;
+
+	/*
+	 * Level -1 is used for the data extent, which is not reliable to cache
+	 * because its reference count can increase or decrease without us
+	 * realizing. We cache results only for extent buffers that lead from
+	 * the root node down to the leaf with the file extent item.
+	 */
+	ASSERT(level >= 0);
+
+	if (is_shared)
+		gen = btrfs_get_last_root_drop_gen(root->fs_info);
+	else
+		gen = btrfs_root_last_snapshot(&root->root_item);
+
+	entry = &ctx->path_cache_entries[level];
+	entry->bytenr = bytenr;
+	entry->is_shared = is_shared;
+	entry->gen = gen;
+
+	/*
+	 * If we found an extent buffer is shared, set the cache result for all
+	 * extent buffers below it to true. As nodes in the path are COWed,
+	 * their sharedness is moved to their children, and if a leaf is COWed,
+	 * then the sharedness of a data extent becomes direct, i.e. the
+	 * refcount of the data extent is increased in the extent item at the
+	 * extent tree.
+	 */
+	if (is_shared) {
+		for (int i = 0; i < level; i++) {
+			entry = &ctx->path_cache_entries[i];
+			entry->is_shared = is_shared;
+			entry->gen = gen;
+		}
+	}
+}
+
+/*
 * this adds all existing backrefs (inline backrefs, backrefs and delayed
 * refs) for the given bytenr to the refs list, merges duplicates and resolves
 * indirect refs to their parent bytenr.
 * When roots are found, they're added to the roots list
 *
- * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
- * behave much like trans == NULL case, the difference only lies in it will not
- * commit root.
- * The special case is for qgroup to search roots in commit_transaction().
- *
- * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a
- * shared extent is detected.
+ * @ctx: Backref walking context object, must not be NULL.
+ * @sc:  If !NULL, then immediately return BACKREF_FOUND_SHARED when a
+ *       shared extent is detected.
 *
 * Otherwise this returns 0 for success and <0 for an error.
 *
- * If ignore_offset is set to false, only extent refs whose offsets match
- * extent_item_pos are returned.
If true, every extent ref is returned - * and extent_item_pos is ignored. - * * FIXME some caching might speed things up */ -static int find_parent_nodes(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist *refs, - struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc, bool ignore_offset) +static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, + struct share_check *sc) { - struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); + struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1188,9 +1390,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, .indirect_missing_keys = PREFTREE_INIT }; - key.objectid = bytenr; + /* Roots ulist is not needed when using a sharedness check context. */ + if (sc) + ASSERT(ctx->roots == NULL); + + key.objectid = ctx->bytenr; key.offset = (u64)-1; - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1198,12 +1404,12 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) { + if (!ctx->trans) { path->search_commit_root = 1; path->skip_locking = 1; } - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; again: @@ -1219,17 +1425,17 @@ again: goto out; } - if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != BTRFS_SEQ_LAST) { + if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) && + ctx->time_seq != BTRFS_SEQ_LAST) { /* * We have a specific time_seq we care about and trans which * means we have the path lock, we need to grab the ref head and * lock it so we have a consistent view of the refs at the given * time. */ - delayed_refs = &trans->transaction->delayed_refs; + delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -1247,7 +1453,7 @@ again: goto again; } spin_unlock(&delayed_refs->lock); - ret = add_delayed_refs(fs_info, head, time_seq, + ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq, &preftrees, sc); mutex_unlock(&head->mutex); if (ret) @@ -1265,30 +1471,96 @@ again: leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid == bytenr && + if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(fs_info, path, bytenr, - &info_level, &preftrees, sc); + ret = add_inline_refs(ctx, path, &info_level, + &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(root, path, bytenr, info_level, + ret = add_keyed_refs(ctx, root, path, info_level, &preftrees, sc); if (ret) goto out; } } + /* + * If we have a share context and we reached here, it means the extent + * is not directly shared (no multiple reference items for it), + * otherwise we would have exited earlier with a return value of + * BACKREF_FOUND_SHARED after processing delayed references or while + * processing inline or keyed references from the extent tree. 
+ * The extent may however be indirectly shared through shared subtrees + * as a result from creating snapshots, so we determine below what is + * its parent node, in case we are dealing with a metadata extent, or + * what's the leaf (or leaves), from a fs tree, that has a file extent + * item pointing to it in case we are dealing with a data extent. + */ + ASSERT(extent_is_shared(sc) == 0); + + /* + * If we are here for a data extent and we have a share_check structure + * it means the data extent is not directly shared (does not have + * multiple reference items), so we have to check if a path in the fs + * tree (going from the root node down to the leaf that has the file + * extent item pointing to the data extent) is shared, that is, if any + * of the extent buffers in the path is referenced by other trees. + */ + if (sc && ctx->bytenr == sc->data_bytenr) { + /* + * If our data extent is from a generation more recent than the + * last generation used to snapshot the root, then we know that + * it can not be shared through subtrees, so we can skip + * resolving indirect references, there's no point in + * determining the extent buffers for the path from the fs tree + * root node down to the leaf that has the file extent item that + * points to the data extent. + */ + if (sc->data_extent_gen > + btrfs_root_last_snapshot(&sc->root->root_item)) { + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + + /* + * If we are only determining if a data extent is shared or not + * and the corresponding file extent item is located in the same + * leaf as the previous file extent item, we can skip resolving + * indirect references for a data extent, since the fs tree path + * is the same (same leaf, so same path). We skip as long as the + * cached result for the leaf is valid and only if there's only + * one file extent item pointing to the data extent, because in + * the case of multiple file extent items, they may be located + * in different leaves and therefore we have multiple paths. + */ + if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr && + sc->self_ref_count == 1) { + bool cached; + bool is_shared; + + cached = lookup_backref_shared_cache(sc->ctx, sc->root, + sc->ctx->curr_leaf_bytenr, + 0, &is_shared); + if (cached) { + if (is_shared) + ret = BACKREF_FOUND_SHARED; + else + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + } + } + btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); + ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); - ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, sc, ignore_offset); + ret = resolve_indirect_refs(ctx, path, &preftrees, sc); if (ret) goto out; @@ -1315,25 +1587,22 @@ again: * e.g. different offsets would not be merged, * and would retain their original ref->count < 0. 
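+		 * For example, an ADD delayed ref contributes a count of +1
+		 * and a DROP delayed ref a count of -1, so a pair that fully
+		 * cancels out merges to a ref count of 0 and is skipped by
+		 * both branches below (an illustrative aside, not part of
+		 * the original comment).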
 */
-		if (roots && ref->count && ref->root_id && ref->parent == 0) {
-			if (sc && sc->root_objectid &&
-			    ref->root_id != sc->root_objectid) {
-				ret = BACKREF_FOUND_SHARED;
-				goto out;
-			}
-
+		if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
-			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+			ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
 				goto out;
 		}
 		if (ref->count && ref->parent) {
-			if (extent_item_pos && !ref->inode_list &&
+			if (!ctx->ignore_extent_item_pos && !ref->inode_list &&
 			    ref->level == 0) {
+				struct btrfs_tree_parent_check check = { 0 };
 				struct extent_buffer *eb;
 
-				eb = read_tree_block(fs_info, ref->parent, 0,
-						     0, ref->level, NULL);
+				check.level = ref->level;
+
+				eb = read_tree_block(ctx->fs_info, ref->parent,
+						     &check);
 				if (IS_ERR(eb)) {
 					ret = PTR_ERR(eb);
 					goto out;
@@ -1346,21 +1615,27 @@ again:
 				if (!path->skip_locking)
 					btrfs_tree_read_lock(eb);
-				ret = find_extent_in_eb(eb, bytenr,
-							*extent_item_pos, &eie, ignore_offset);
+				ret = find_extent_in_eb(ctx, eb, &eie);
 				if (!path->skip_locking)
 					btrfs_tree_read_unlock(eb);
 				free_extent_buffer(eb);
-				if (ret < 0)
+				if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
+				    ret < 0)
 					goto out;
 				ref->inode_list = eie;
+				/*
+				 * We transferred the list ownership to the ref,
+				 * so set to NULL to avoid a double free in case
+				 * an error happens after this.
+				 */
+				eie = NULL;
 			}
-			ret = ulist_add_merge_ptr(refs, ref->parent,
+			ret = ulist_add_merge_ptr(ctx->refs, ref->parent,
 						  ref->inode_list,
 						  (void **)&eie, GFP_NOFS);
 			if (ret < 0)
 				goto out;
-			if (!ret && extent_item_pos) {
+			if (!ret && !ctx->ignore_extent_item_pos) {
 				/*
 				 * We've recorded that parent, so we must extend
 				 * its inode list here.
@@ -1379,6 +1654,14 @@ again:
 					eie->next = ref->inode_list;
 				}
 				eie = NULL;
+				/*
+				 * We have transferred the inode list ownership from
+				 * this ref to the ref we added to the 'refs' ulist.
+				 * So set this ref's inode list to NULL to avoid
+				 * use-after-free when our caller uses it or double
+				 * frees in case an error happens before we return.
+				 */
+				ref->inode_list = NULL;
 			}
 			cond_resched();
 		}
@@ -1390,52 +1673,36 @@ out:
 	prelim_release(&preftrees.indirect);
 	prelim_release(&preftrees.indirect_missing_keys);
 
-	if (ret < 0)
+	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
 		free_inode_elem_list(eie);
 	return ret;
 }
 
-static void free_leaf_list(struct ulist *blocks)
-{
-	struct ulist_node *node = NULL;
-	struct extent_inode_elem *eie;
-	struct ulist_iterator uiter;
-
-	ULIST_ITER_INIT(&uiter);
-	while ((node = ulist_next(blocks, &uiter))) {
-		if (!node->aux)
-			continue;
-		eie = unode_aux_to_inode_list(node);
-		free_inode_elem_list(eie);
-		node->aux = 0;
-	}
-
-	ulist_free(blocks);
-}
-
 /*
- * Finds all leafs with a reference to the specified combination of bytenr and
- * offset. key_list_head will point to a list of corresponding keys (caller must
- * free each list element). The leafs will be stored in the leafs ulist, which
- * must be freed with ulist_free.
+ * Finds all leaves with a reference to the specified combination of
+ * @ctx->bytenr and @ctx->extent_item_pos. The bytenrs of the found leaves are
+ * added to the ulist at @ctx->refs, and that ulist is allocated by this
+ * function. The caller should free the ulist with free_leaf_list() if
+ * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is
+ * enough.
 *
- * returns 0 on success, <0 on error
+ * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
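+ *
+ * A minimal usage sketch (illustrative only, not part of this change; the
+ * caller-side variable names are hypothetical):
+ *
+ *	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
+ *
+ *	walk_ctx.fs_info = fs_info;
+ *	walk_ctx.bytenr = extent_bytenr;
+ *	walk_ctx.ignore_extent_item_pos = true;
+ *	ret = btrfs_find_all_leafs(&walk_ctx);
+ *	if (ret == 0) {
+ *		... iterate walk_ctx.refs with ulist_next(), each node->val
+ *		    being the bytenr of a leaf ...
+ *		ulist_free(walk_ctx.refs);
+ *	}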
*/ -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset) +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) { int ret; - *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL, ignore_offset); - if (ret < 0 && ret != -ENOENT) { - free_leaf_list(*leafs); + ret = find_parent_nodes(ctx, NULL); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + (ret < 0 && ret != -ENOENT)) { + free_leaf_list(ctx->refs); + ctx->refs = NULL; return ret; } @@ -1443,7 +1710,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, } /* - * walk all backrefs for a given extent to find all roots that reference this + * Walk all backrefs for a given extent to find all roots that reference this * extent. Walking a backref means finding all extents that reference this * extent and in turn walk the backrefs of those, too. Naturally this is a * recursive process, but here it is implemented in an iterative fashion: We @@ -1451,76 +1718,115 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * list. In turn, we find all referencing extents for those, further appending * to the list. The way we iterate the list allows adding more elements after * the current while iterating. The process stops when we reach the end of the - * list. Found roots are added to the roots list. + * list. * - * returns 0 on success, < 0 on error. + * Found roots are added to @ctx->roots, which is allocated by this function if + * it points to NULL, in which case the caller is responsible for freeing it + * after it's not needed anymore. + * This function requires @ctx->refs to be NULL, as it uses it for allocating a + * ulist to do temporary work, and frees it before returning. + * + * Returns 0 on success, < 0 on error. 
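+ *
+ * A hedged sketch of a typical call through the btrfs_find_all_roots()
+ * wrapper below (illustrative only; extent_bytenr is hypothetical):
+ *
+ *	ctx->bytenr = extent_bytenr;
+ *	ret = btrfs_find_all_roots(ctx, false);
+ *	if (ret == 0) {
+ *		... each node->val returned by ulist_next(ctx->roots, &uiter)
+ *		    is the ID of a root that references the extent ...
+ *		ulist_free(ctx->roots);
+ *		ctx->roots = NULL;
+ *	}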
*/ -static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, - bool ignore_offset) +static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { - struct ulist *tmp; - struct ulist_node *node = NULL; + const u64 orig_bytenr = ctx->bytenr; + const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; + bool roots_ulist_allocated = false; struct ulist_iterator uiter; - int ret; + int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - *roots = ulist_alloc(GFP_NOFS); - if (!*roots) { - ulist_free(tmp); + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; + return -ENOMEM; + } + roots_ulist_allocated = true; } + ctx->ignore_extent_item_pos = true; + ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL, ignore_offset); + struct ulist_node *node; + + ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - ulist_free(tmp); - ulist_free(*roots); - *roots = NULL; - return ret; + if (roots_ulist_allocated) { + ulist_free(ctx->roots); + ctx->roots = NULL; + } + break; } - node = ulist_next(tmp, &uiter); + ret = 0; + node = ulist_next(ctx->refs, &uiter); if (!node) break; - bytenr = node->val; + ctx->bytenr = node->val; cond_resched(); } - ulist_free(tmp); - return 0; + ulist_free(ctx->refs); + ctx->refs = NULL; + ctx->bytenr = orig_bytenr; + ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos; + + return ret; } -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem) { int ret; - if (!trans && !skip_commit_root_sem) - down_read(&fs_info->commit_root_sem); - ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots, false); - if (!trans && !skip_commit_root_sem) - up_read(&fs_info->commit_root_sem); + if (!ctx->trans && !skip_commit_root_sem) + down_read(&ctx->fs_info->commit_root_sem); + ret = btrfs_find_all_roots_safe(ctx); + if (!ctx->trans && !skip_commit_root_sem) + up_read(&ctx->fs_info->commit_root_sem); return ret; } -/** - * Check if an extent is shared or not +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) +{ + struct btrfs_backref_share_check_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ulist_init(&ctx->refs); + + return ctx; +} + +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx) +{ + if (!ctx) + return; + + ulist_release(&ctx->refs); + kfree(ctx); +} + +/* + * Check if a data extent is shared or not. * - * @root: root inode belongs to - * @inum: inode number of the inode whose extent we are checking - * @bytenr: logical bytenr of the extent we are checking - * @roots: list of roots this extent is shared among - * @tmp: temporary list used for iteration + * @inode: The inode whose extent we are checking. + * @bytenr: Logical bytenr of the extent we are checking. + * @extent_gen: Generation of the extent (file extent item) or 0 if it is + * not known. + * @ctx: A backref sharedness check context. 
 *
- * btrfs_check_shared uses the backref walking code but will short
+ * btrfs_is_data_extent_shared uses the backref walking code but will short
 * circuit as soon as it finds a root or inode that doesn't match the
 * one passed in. This provides a significant performance benefit for
 * callers (such as fiemap) which want to know whether the extent is
@@ -1531,9 +1837,12 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 *
 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
 */
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
-		       struct ulist *roots, struct ulist *tmp)
+int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
+				u64 extent_gen,
+				struct btrfs_backref_share_check_ctx *ctx)
 {
+	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
+	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_trans_handle *trans;
 	struct ulist_iterator uiter;
@@ -1541,13 +1850,23 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
 	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 	int ret = 0;
 	struct share_check shared = {
-		.root_objectid = root->root_key.objectid,
-		.inum = inum,
+		.ctx = ctx,
+		.root = root,
+		.inum = btrfs_ino(inode),
+		.data_bytenr = bytenr,
+		.data_extent_gen = extent_gen,
 		.share_count = 0,
+		.self_ref_count = 0,
+		.have_delayed_delete_refs = false,
 	};
+	int level;
 
-	ulist_init(roots);
-	ulist_init(tmp);
+	for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) {
+		if (ctx->prev_extents_cache[i].bytenr == bytenr)
+			return ctx->prev_extents_cache[i].is_shared;
+	}
+
+	ulist_init(&ctx->refs);
 
 	trans = btrfs_join_transaction_nostart(root);
 	if (IS_ERR(trans)) {
@@ -1559,28 +1878,88 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
 		down_read(&fs_info->commit_root_sem);
 	} else {
 		btrfs_get_tree_mod_seq(fs_info, &elem);
+		walk_ctx.time_seq = elem.seq;
 	}
 
+	walk_ctx.ignore_extent_item_pos = true;
+	walk_ctx.trans = trans;
+	walk_ctx.fs_info = fs_info;
+	walk_ctx.refs = &ctx->refs;
+
+	/* -1 means we are in the bytenr of the data extent. */
+	level = -1;
 	ULIST_ITER_INIT(&uiter);
+	ctx->use_path_cache = true;
 	while (1) {
-		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
-					roots, NULL, &shared, false);
-		if (ret == BACKREF_FOUND_SHARED) {
-			/* this is the only condition under which we return 1 */
-			ret = 1;
+		bool is_shared;
+		bool cached;
+
+		walk_ctx.bytenr = bytenr;
+		ret = find_parent_nodes(&walk_ctx, &shared);
+		if (ret == BACKREF_FOUND_SHARED ||
+		    ret == BACKREF_FOUND_NOT_SHARED) {
+			/* If shared must return 1, otherwise return 0. */
+			ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0;
+			if (level >= 0)
+				store_backref_shared_cache(ctx, root, bytenr,
+							   level, ret == 1);
 			break;
 		}
 		if (ret < 0 && ret != -ENOENT)
 			break;
 		ret = 0;
+
+		/*
+		 * If our data extent was not directly shared (without multiple
+		 * reference items), then it might have a single reference item
+		 * with a count > 1 for the same offset, which means there are 2
+		 * (or more) file extent items that point to the data extent -
+		 * this happens when a file extent item needs to be split and
+		 * then one item gets moved to another leaf due to a b+tree leaf
+		 * split when inserting some item. In this case the file extent
+		 * items may be located in different leaves and therefore some
+		 * of the leaves may be referenced through shared subtrees while
+		 * others are not.
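+		 * For example, after snapshotting a root, COWing one leaf
+		 * makes it no longer shared, while another leaf holding a
+		 * second file extent item for the same data extent may still
+		 * be shared with the snapshot.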
Since our extent buffer cache only works for + * a single path (by far the most common case and simpler to + * deal with), we can not use it if we have multiple leaves + * (which implies multiple paths). + */ + if (level == -1 && ctx->refs.nnodes > 1) + ctx->use_path_cache = false; + + if (level >= 0) + store_backref_shared_cache(ctx, root, bytenr, + level, false); + node = ulist_next(&ctx->refs, &uiter); if (!node) break; bytenr = node->val; + level++; + cached = lookup_backref_shared_cache(ctx, root, bytenr, level, + &is_shared); + if (cached) { + ret = (is_shared ? 1 : 0); + break; + } shared.share_count = 0; + shared.have_delayed_delete_refs = false; cond_resched(); } + /* + * Cache the sharedness result for the data extent if we know our inode + * has more than 1 file extent item that refers to the data extent. + */ + if (ret >= 0 && shared.self_ref_count > 1) { + int slot = ctx->prev_extents_cache_slot; + + ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr; + ctx->prev_extents_cache[slot].is_shared = (ret == 1); + + slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; + ctx->prev_extents_cache_slot = slot; + } + if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); @@ -1588,8 +1967,9 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, up_read(&fs_info->commit_root_sem); } out: - ulist_release(roots); - ulist_release(tmp); + ulist_release(&ctx->refs); + ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr; + return ret; } @@ -1936,7 +2316,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu", extent_item_objectid, eie->inum, eie->offset, root); - ret = iterate(eie->inum, eie->offset, root, ctx); + ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx); if (ret) { btrfs_debug(fs_info, "stopping iteration for %llu due to ret=%d", @@ -1953,82 +2333,128 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, * the given parameters. * when the iterator function returns a non-zero value, iteration stops. 
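+ *
+ * A minimal callback sketch (illustrative; the callback and its context are
+ * hypothetical, only the signature comes from iterate_extent_inodes_t):
+ *
+ *	static int count_refs(u64 inum, u64 offset, u64 num_bytes,
+ *			      u64 root, void *ctx)
+ *	{
+ *		unsigned int *count = ctx;
+ *
+ *		(*count)++;
+ *		return 0;
+ *	}
+ *
+ * Returning BTRFS_ITERATE_EXTENT_INODES_STOP instead ends the iteration
+ * early without signalling an error.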
*/ -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, u64 extent_item_pos, - int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx) { int ret; - struct btrfs_trans_handle *trans = NULL; - struct ulist *refs = NULL; - struct ulist *roots = NULL; - struct ulist_node *ref_node = NULL; - struct ulist_node *root_node = NULL; + struct ulist *refs; + struct ulist_node *ref_node; struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; - struct ulist_iterator root_uiter; - btrfs_debug(fs_info, "resolving all inodes for extent %llu", - extent_item_objectid); + btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu", + ctx->bytenr); + + ASSERT(ctx->trans == NULL); + ASSERT(ctx->roots == NULL); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->tree_root); + struct btrfs_trans_handle *trans; + + trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); trans = NULL; } + ctx->trans = trans; } - if (trans) - btrfs_get_tree_mod_seq(fs_info, &seq_elem); - else - down_read(&fs_info->commit_root_sem); + if (ctx->trans) { + btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem); + ctx->time_seq = seq_elem.seq; + } else { + down_read(&ctx->fs_info->commit_root_sem); + } - ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, - &extent_item_pos, ignore_offset); + ret = btrfs_find_all_leafs(ctx); if (ret) goto out; + refs = ctx->refs; + ctx->refs = NULL; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - seq_elem.seq, &roots, - ignore_offset); + const u64 leaf_bytenr = ref_node->val; + struct ulist_node *root_node; + struct ulist_iterator root_uiter; + struct extent_inode_elem *inode_list; + + inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux; + + if (ctx->cache_lookup) { + const u64 *root_ids; + int root_count; + bool cached; + + cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx, + &root_ids, &root_count); + if (cached) { + for (int i = 0; i < root_count; i++) { + ret = iterate_leaf_refs(ctx->fs_info, + inode_list, + root_ids[i], + leaf_bytenr, + iterate, + user_ctx); + if (ret) + break; + } + continue; + } + } + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ret = -ENOMEM; + break; + } + } + + ctx->bytenr = leaf_bytenr; + ret = btrfs_find_all_roots_safe(ctx); if (ret) break; + + if (ctx->cache_store) + ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx); + ULIST_ITER_INIT(&root_uiter); - while (!ret && (root_node = ulist_next(roots, &root_uiter))) { - btrfs_debug(fs_info, + while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { + btrfs_debug(ctx->fs_info, "root %llu references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); - ret = iterate_leaf_refs(fs_info, - (struct extent_inode_elem *) - (uintptr_t)ref_node->aux, - root_node->val, - extent_item_objectid, - iterate, ctx); + ret = iterate_leaf_refs(ctx->fs_info, inode_list, + root_node->val, ctx->bytenr, + iterate, user_ctx); } - ulist_free(roots); + ulist_reinit(ctx->roots); } free_leaf_list(refs); out: - if (trans) { - 
btrfs_put_tree_mod_seq(fs_info, &seq_elem);
-		btrfs_end_transaction(trans);
+	if (ctx->trans) {
+		btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem);
+		btrfs_end_transaction(ctx->trans);
+		ctx->trans = NULL;
 	} else {
-		up_read(&fs_info->commit_root_sem);
+		up_read(&ctx->fs_info->commit_root_sem);
 	}
+
+	ulist_free(ctx->roots);
+	ctx->roots = NULL;
+
+	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP)
+		ret = 0;
+
 	return ret;
 }
 
-static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx)
 {
 	struct btrfs_data_container *inodes = ctx;
 	const size_t c = 3 * sizeof(u64);
@@ -2052,8 +2478,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 			 struct btrfs_path *path,
 			 void *ctx, bool ignore_offset)
 {
+	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
 	int ret;
-	u64 extent_item_pos;
 	u64 flags = 0;
 	struct btrfs_key found_key;
 	int search_commit_root = path->search_commit_root;
@@ -2065,12 +2491,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return -EINVAL;
 
-	extent_item_pos = logical - found_key.objectid;
-	ret = iterate_extent_inodes(fs_info, found_key.objectid,
-					extent_item_pos, search_commit_root,
-					build_ino_list, ctx, ignore_offset);
+	walk_ctx.bytenr = found_key.objectid;
+	if (ignore_offset)
+		walk_ctx.ignore_extent_item_pos = true;
+	else
+		walk_ctx.extent_item_pos = logical - found_key.objectid;
+	walk_ctx.fs_info = fs_info;
 
-	return ret;
+	return iterate_extent_inodes(&walk_ctx, search_commit_root,
+				     build_ino_list, ctx);
 }
 
 static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
@@ -2323,12 +2752,11 @@ void free_ipath(struct inode_fs_paths *ipath)
 	kfree(ipath);
 }
 
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(
-		struct btrfs_fs_info *fs_info, gfp_t gfp_flag)
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_backref_iter *ret;
 
-	ret = kzalloc(sizeof(*ret), gfp_flag);
+	ret = kzalloc(sizeof(*ret), GFP_NOFS);
 	if (!ret)
 		return NULL;
 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 2759de7d324c..ef6bbea3f456 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -7,18 +7,193 @@
 #define BTRFS_BACKREF_H
 
 #include <linux/btrfs.h>
+#include "messages.h"
 #include "ulist.h"
 #include "disk-io.h"
 #include "extent_io.h"
 
+/*
+ * Used by implementations of iterate_extent_inodes_t (see definition below) to
+ * signal that backref iteration can stop immediately and no error happened.
+ * The value must be non-negative and must not be 0, 1 (which is a common return
+ * value from things like btrfs_search_slot() and used internally in the backref
+ * walking code) and different from BACKREF_FOUND_SHARED and
+ * BACKREF_FOUND_NOT_SHARED
+ */
+#define BTRFS_ITERATE_EXTENT_INODES_STOP 5
+
+/*
+ * Should return 0 if no errors happened and iteration of backrefs should
+ * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero
+ * value to immediately stop iteration and possibly signal an error back to
+ * the caller.
+ */
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes,
+				      u64 root, void *ctx);
+
+/*
+ * Context and arguments for backref walking functions. Some of the fields are
+ * to be filled by the caller of such functions while others are filled by the
+ * functions themselves, as described below.
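+ *
+ * A hedged initialization sketch (illustrative only; extent_bytenr and
+ * offset_in_extent are hypothetical caller-side values):
+ *
+ *	struct btrfs_backref_walk_ctx ctx = { 0 };
+ *
+ *	ctx.fs_info = fs_info;
+ *	ctx.bytenr = extent_bytenr;
+ *	ctx.extent_item_pos = offset_in_extent;
+ *
+ * The walking functions then fill or consume ctx.refs and ctx.roots as
+ * described for each field.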
+ */ +struct btrfs_backref_walk_ctx { + /* + * The address of the extent for which we are doing backref walking. + * Can be either a data extent or a metadata extent. + * + * Must always be set by the top level caller. + */ + u64 bytenr; + /* + * Offset relative to the target extent. This is only used for data + * extents, and it's meaningful because we can have file extent items + * that point only to a section of a data extent ("bookend" extents), + * and we want to filter out any that don't point to a section of the + * data extent containing the given offset. + * + * Must always be set by the top level caller. + */ + u64 extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then references from + * all file extent items that point to the data extent are considered, + * @extent_item_pos is ignored. + */ + bool ignore_extent_item_pos; + /* A valid transaction handle or NULL. */ + struct btrfs_trans_handle *trans; + /* + * The file system's info object, can not be NULL. + * + * Must always be set by the top level caller. + */ + struct btrfs_fs_info *fs_info; + /* + * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the + * caller joined the tree mod log to get a consistent view of b+trees + * while we do backref walking, or BTRFS_SEQ_LAST. + * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses + * commit roots when searching b+trees - this is a special case for + * qgroups used during a transaction commit. + */ + u64 time_seq; + /* + * Used to collect the bytenr of metadata extents that point to the + * target extent. + */ + struct ulist *refs; + /* + * List used to collect the IDs of the roots from which the target + * extent is accessible. Can be NULL in case the caller does not care + * about collecting root IDs. + */ + struct ulist *roots; + /* + * Used by iterate_extent_inodes() and the main backref walk code + * (find_parent_nodes()). Lookup and store functions for an optional + * cache which maps the logical address (bytenr) of leaves to an array + * of root IDs. + */ + bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, + const u64 **root_ids_ret, int *root_count_ret); + void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, + void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this + * for each indirect data extent reference as soon as it finds one, + * before collecting all the remaining backrefs and before resolving + * indirect backrefs. This allows for the caller to terminate backref + * walking as soon as it finds one backref that matches some specific + * criteria. The @cache_lookup and @cache_store callbacks should not + * be NULL in order to use this callback. + */ + iterate_extent_inodes_t *indirect_ref_iterator; + /* + * If this is not NULL, then the backref walking code will call this for + * each extent item it's meant to process before it actually starts + * processing it. If this returns anything other than 0, then it stops + * the backref walking code immediately. + */ + int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this for + * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before + * processing that data ref. 
If this callback returns false, then it will
+	 * ignore this data ref and it will never resolve the indirect data ref,
+	 * saving time searching for leaves in a fs tree with file extent items
+	 * matching the data ref.
+	 */
+	bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx);
+	/* Context object to pass to the callbacks defined above. */
+	void *user_ctx;
+};
+
 struct inode_fs_paths {
 	struct btrfs_path *btrfs_path;
 	struct btrfs_root *fs_root;
 	struct btrfs_data_container *fspath;
 };
 
-typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
-				      void *ctx);
+struct btrfs_backref_shared_cache_entry {
+	u64 bytenr;
+	u64 gen;
+	bool is_shared;
+};
+
+#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8
+
+struct btrfs_backref_share_check_ctx {
+	/* Ulists used during backref walking. */
+	struct ulist refs;
+	/*
+	 * The current leaf the caller of btrfs_is_data_extent_shared() is at.
+	 * Typically the caller (at the moment only fiemap) tries to determine
+	 * the sharedness of data extents pointed to by file extent items from
+	 * entire leaves.
+	 */
+	u64 curr_leaf_bytenr;
+	/*
+	 * The previous leaf the caller was at in the previous call to
+	 * btrfs_is_data_extent_shared(). This may be the same as the current
+	 * leaf. On the first call it must be 0.
+	 */
+	u64 prev_leaf_bytenr;
+	/*
+	 * A path from a root to a leaf that has a file extent item pointing to
+	 * a given data extent should never exceed the maximum b+tree height.
+	 */
+	struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL];
+	bool use_path_cache;
+	/*
+	 * Cache the sharedness result for the last few extents we have found,
+	 * but only for extents for which we have multiple file extent items
+	 * that point to them.
+	 * It's very common to have several file extent items that point to the
+	 * same extent (bytenr) but with different offsets and lengths. This
+	 * typically happens for COW writes, partial writes into prealloc
+	 * extents, NOCOW writes after snapshotting a root, hole punching or
+	 * reflinking within the same file (less common perhaps).
+	 * So keep a small cache with the lookup results for the extent pointed
+	 * to by the last few file extent items. This cache is checked, with a
+	 * linear scan, whenever btrfs_is_data_extent_shared() is called, so
+	 * it must be small so that it does not negatively affect performance in
+	 * case we don't have multiple file extent items that point to the same
+	 * data extent.
+	 */
+	struct {
+		u64 bytenr;
+		bool is_shared;
+	} prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE];
+	/*
+	 * The slot in the prev_extents_cache array that will be used for
+	 * storing the sharedness result of a new data extent.
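+	 * Slots are reused in a round-robin fashion: after a store, the slot
+	 * advances to (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE,
+	 * evicting the oldest cached extent.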
+ */ + int prev_extents_cache_slot; +}; + +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, @@ -28,11 +203,9 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_key *key, struct btrfs_extent_item *ei, u32 item_size, u64 *out_root, u8 *out_level); -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, - u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset); +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, @@ -40,13 +213,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset); -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx); +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, @@ -62,8 +230,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, - struct ulist *roots, struct ulist *tmp_ulist); +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + u64 extent_gen, + struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); @@ -94,8 +263,7 @@ struct btrfs_backref_iter { u32 end_ptr; }; -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag); +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) { diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c new file mode 100644 index 000000000000..b8fb7ef6b520 --- /dev/null +++ b/fs/btrfs/bio.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#include <linux/bio.h> +#include "bio.h" +#include "ctree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "dev-replace.h" +#include "rcu-string.h" +#include "zoned.h" + +static struct bio_set btrfs_bioset; + +/* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. + */ +static inline void btrfs_bio_init(struct btrfs_bio *bbio, + btrfs_bio_end_io_t end_io, void *private) +{ + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->end_io = end_io; + bbio->private = private; +} + +/* + * Allocate a btrfs_bio structure. 
The btrfs_bio is the main I/O container for + * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio), end_io, private); + return bio; +} + +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + struct btrfs_bio *bbio; + + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, end_io, private); + + bio_trim(bio, offset >> 9, size >> 9); + bbio->iter = bio->bi_iter; + return bio; +} + +static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +{ + if (!dev || !dev->bdev) + return; + if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + return; + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); +} + +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + if (bio->bi_opf & REQ_META) + return fs_info->endio_meta_workers; + return fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + + bbio->end_io(bbio); +} + +static void btrfs_simple_end_io(struct bio *bio) +{ + struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) + btrfs_log_dev_io_error(bio, bbio->device); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { + bbio->end_io(bbio); + } +} + +static void btrfs_raid56_end_io(struct bio *bio) +{ + struct btrfs_io_context *bioc = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; + bbio->end_io(bbio); + + btrfs_put_bioc(bioc); +} + +static void btrfs_orig_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + + if (bio->bi_status) { + atomic_inc(&bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. 
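+	 * For example, with a RAID1 profile a single failed mirror write is
+	 * within the tolerance (illustrative: max_errors is 1 there), so the
+	 * whole bio still completes with BLK_STS_OK.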
+ */
+	if (atomic_read(&bioc->error) > bioc->max_errors)
+		bio->bi_status = BLK_STS_IOERR;
+	else
+		bio->bi_status = BLK_STS_OK;
+
+	bbio->end_io(bbio);
+	btrfs_put_bioc(bioc);
+}
+
+static void btrfs_clone_write_end_io(struct bio *bio)
+{
+	struct btrfs_io_stripe *stripe = bio->bi_private;
+
+	if (bio->bi_status) {
+		atomic_inc(&stripe->bioc->error);
+		btrfs_log_dev_io_error(bio, stripe->dev);
+	}
+
+	/* Pass on control to the original bio this one was cloned from */
+	bio_endio(stripe->bioc->orig_bio);
+	bio_put(bio);
+}
+
+static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
+{
+	if (!dev || !dev->bdev ||
+	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+		bio_io_error(bio);
+		return;
+	}
+
+	bio_set_dev(bio, dev->bdev);
+
+	/*
+	 * For zone append writing, bi_sector must point to the beginning of
+	 * the zone.
+	 */
+	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+		if (btrfs_dev_is_sequential(dev, physical)) {
+			u64 zone_start = round_down(physical,
+						    dev->fs_info->zone_size);
+
+			bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+		} else {
+			bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
+			bio->bi_opf |= REQ_OP_WRITE;
+		}
+	}
+	btrfs_debug_in_rcu(dev->fs_info,
+	"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
+		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
+		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
+		dev->devid, bio->bi_iter.bi_size);
+
+	btrfsic_check_bio(bio);
+	submit_bio(bio);
+}
+
+static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
+{
+	struct bio *orig_bio = bioc->orig_bio, *bio;
+
+	ASSERT(bio_op(orig_bio) != REQ_OP_READ);
+
+	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
+	if (dev_nr == bioc->num_stripes - 1) {
+		bio = orig_bio;
+		bio->bi_end_io = btrfs_orig_write_end_io;
+	} else {
+		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+		bio_inc_remaining(orig_bio);
+		bio->bi_end_io = btrfs_clone_write_end_io;
	}
+
+	bio->bi_private = &bioc->stripes[dev_nr];
+	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
+	bioc->stripes[dev_nr].bioc = bioc;
+	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
+}
+
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
+{
+	u64 logical = bio->bi_iter.bi_sector << 9;
+	u64 length = bio->bi_iter.bi_size;
+	u64 map_length = length;
+	struct btrfs_io_context *bioc = NULL;
+	struct btrfs_io_stripe smap;
+	int ret;
+
+	btrfs_bio_counter_inc_blocked(fs_info);
+	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+				&bioc, &smap, &mirror_num, 1);
+	if (ret) {
+		btrfs_bio_counter_dec(fs_info);
+		btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+		return;
+	}
+
+	if (map_length < length) {
+		btrfs_crit(fs_info,
+			   "mapping failed logical %llu bio len %llu len %llu",
+			   logical, length, map_length);
+		BUG();
+	}
+
+	if (!bioc) {
+		/* Single mirror read/write fast path */
+		btrfs_bio(bio)->mirror_num = mirror_num;
+		btrfs_bio(bio)->device = smap.dev;
+		bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+		bio->bi_private = fs_info;
+		bio->bi_end_io = btrfs_simple_end_io;
+		btrfs_submit_dev_bio(smap.dev, bio);
+	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		/* Parity RAID write or read recovery */
+		bio->bi_private = bioc;
+		bio->bi_end_io = btrfs_raid56_end_io;
+		if (bio_op(bio) ==
REQ_OP_READ)
+			raid56_parity_recover(bio, bioc, mirror_num);
+		else
+			raid56_parity_write(bio, bioc);
+	} else {
+		/* Write to multiple mirrors */
+		int total_devs = bioc->num_stripes;
+		int dev_nr;
+
+		bioc->orig_bio = bio;
+		for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+			btrfs_submit_mirrored_bio(bioc, dev_nr);
+	}
+}
+
+/*
+ * Submit a repair write.
+ *
+ * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
+ * RAID setup. Here we only want to write the one bad copy, so we do the
+ * mapping ourselves and submit the bio directly.
+ *
+ * The I/O is issued synchronously to block the repair read completion from
+ * freeing the bio.
+ */
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+			    u64 length, u64 logical, struct page *page,
+			    unsigned int pg_offset, int mirror_num)
+{
+	struct btrfs_device *dev;
+	struct bio_vec bvec;
+	struct bio bio;
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_io_context *bioc = NULL;
+	int ret = 0;
+
+	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
+	BUG_ON(!mirror_num);
+
+	if (btrfs_repair_one_zone(fs_info, logical))
+		return 0;
+
+	map_length = length;
+
+	/*
+	 * Avoid races with device replace and make sure our bioc has devices
+	 * associated to its stripes that don't go away while we are doing the
+	 * read repair operation.
+	 */
+	btrfs_bio_counter_inc_blocked(fs_info);
+	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
+		/*
+		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+		 * to update all raid stripes, but here we just want to correct
+		 * the bad stripe, thus BTRFS_MAP_READ is abused to only get
+		 * the bad stripe's dev and sector.
+		 */
+		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+				      &map_length, &bioc, 0);
+		if (ret)
+			goto out_counter_dec;
+		ASSERT(bioc->mirror_num == 1);
+	} else {
+		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+				      &map_length, &bioc, mirror_num);
+		if (ret)
+			goto out_counter_dec;
+		BUG_ON(mirror_num != bioc->mirror_num);
+	}
+
+	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
+	dev = bioc->stripes[bioc->mirror_num - 1].dev;
+	btrfs_put_bioc(bioc);
+
+	if (!dev || !dev->bdev ||
+	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+		ret = -EIO;
+		goto out_counter_dec;
+	}
+
+	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+	bio.bi_iter.bi_sector = sector;
+	__bio_add_page(&bio, page, length, pg_offset);
+
+	btrfsic_check_bio(&bio);
+	ret = submit_bio_wait(&bio);
+	if (ret) {
+		/* try to remap that extent elsewhere? */
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		goto out_bio_uninit;
+	}
+
+	btrfs_info_rl_in_rcu(fs_info,
+		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
+			     ino, start, btrfs_dev_name(dev), sector);
+	ret = 0;
+
+out_bio_uninit:
+	bio_uninit(&bio);
+out_counter_dec:
+	btrfs_bio_counter_dec(fs_info);
+	return ret;
+}
+
+int __init btrfs_bioset_init(void)
+{
+	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+			offsetof(struct btrfs_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -ENOMEM;
+	return 0;
+}
+
+void __cold btrfs_bioset_exit(void)
+{
+	bioset_exit(&btrfs_bioset);
+}
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
new file mode 100644
index 000000000000..b12f84b3b341
--- /dev/null
+++ b/fs/btrfs/bio.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2022 Christoph Hellwig.
+ */ + +#ifndef BTRFS_BIO_H +#define BTRFS_BIO_H + +#include <linux/bio.h> +#include <linux/workqueue.h> +#include "tree-checker.h" + +struct btrfs_bio; +struct btrfs_fs_info; + +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + +/* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + +/* + * Additional info to pass along bio. + * + * Mostly for btrfs specific features like csum and mirror_num. + */ +struct btrfs_bio { + unsigned int mirror_num:7; + + /* + * Extra indicator for metadata bios. + * For some btrfs bios they use pages without a mapping, thus + * we can not rely on page->mapping->host to determine if + * it's a metadata bio. + */ + unsigned int is_metadata:1; + struct bvec_iter iter; + + /* for direct I/O */ + u64 file_offset; + + /* @device is for stripe IO submission. */ + struct btrfs_device *device; + union { + /* For data checksum verification. */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + }; + + /* For metadata parentness verification. */ + struct btrfs_tree_parent_check parent_check; + }; + + /* End I/O information supplied to btrfs_bio_alloc */ + btrfs_bio_end_io_t end_io; + void *private; + + /* For read end I/O handling */ + struct work_struct end_io_work; + + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_bio but relies on bio being last. + */ + struct bio bio; +}; + +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_bio, bio); +} + +int __init btrfs_bioset_init(void); +void __cold btrfs_bioset_exit(void); + +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private); + + +static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + bbio->end_io(bbio); +} + +static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) +{ + if (bbio->is_metadata) + return; + if (bbio->csum != bbio->csum_inline) { + kfree(bbio->csum); + bbio->csum = NULL; + } +} + +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. 
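The csum/csum_inline pair above is a small-buffer optimization: checksums that fit in the 64-byte inline array avoid a heap allocation, and btrfs_bio_free_csum() frees only the spilled case. A self-contained sketch of the same pattern, under stand-in names:

#include <stdlib.h>

#define INLINE_CSUM_SIZE 64

struct csum_buf {
	unsigned char *csum;			/* points at inline or heap */
	unsigned char csum_inline[INLINE_CSUM_SIZE];
};

static int csum_buf_setup(struct csum_buf *b, size_t needed)
{
	if (needed <= INLINE_CSUM_SIZE) {
		b->csum = b->csum_inline;	/* common case: no allocation */
	} else {
		b->csum = malloc(needed);
		if (!b->csum)
			return -1;
	}
	return 0;
}

static void csum_buf_free(struct csum_buf *b)
{
	/* Mirrors btrfs_bio_free_csum(): only the heap case is freed. */
	if (b->csum != b->csum_inline)
		free(b->csum);
	b->csum = NULL;
}

int main(void)
{
	struct csum_buf small, big;

	if (csum_buf_setup(&small, 32))		/* stays inline */
		return 1;
	if (csum_buf_setup(&big, 4096))		/* spills to the heap */
		return 1;
	csum_buf_free(&small);
	csum_buf_free(&big);
	return 0;
}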
+ * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iter - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); + +#endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e0375ba9d0fe..708d843daa72 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -17,6 +17,21 @@ #include "discard.h" #include "raid56.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" + +#ifdef CONFIG_BTRFS_DEBUG +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -284,7 +299,7 @@ struct btrfs_block_group *btrfs_next_block_group( return cache; } -/** +/* * Check if we can do a NOCOW write for a given extent. * * @fs_info: The filesystem information object. @@ -325,11 +340,9 @@ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, return bg; } -/** +/* * Decrement the number of NOCOW writers in a block group. * - * @bg: The block group. - * * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), * and on the block group returned by that call.
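The NOCOW writer accounting described here follows a classic acquire/release contract: the increment either hands back the block group with its writer count raised or returns NULL, and only a successful increment may be paired with the decrement. A stripped-down userspace sketch of that contract (stand-in types, locking omitted):

#include <stdio.h>

struct block_group { int nocow_writers; int can_nocow; };

static struct block_group *inc_nocow_writers(struct block_group *bg)
{
	if (!bg->can_nocow)
		return NULL;		/* caller must fall back to COW */
	bg->nocow_writers++;
	return bg;			/* caller now owns one decrement */
}

static void dec_nocow_writers(struct block_group *bg)
{
	bg->nocow_writers--;		/* pairs with a successful inc */
}

int main(void)
{
	struct block_group bg = { 0, 1 };
	struct block_group *held = inc_nocow_writers(&bg);

	if (held) {
		/* ... create the ordered extent for the NOCOW write ... */
		dec_nocow_writers(held);
	}
	printf("writers now %d\n", bg.nocow_writers);
	return 0;
}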
Typically this is called after * creating an ordered extent for a NOCOW write, to prevent races with scrub and @@ -593,8 +606,6 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - if (wakeup) - caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); @@ -618,9 +629,6 @@ next: key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - - if (wakeup) - caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -655,7 +663,6 @@ next: total_found += add_new_free_space(block_group, last, block_group->start + block_group->length); - caching_ctl->progress = (u64)-1; out: btrfs_free_path(path); @@ -725,8 +732,6 @@ done: } #endif - caching_ctl->progress = (u64)-1; - up_read(&fs_info->commit_root_sem); btrfs_free_excluded_extents(block_group); mutex_unlock(&caching_ctl->mutex); @@ -755,7 +760,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) mutex_init(&caching_ctl->mutex); init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; - caching_ctl->progress = cache->start; refcount_set(&caching_ctl->count, 2); btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); @@ -772,7 +776,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl; cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; spin_unlock(&cache->lock); write_lock(&fs_info->block_group_cache_lock); @@ -988,32 +991,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, kobject_put(kobj); } - if (block_group->has_caching_ctl) - caching_ctl = btrfs_get_caching_control(block_group); if (block_group->cached == BTRFS_CACHE_STARTED) btrfs_wait_block_group_cache_done(block_group); - if (block_group->has_caching_ctl) { - write_lock(&fs_info->block_group_cache_lock); - if (!caching_ctl) { - struct btrfs_caching_control *ctl; - - list_for_each_entry(ctl, - &fs_info->caching_block_groups, list) - if (ctl->block_group == block_group) { - caching_ctl = ctl; - refcount_inc(&caching_ctl->count); - break; - } - } - if (caching_ctl) - list_del_init(&caching_ctl->list); - write_unlock(&fs_info->block_group_cache_lock); - if (caching_ctl) { - /* Once for the caching bgs list and once for us. */ - btrfs_put_caching_control(caching_ctl); - btrfs_put_caching_control(caching_ctl); + + write_lock(&fs_info->block_group_cache_lock); + caching_ctl = btrfs_get_caching_control(block_group); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, &fs_info->caching_block_groups, list) { + if (ctl->block_group == block_group) { + caching_ctl = ctl; + refcount_inc(&caching_ctl->count); + break; + } } } + if (caching_ctl) + list_del_init(&caching_ctl->list); + write_unlock(&fs_info->block_group_cache_lock); + + if (caching_ctl) { + /* Once for the caching bgs list and once for us. 
*/ + btrfs_put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); + } spin_lock(&trans->transaction->dirty_bgs_lock); WARN_ON(!list_empty(&block_group->dirty_list)); @@ -1034,12 +1036,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); - WARN_ON(block_group->zone_is_active && + WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags) && block_group->space_info->active_total_bytes < block_group->length); } block_group->space_info->total_bytes -= block_group->length; - if (block_group->zone_is_active) + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) block_group->space_info->active_total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); @@ -1069,7 +1072,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; spin_lock(&block_group->lock); - block_group->removed = 1; + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); + /* * At this point trimming or scrub can't start on this block group, * because we removed the block group from the rbtree @@ -1304,6 +1308,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + if (btrfs_fs_closing(fs_info)) + return; + /* * Long running balances can keep us blocked here for eternity, so * simply skip deletion if we're unable to get the mutex. @@ -1533,6 +1540,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) return true; } +static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = mult_perc(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. + */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1543,6 +1574,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + if (btrfs_fs_closing(fs_info)) + return; + if (!btrfs_should_reclaim(fs_info)) return; @@ -1597,6 +1631,40 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. + * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. 
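should_reclaim_block_group() above encodes a one-way threshold crossing: reclaim fires only when the bytes being freed take usage from at-or-above the percentage threshold to below it, so block groups that were always below it (typically brand new ones) are left alone. A compilable sketch of that check, modelling mult_perc() simply as num * percent / 100:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long u64;

/* Stand-in for the kernel's mult_perc() helper. */
static u64 mult_perc(u64 num, int percent)
{
	return num * percent / 100;
}

static bool should_reclaim(u64 length, u64 used, u64 freed, int thresh_pct)
{
	u64 thresh = mult_perc(length, thresh_pct);
	u64 new_val = used;
	u64 old_val = new_val + freed;

	if (thresh_pct == 0)
		return false;
	if (old_val < thresh)		/* was already below: likely new */
		return false;
	if (new_val >= thresh)		/* still above: nothing to do yet */
		return false;
	return true;
}

int main(void)
{
	const u64 gib = 1024ULL * 1024 * 1024;	/* 1 GiB block group */

	/* 900 MiB -> 700 MiB crosses a 75% (768 MiB) threshold: reclaim. */
	printf("%d\n", should_reclaim(gib, 700ULL << 20, 200ULL << 20, 75));
	/* 150 MiB -> 100 MiB was always below the threshold: leave it. */
	printf("%d\n", should_reclaim(gib, 100ULL << 20, 50ULL << 20, 75));
	return 0;
}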
+ */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + + } + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } spin_unlock(&bg->lock); /* Get out fast, in case we're unmounting the filesystem */ @@ -1743,8 +1811,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } -/** - * Map a physical disk address to a list of logical addresses +/* + * Map a physical disk address to a list of logical addresses. * * @fs_info: the filesystem * @chunk_start: logical address of block group @@ -1890,16 +1958,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) return 0; } -static void link_block_group(struct btrfs_block_group *cache) -{ - struct btrfs_space_info *space_info = cache->space_info; - int index = btrfs_bg_flags_to_raid_index(cache->flags); - - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); -} - static struct btrfs_block_group *btrfs_create_block_group_cache( struct btrfs_fs_info *fs_info, u64 start) { @@ -1937,7 +1995,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); - btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); + cache->full_stripe_locks_root.root = RB_ROOT; + mutex_init(&cache->full_stripe_locks_root.lock); return cache; } @@ -2002,7 +2061,6 @@ static int read_one_block_group(struct btrfs_fs_info *info, int need_clear) { struct btrfs_block_group *cache; - struct btrfs_space_info *space_info; const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); int ret; @@ -2014,6 +2072,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); + cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); @@ -2078,11 +2137,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, /* Should not have any excluded extents. Just in case, though. 
*/ btrfs_free_excluded_extents(cache); } else if (cache->length == cache->used) { - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, cache->start, cache->start + cache->length); @@ -2095,14 +2152,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } trace_btrfs_add_block_group(info, cache, 0); - btrfs_update_space_info(info, cache->flags, cache->length, - cache->used, cache->bytes_super, - cache->zone_unusable, cache->zone_is_active, - &space_info); - - cache->space_info = space_info; - - link_block_group(cache); + btrfs_add_bg_to_space_info(info, cache); set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_writeable(info, cache->start)) { @@ -2126,7 +2176,6 @@ error: static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct btrfs_space_info *space_info; struct rb_node *node; int ret = 0; @@ -2146,7 +2195,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) /* Fill dummy cache as FULL */ bg->length = em->len; bg->flags = map->type; - bg->last_byte_to_unpin = (u64)-1; bg->cached = BTRFS_CACHE_FINISHED; bg->used = em->len; bg->flags = map->type; @@ -2167,10 +2215,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) break; } - btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, 0, false, &space_info); - bg->space_info = space_info; - link_block_group(bg); + btrfs_add_bg_to_space_info(fs_info, bg); set_avail_alloc_bits(fs_info, bg->flags); } @@ -2190,7 +2235,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) int need_clear = 0; u64 cache_gen; - if (!root) + /* + * Either no extent root (with ibadroots rescue option) or we have + * unsupported RO options. The fs can never be mounted read-write, so no + * need to waste time searching block group items. + * + * This also allows new extent tree related changes to be RO compat, + * no need for a full incompat flag. 
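The check that follows implements this gate by masking the superblock's compat_ro bits against the supported set: any unknown bit means the filesystem can only ever be mounted read-only, so dummy block groups suffice. A userspace model of the mask test; the flag values below are illustrative, not the on-disk bit assignments:

#include <stdbool.h>
#include <stdio.h>

#define COMPAT_RO_FREE_SPACE_TREE	(1ULL << 0)
#define COMPAT_RO_BLOCK_GROUP_TREE	(1ULL << 3)
#define COMPAT_RO_SUPP			(COMPAT_RO_FREE_SPACE_TREE | \
					 COMPAT_RO_BLOCK_GROUP_TREE)

static bool has_unsupported_ro_feature(unsigned long long compat_ro_flags)
{
	/* Any bit outside the supported mask is unknown to this kernel. */
	return (compat_ro_flags & ~COMPAT_RO_SUPP) != 0;
}

int main(void)
{
	printf("%d\n", has_unsupported_ro_feature(COMPAT_RO_FREE_SPACE_TREE)); /* 0 */
	printf("%d\n", has_unsupported_ro_feature(1ULL << 7));		       /* 1 */
	return 0;
}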
+ */ + if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP)) return fill_dummy_bgs(info); key.objectid = 0; @@ -2425,7 +2479,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); - if (!block_group->chunk_item_inserted) { + if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, + &block_group->runtime_flags)) { mutex_lock(&fs_info->chunk_mutex); ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); mutex_unlock(&fs_info->chunk_mutex); @@ -2494,12 +2549,11 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran set_free_space_tree_thresholds(cache); cache->used = bytes_used; cache->flags = type; - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->global_root_id = calculate_global_root_id(fs_info, cache->start); if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) - cache->needs_free_space = 1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); ret = btrfs_load_block_group_zone_info(cache, true); if (ret) { @@ -2519,14 +2573,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran btrfs_free_excluded_extents(cache); -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(cache)) { - u64 new_bytes_used = size - bytes_used; - - bytes_used += new_bytes_used >> 1; - fragment_free_space(cache); - } -#endif /* * Ensure the corresponding space_info object is created and * assigned to our block group. We want our bg to be added to the rbtree @@ -2547,12 +2593,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran * the rbtree, update the space info's counters. */ trace_btrfs_add_block_group(fs_info, cache, 1); - btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, cache->zone_unusable, - cache->zone_is_active, &cache->space_info); + btrfs_add_bg_to_space_info(fs_info, cache); btrfs_update_global_block_rsv(fs_info); - link_block_group(cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { + u64 new_bytes_used = size - bytes_used; + + cache->space_info->bytes_used += new_bytes_used >> 1; + fragment_free_space(cache); + } +#endif list_add_tail(&cache->bg_list, &trans->new_bgs); trans->delayed_ref_updates++; @@ -2713,6 +2764,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_block_group_item bgi; struct btrfs_key key; + u64 old_commit_used; + u64 used; + + /* + * Block group items update can be triggered out of commit transaction + * critical section, thus we need a consistent view of used bytes. + * We cannot use cache->used directly outside of the spin lock, as it + * may be changed. + */ + spin_lock(&cache->lock); + old_commit_used = cache->commit_used; + used = cache->used; + /* No change in used bytes, can safely skip it. 
*/ + if (cache->commit_used == used) { + spin_unlock(&cache->lock); + return 0; + } + cache->commit_used = used; + spin_unlock(&cache->lock); key.objectid = cache->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2727,7 +2797,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, cache->used); + btrfs_set_stack_block_group_used(&bgi, used); btrfs_set_stack_block_group_chunk_objectid(&bgi, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); @@ -2735,6 +2805,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); + /* We didn't update the block group item, need to revert @commit_used. */ + if (ret < 0) { + spin_lock(&cache->lock); + cache->commit_used = old_commit_used; + spin_unlock(&cache->lock); + } return ret; } @@ -2869,7 +2945,7 @@ again: cache_size *= fs_info->sectorsize; ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, - cache_size); + cache_size, false); if (ret) goto out_put; @@ -3232,31 +3308,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } -static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, - u64 bytes_freed) -{ - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); - const u64 new_val = bg->used; - const u64 old_val = new_val + bytes_freed; - u64 thresh; - - if (reclaim_thresh == 0) - return false; - - thresh = div_factor_fine(bg->length, reclaim_thresh); - - /* - * If we were below the threshold before don't reclaim, we are likely a - * brand new block group and we don't want to relocate new block groups. - */ - if (old_val < thresh) - return false; - if (new_val >= thresh) - return false; - return true; -} - int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { @@ -3368,8 +3419,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, return ret; } -/** - * btrfs_add_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @ram_bytes: The number of bytes of file content, and will be same to * @num_bytes except for the compress path. @@ -3412,8 +3464,9 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, return ret; } -/** - * btrfs_free_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. 
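A distilled model of the update_block_group_item() flow introduced above: snapshot the used counter under the lock, skip the on-disk item update when nothing changed since the last commit, and roll commit_used back if the update later fails. Locking is reduced to comments and all names are stand-ins:

typedef unsigned long long u64;

struct bg { u64 used; u64 commit_used; };

static int update_item_on_disk(u64 used) { (void)used; return 0; /* stub */ }

static int update_block_group_item(struct bg *bg)
{
	u64 old_commit_used, used;
	int ret;

	/* spin_lock(&bg->lock); */
	old_commit_used = bg->commit_used;
	used = bg->used;
	if (bg->commit_used == used) {
		/* spin_unlock(&bg->lock); */
		return 0;	/* nothing changed: skip the tree update */
	}
	bg->commit_used = used;
	/* spin_unlock(&bg->lock); */

	ret = update_item_on_disk(used);
	if (ret < 0)
		bg->commit_used = old_commit_used;	/* revert on failure */
	return ret;
}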
+ * * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @delalloc: The blocks are allocated for the delalloc write @@ -3470,13 +3523,13 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(fs_info->super_copy); - thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) return 1; } - if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) + if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) return 0; return 1; } @@ -3965,35 +4018,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; - u64 last = 0; - while (1) { - struct inode *inode; + block_group = btrfs_lookup_first_block_group(info, 0); + while (block_group) { + btrfs_wait_block_group_cache_done(block_group); + spin_lock(&block_group->lock); + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, + &block_group->runtime_flags)) { + struct inode *inode = block_group->inode; - block_group = btrfs_lookup_first_block_group(info, last); - while (block_group) { - btrfs_wait_block_group_cache_done(block_group); - spin_lock(&block_group->lock); - if (block_group->iref) - break; + block_group->inode = NULL; spin_unlock(&block_group->lock); - block_group = btrfs_next_block_group(block_group); - } - if (!block_group) { - if (last == 0) - break; - last = 0; - continue; - } - inode = block_group->inode; - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - ASSERT(block_group->io_ctl.inode == NULL); - iput(inode); - last = block_group->start + block_group->length; - btrfs_put_block_group(block_group); + ASSERT(block_group->io_ctl.inode == NULL); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + block_group = btrfs_next_block_group(block_group); } } @@ -4129,7 +4171,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) spin_lock(&block_group->lock); cleanup = (atomic_dec_and_test(&block_group->frozen) && - block_group->removed); + test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)); spin_unlock(&block_group->lock); if (cleanup) { @@ -4150,7 +4192,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) * tasks trimming this block group have left 1 entry each one. * Free them if any. */ - __btrfs_remove_free_space_cache(block_group->free_space_ctl); + btrfs_remove_free_space_cache(block_group); } } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 6b3cdc4cbc41..a02ea76fd6cf 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -46,19 +46,48 @@ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_FORCE_FOR_EXTENT, }; +/* Block group flags set at runtime */ +enum btrfs_block_group_flags { + BLOCK_GROUP_FLAG_IREF, + BLOCK_GROUP_FLAG_REMOVED, + BLOCK_GROUP_FLAG_TO_COPY, + BLOCK_GROUP_FLAG_RELOCATING_REPAIR, + BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, + BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + /* Does the block group need to be added to the free space tree? 
*/ + BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Indicate that the block group is placed on a sequential zone */ + BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, +}; + +enum btrfs_caching_type { + BTRFS_CACHE_NO, + BTRFS_CACHE_STARTED, + BTRFS_CACHE_FINISHED, + BTRFS_CACHE_ERROR, +}; + struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; struct btrfs_work work; struct btrfs_block_group *block_group; - u64 progress; refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ #define CACHING_CTL_WAKE_UP SZ_2M +/* + * Tree to record all locked full stripes of a RAID5/6 block group + */ +struct btrfs_full_stripe_locks_tree { + struct rb_root root; + struct mutex lock; +}; + struct btrfs_block_group { struct btrfs_fs_info *fs_info; struct inode *inode; @@ -75,6 +104,12 @@ struct btrfs_block_group { u64 global_root_id; /* + * The last committed used bytes of this block group, if the above @used + * is still the same as @commit_used, we don't need to update block + * group item of this block group. + */ + u64 commit_used; + /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. */ @@ -95,23 +130,15 @@ struct btrfs_block_group { /* For raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; + unsigned long runtime_flags; unsigned int ro; - unsigned int iref:1; - unsigned int has_caching_ctl:1; - unsigned int removed:1; - unsigned int to_copy:1; - unsigned int relocating_repair:1; - unsigned int chunk_item_inserted:1; - unsigned int zone_is_active:1; - unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; /* Cache tracking stuff */ int cached; struct btrfs_caching_control *caching_ctl; - u64 last_byte_to_unpin; struct btrfs_space_info *space_info; @@ -186,15 +213,6 @@ struct btrfs_block_group { struct mutex free_space_lock; /* - * Does the block group need to be added to the free space tree? - * Protected by free_space_lock. - */ - int needs_free_space; - - /* Flag indicating this block group is placed on a sequential zone */ - bool seq_zone; - - /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. 
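The struct changes above replace a pile of single-bit int bitfields (iref:1, removed:1, and so on) with one unsigned long runtime_flags driven by the atomic set_bit()/test_bit()/test_and_clear_bit() family, so independent flags can be flipped without sharing a lock. A userspace approximation using compiler atomics rather than the kernel bitops:

#include <stdbool.h>
#include <stdio.h>

enum bg_runtime_flags { BG_FLAG_IREF, BG_FLAG_REMOVED };

static void set_flag(unsigned long *flags, int bit)
{
	__atomic_fetch_or(flags, 1UL << bit, __ATOMIC_RELAXED);
}

static bool test_and_clear_flag(unsigned long *flags, int bit)
{
	unsigned long old = __atomic_fetch_and(flags, ~(1UL << bit),
					       __ATOMIC_RELAXED);
	return old & (1UL << bit);
}

int main(void)
{
	unsigned long runtime_flags = 0;

	set_flag(&runtime_flags, BG_FLAG_REMOVED);
	printf("%d\n", test_and_clear_flag(&runtime_flags, BG_FLAG_REMOVED)); /* 1 */
	printf("%d\n", test_and_clear_flag(&runtime_flags, BG_FLAG_REMOVED)); /* 0 */
	return 0;
}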
*/ @@ -234,16 +252,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -static inline int btrfs_should_fragment_free_space( - struct btrfs_block_group *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - - return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && - block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(fs_info, FRAGMENT_DATA) && - block_group->flags & BTRFS_BLOCK_GROUP_DATA); -} +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( @@ -305,8 +314,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); -void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, - struct btrfs_caching_control *caching_ctl); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, struct block_device *bdev, u64 physical, u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 06be0644dd37..5367a14d44d2 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -7,6 +7,8 @@ #include "transaction.h" #include "block-group.h" #include "disk-io.h" +#include "fs.h" +#include "accessors.h" /* * HOW DO BLOCK RESERVES WORK @@ -225,7 +227,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -234,7 +236,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) return 0; spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); + num_bytes = mult_perc(block_rsv->size, min_percent); if (block_rsv->reserved >= num_bytes) ret = 0; spin_unlock(&block_rsv->lock); @@ -286,7 +288,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, */ if (block_rsv == delayed_rsv) target = global_rsv; - else if (block_rsv != global_rsv && !delayed_rsv->full) + else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv)) target = delayed_rsv; if (target && block_rsv->space_info != target->space_info) @@ -323,31 +325,6 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = false; - spin_unlock(&global_rsv->lock); - - btrfs_block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; @@ -424,6 +401,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_CSUM_TREE_OBJECTID: case BTRFS_EXTENT_TREE_OBJECTID: case 
BTRFS_FREE_SPACE_TREE_OBJECTID: + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: root->block_rsv = &fs_info->delayed_refs_rsv; break; case BTRFS_ROOT_TREE_OBJECTID: @@ -551,5 +529,17 @@ try_reserve: if (!ret) return global_rsv; } + + /* + * All hope is lost, but of course our reservations are overly + * pessimistic, so instead of possibly having an ENOSPC abort here, try + * one last time to force a reservation if there's enough actual space + * on disk to make the reservation. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + BTRFS_RESERVE_FLUSH_EMERGENCY); + if (!ret) + return block_rsv; + return ERR_PTR(ret); } diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 0c183709be00..4cc41c9aaa82 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -4,6 +4,7 @@ #define BTRFS_BLOCK_RSV_H struct btrfs_trans_handle; +struct btrfs_root; enum btrfs_reserve_flush_enum; /* @@ -62,7 +63,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); @@ -70,9 +71,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, bool update_size); int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes, bool update_size); u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, @@ -92,4 +90,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL); } +/* + * Fast path to check if the reserve is full, may be carefully used outside of + * locks. + */ +static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv) +{ + return data_race(rsv->full); +} + #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b160b8e124e0..195c09e20609 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -65,6 +65,8 @@ enum { * on the same file. */ BTRFS_INODE_VERITY_IN_PROGRESS, + /* Set when this inode is a free space inode. 
*/ + BTRFS_INODE_FREE_SPACE_INODE, }; /* in memory btrfs inode */ @@ -94,7 +96,8 @@ struct btrfs_inode { /* special utility tree used to record which mirrors have already been * tried when checksums fail for a given block */ - struct extent_io_tree io_failure_tree; + struct rb_root io_failure_tree; + spinlock_t io_failure_lock; /* * Keep track of where the inode has extent items mapped in order to @@ -250,11 +253,6 @@ struct btrfs_inode { struct inode vfs_inode; }; -static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) -{ - return inode->root->fs_info->sectorsize; -} - static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -272,13 +270,6 @@ static inline unsigned long btrfs_inode_hash(u64 objectid, return (unsigned long)h; } -static inline void btrfs_insert_inode_hash(struct inode *inode) -{ - unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root); - - __insert_inode_hash(inode, h); -} - #if BITS_PER_LONG == 32 /* @@ -312,13 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) { - struct btrfs_root *root = inode->root; - - if (root == root->fs_info->tree_root && - btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) - return true; - - return false; + return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); } static inline bool is_data_inode(struct inode *inode) @@ -426,29 +411,142 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, - u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) -{ - struct btrfs_root *root = inode->root; - const u32 csum_size = root->fs_info->csum_size; - - /* Output minus objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) - btrfs_warn_rl(root->fs_info, -"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); - else - btrfs_warn_rl(root->fs_info, -"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); -} +void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, + struct bio *bio, + u64 dio_file_offset); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end); +noinline int can_nocow_extent(struct 
inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict); + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + const struct fscrypt_str *name); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, + const struct fscrypt_str *name, int add_backref, u64 index); +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front); + +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, + bool in_reclaim_context); +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state); + +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). */ + struct posix_acl *default_acl; + struct posix_acl *acl; + struct fscrypt_name fname; +}; + +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); + void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, + u32 bits); +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, + struct extent_state *state, u32 bits); +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, + struct extent_state *orig, u64 split); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); +void btrfs_evict_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +void btrfs_free_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int __init btrfs_init_cachep(void); +void __cold btrfs_destroy_cachep(void); +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, + struct btrfs_root *root, struct btrfs_path *path); +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, + u64 start, u64 end); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); +void 
btrfs_add_delayed_iput(struct btrfs_inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page); +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, + u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + +extern const struct dentry_operations btrfs_dentry_operations; + +/* Inode locking type flags, by default the exclusive lock is taken. */ +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; + +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, + const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 98c6e5feab19..82e49d985019 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -82,6 +82,7 @@ #include <linux/mm.h> #include <linux/string.h> #include <crypto/hash.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -92,6 +93,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "compression.h" +#include "accessors.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 @@ -755,7 +757,7 @@ static int btrfsic_process_superblock_dev_mirror( btrfs_info_in_rcu(fs_info, "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", superblock_bdev, - rcu_str_deref(device->name), dev_bytenr, + btrfs_dev_name(device), dev_bytenr, dev_state->bdev, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e84d22c5c6a8..5122ca79f7ea 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -8,6 +8,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/time.h> @@ -15,22 +16,26 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> 
+#include <linux/psi.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> #include <crypto/hash.h> #include "misc.h" #include "ctree.h" +#include "fs.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "ordered-data.h" #include "compression.h" #include "extent_io.h" #include "extent_map.h" #include "subpage.h" #include "zoned.h" +#include "file-item.h" +#include "super.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -114,7 +119,7 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - unsigned char *data_in, struct page *dest_page, + const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { switch (type) { @@ -152,9 +157,7 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) } /* Do io completion on the original bio */ - if (cb->status != BLK_STS_OK) - cb->orig_bio->bi_status = cb->status; - bio_endio(cb->orig_bio); + btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status); /* Finally free the cb struct */ kfree(cb->compressed_pages); @@ -166,16 +169,15 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) * before decompressing it into the original bio and freeing the uncompressed * pages. */ -static void end_compressed_bio_read(struct bio *bio) +static void end_compressed_bio_read(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bio->bi_private; + struct compressed_bio *cb = bbio->private; struct inode *inode = cb->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *bi = BTRFS_I(inode); bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - blk_status_t status = bio->bi_status; - struct btrfs_bio *bbio = btrfs_bio(bio); + blk_status_t status = bbio->bio.bi_status; struct bvec_iter iter; struct bio_vec bv; u32 offset; @@ -184,18 +186,17 @@ static void end_compressed_bio_read(struct bio *bio) u64 start = bbio->file_offset + offset; if (!status && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, + (!csum || !btrfs_check_data_csum(bi, bbio, offset, bv.bv_page, bv.bv_offset))) { - clean_io_failure(fs_info, &bi->io_failure_tree, - &bi->io_tree, start, bv.bv_page, - btrfs_ino(bi), bv.bv_offset); + btrfs_clean_io_failure(bi, start, bv.bv_page, + bv.bv_offset); } else { int ret; refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(inode, bbio, offset, + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, bv.bv_page, bv.bv_offset, - btrfs_submit_data_read_bio); + true); if (ret) { refcount_dec(&cb->pending_ios); status = errno_to_blk_status(ret); @@ -209,7 +210,7 @@ static void end_compressed_bio_read(struct bio *bio) if (refcount_dec_and_test(&cb->pending_ios)) finish_compressed_bio_read(cb); btrfs_bio_free_csum(bbio); - bio_put(bio); + bio_put(&bbio->bio); } /* @@ -222,8 +223,7 @@ static noinline void end_compressed_writeback(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; - struct page *pages[16]; - unsigned long nr_pages = end_index - index + 1; + struct folio_batch fbatch; const int errno = blk_status_to_errno(cb->status); int i; int ret; @@ -231,24 +231,23 @@ static noinline void end_compressed_writeback(struct inode *inode, if (errno) 
mapping_set_error(inode->i_mapping, errno); - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - nr_pages -= 1; - index += 1; - continue; - } + folio_batch_init(&fbatch); + while (index <= end_index) { + ret = filemap_get_folios(inode->i_mapping, &index, end_index, + &fbatch); + + if (ret == 0) + return; + for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (errno) - SetPageError(pages[i]); - btrfs_page_clamp_clear_writeback(fs_info, pages[i], + folio_set_error(folio); + btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); - put_page(pages[i]); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); } /* the inode may be gone now */ } @@ -301,20 +300,20 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) * This also calls the writeback end hooks for the file pages so that metadata * and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct bio *bio) +static void end_compressed_bio_write(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bio->bi_private; + struct compressed_bio *cb = bbio->private; - if (bio->bi_status) - cb->status = bio->bi_status; + if (bbio->bio.bi_status) + cb->status = bbio->bio.bi_status; if (refcount_dec_and_test(&cb->pending_ios)) { struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - btrfs_record_physical_zoned(cb->inode, cb->start, bio); + btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); queue_work(fs_info->compressed_write_workers, &cb->write_end_work); } - bio_put(bio); + bio_put(&bbio->bio); } /* @@ -335,7 +334,8 @@ static void end_compressed_bio_write(struct bio *bio) static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - blk_opf_t opf, bio_end_io_t endio_func, + blk_opf_t opf, + btrfs_bio_end_io_t endio_func, u64 *next_stripe_start) { struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); @@ -344,12 +344,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte struct bio *bio; int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS); - + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bio->bi_opf = opf; - bio->bi_private = cb; - bio->bi_end_io = endio_func; em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); if (IS_ERR(em)) { @@ -478,8 +474,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, true); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); break; } } @@ -519,7 +514,8 @@ static u64 bio_end_offset(struct bio *bio) */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, - struct compressed_bio *cb) + struct compressed_bio *cb, + int *memstall, unsigned long *pflags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; @@ -588,6 +584,11 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } + if (!*memstall && PageWorkingset(page)) { + psi_memstall_enter(pflags); + *memstall = 1; + } + ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -596,7 +597,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, } page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; - lock_extent(tree, cur, page_end); + lock_extent(tree, cur, page_end, NULL); 
read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); @@ -610,7 +611,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, (cur + fs_info->sectorsize > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); - unlock_extent(tree, cur, page_end); + unlock_extent(tree, cur, page_end, NULL); unlock_page(page); put_page(page); break; @@ -630,7 +631,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, add_size = min(em->start + em->len, page_end + 1) - cur; ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); if (ret != add_size) { - unlock_extent(tree, cur, page_end); + unlock_extent(tree, cur, page_end, NULL); unlock_page(page); put_page(page); break; @@ -674,6 +675,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; + unsigned long pflags; + int memstall = 0; blk_status_t ret; int ret2; int i; @@ -729,7 +732,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto fail; } - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -799,8 +802,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); + btrfs_bio_end_io(btrfs_bio(comp_bio), ret); break; } @@ -810,6 +812,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } } + if (memstall) + psi_memstall_leave(&pflags); + if (refcount_dec_and_test(&cb->pending_ios)) finish_compressed_bio_read(cb); return; @@ -826,8 +831,7 @@ fail: kfree(cb); out: free_extent_map(em); - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); return; } @@ -1228,7 +1232,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. 
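The memstall/pflags pair threaded through add_ra_bio_pages() above gives the PSI accounting enter-at-most-once, leave-exactly-once semantics: the first readahead page found in the workingset opens the stall window, and it is closed once after the compressed read is submitted. A sketch with the psi_* calls and the page test stubbed out:

#include <stdbool.h>

static void psi_memstall_enter(unsigned long *pflags) { (void)pflags; }
static void psi_memstall_leave(unsigned long *pflags) { (void)pflags; }

struct page_stub { bool workingset; };

static void readahead_pages(struct page_stub *pages, int nr,
			    int *memstall, unsigned long *pflags)
{
	for (int i = 0; i < nr; i++) {
		/* First refaulting page flips us into the stall window. */
		if (!*memstall && pages[i].workingset) {
			psi_memstall_enter(pflags);
			*memstall = 1;
		}
		/* ... add the page to the compressed bio ... */
	}
}

int main(void)
{
	struct page_stub pages[3] = { { false }, { true }, { true } };
	unsigned long pflags = 0;
	int memstall = 0;

	readahead_pages(pages, 3, &memstall, &pflags);
	if (memstall)
		psi_memstall_leave(&pflags);	/* matched exactly once */
	return 0;
}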
* start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { struct list_head *workspace; @@ -1242,12 +1246,13 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void __init btrfs_init_compress(void) +int __init btrfs_init_compress(void) { btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); zstd_init_workspace_manager(); + return 0; } void __cold btrfs_exit_compress(void) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1aa02903de69..6209d40a1e08 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -6,6 +6,7 @@ #ifndef BTRFS_COMPRESSION_H #define BTRFS_COMPRESSION_H +#include <linux/blk_types.h> #include <linux/sizes.h> struct btrfs_inode; @@ -77,7 +78,7 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } -void __init btrfs_init_compress(void); +int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, @@ -85,7 +86,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); @@ -149,7 +150,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); @@ -160,7 +161,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); @@ -170,7 +171,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); diff --git a/fs/btrfs/ctree.c 
b/fs/btrfs/ctree.c index ebfa35fe1c38..4754c9101a4c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -8,6 +8,7 @@ #include <linux/rbtree.h> #include <linux/mm.h> #include <linux/error-injection.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -17,6 +18,13 @@ #include "qgroup.h" #include "tree-mod-log.h" #include "tree-checker.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "relocation.h" +#include "file-item.h" + +static struct kmem_cache *btrfs_path_cachep; static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -44,6 +52,104 @@ static const struct btrfs_csums { .driver = "blake2b-256" }, }; +/* + * The leaf data grows from end-to-front in the node. This returns the address + * of the start of the last item, which is the stop of the leaf data stack. + */ +static unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + +/* + * Move data in a @leaf (using memmove, safe for overlapping ranges). + * + * @leaf: leaf that we're doing a memmove on + * @dst_offset: item data offset we're moving to + * @src_offset: item data offset we're moving from + * @len: length of the data we're moving + * + * Wrapper around memmove_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offsets start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void memmove_leaf_data(const struct extent_buffer *leaf, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long len) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, + btrfs_item_nr_offset(leaf, 0) + src_offset, len); +} + +/* + * Copy item data from @src into @dst at the given offsets. + * + * @dst: destination leaf that we're copying into + * @src: source leaf that we're copying from + * @dst_offset: item data offset we're copying to + * @src_offset: item data offset we're copying from + * @len: length of the data we're copying + * + * Wrapper around copy_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offsets start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void copy_leaf_data(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, + btrfs_item_nr_offset(src, 0) + src_offset, len); +} + +/* + * Move items in a @leaf (using memmove). + * + * @dst: destination leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around memmove_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers.
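The wrappers being introduced here all reduce to one piece of arithmetic: item headers form a plain array behind the leaf header, so item number N starts at header_size + N * sizeof(struct btrfs_item), and memmove() keeps the overlapping shifts safe. A userspace sketch of that math, with an illustrative header size rather than the real on-disk one:

#include <stdio.h>
#include <string.h>

#define LEAF_HEADER_SIZE 101	/* stand-in for the btrfs leaf header */

struct item { unsigned int offset; unsigned int size; };

static unsigned long item_nr_offset(int nr)
{
	return LEAF_HEADER_SIZE + nr * sizeof(struct item);
}

static void memmove_leaf_items(unsigned char *leaf, int dst_item,
			       int src_item, int nr_items)
{
	/* memmove() is safe for the overlapping ranges item shifts create. */
	memmove(leaf + item_nr_offset(dst_item),
		leaf + item_nr_offset(src_item),
		nr_items * sizeof(struct item));
}

int main(void)
{
	unsigned char leaf[4096] = { 0 };

	/* Shift items 3..5 down by one slot, as a leaf deletion would. */
	memmove_leaf_items(leaf, 2, 3, 3);
	printf("item 0 starts at byte %lu\n", item_nr_offset(0));
	return 0;
}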
+ */ +static inline void memmove_leaf_items(const struct extent_buffer *leaf, + int dst_item, int src_item, int nr_items) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), + btrfs_item_nr_offset(leaf, src_item), + nr_items * sizeof(struct btrfs_item)); +} + +/* + * Copy items from @src into @dst at the given @offset. + * + * @dst: destination leaf for the items + * @src: source leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around copy_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void copy_leaf_items(const struct extent_buffer *dst, + const struct extent_buffer *src, + int dst_item, int src_item, int nr_items) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), + btrfs_item_nr_offset(src, src_item), + nr_items * sizeof(struct btrfs_item)); +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); @@ -78,6 +184,8 @@ size_t __attribute_const__ btrfs_get_num_csums(void) struct btrfs_path *btrfs_alloc_path(void) { + might_sleep(); + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } @@ -114,6 +222,22 @@ noinline void btrfs_release_path(struct btrfs_path *p) } /* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +bool __cold abort_should_print_stack(int errno) +{ + switch (errno) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} + +/* * safely gets a reference on the root node of a tree. A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. 
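For a concrete view of what these new leaf helpers buy, here is the before/after shape of one call site, taken from the __push_leaf_right() hunk further down in this patch; the wrapper folds the BTRFS_LEAF_DATA_OFFSET header adjustment into one place instead of repeating it at every caller:

	/* Before: each caller added the leaf header offset by hand. */
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* After: offsets are relative to the item area; the helper does the rest. */
	memmove_leaf_data(right, data_end - push_space, data_end,
			  BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);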
See btrfs_lock_root_node for the @@ -471,7 +595,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -834,19 +958,22 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - struct btrfs_key first_key; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); BUG_ON(level == 0); - btrfs_node_key_to_cpu(parent, &first_key, slot); + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); + check.owner_root = btrfs_header_owner(parent); + check.has_first_key = true; + btrfs_node_key_to_cpu(parent, &check.first_key, slot); + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), - btrfs_header_owner(parent), - btrfs_node_ptr_generation(parent, slot), - level - 1, &first_key); + &check); if (IS_ERR(eb)) return eb; if (!extent_buffer_uptodate(eb)) { @@ -1000,7 +1127,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1046,7 +1173,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1148,7 +1275,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1202,7 +1329,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1405,10 +1532,10 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; u64 gen; struct extent_buffer *tmp; - struct btrfs_key first_key; int ret; int parent_level; bool unlock_up; @@ -1417,7 +1544,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); - btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); + check.has_first_key = true; + check.level = parent_level - 1; + check.transid = gen; + 
check.owner_root = root->root_key.objectid; /* * If we need to read an extent buffer from disk and we are holding locks @@ -1439,7 +1570,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * parents (shared tree blocks). */ if (btrfs_verify_level_key(tmp, - parent_level - 1, &first_key, gen)) { + parent_level - 1, &check.first_key, gen)) { free_extent_buffer(tmp); return -EUCLEAN; } @@ -1447,11 +1578,16 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } + if (p->nowait) { + free_extent_buffer(tmp); + return -EAGAIN; + } + if (unlock_up) btrfs_unlock_up_safe(p, level + 1); /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); + ret = btrfs_read_extent_buffer(tmp, &check); if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); @@ -1467,6 +1603,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, ret = -EAGAIN; goto out; + } else if (p->nowait) { + return -EAGAIN; } if (unlock_up) { @@ -1479,8 +1617,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, - gen, parent_level - 1, &first_key); + tmp = read_tree_block(fs_info, blocknr, &check); if (IS_ERR(tmp)) { btrfs_release_path(p); return PTR_ERR(tmp); @@ -1634,7 +1771,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + if (p->nowait) { + b = btrfs_try_read_lock_root_node(root); + if (IS_ERR(b)) + return b; + } else { + b = btrfs_read_lock_root_node(root); + } level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -1905,11 +2048,20 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + might_sleep(); + lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); BUG_ON(!cow && ins_len); + /* + * For now only allow nowait for read only operations. There's no + * strict reason why we can't, we just only need it for reads so it's + * only implemented for reads. + */ + ASSERT(!p->nowait || !cow); + if (ins_len < 0) { lowest_unlock = 2; @@ -1936,7 +2088,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (p->need_commit_sem) { ASSERT(p->search_commit_root); - down_read(&fs_info->commit_root_sem); + if (p->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) + return -EAGAIN; + } else { + down_read(&fs_info->commit_root_sem); + } } again: @@ -2082,7 +2239,15 @@ cow_done: btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; } else { - btrfs_tree_read_lock(b); + if (p->nowait) { + if (!btrfs_try_tree_read_lock(b)) { + free_extent_buffer(b); + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(b); + } p->locks[level] = BTRFS_READ_LOCK; } p->nodes[level] = b; @@ -2131,6 +2296,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); + ASSERT(!p->nowait); if (p->search_commit_root) { BUG_ON(time_seq); @@ -2307,7 +2473,7 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } -/** +/* * Search for a valid slot for the given path. 
* * @root: The root node of the tree. @@ -2366,7 +2532,7 @@ static void fixup_low_keys(struct btrfs_path *path, break; t = path->nodes[i]; ret = btrfs_tree_mod_log_insert_key(t, tslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); @@ -2535,8 +2701,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(dst_nritems), - btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(dst, dst_nritems), + btrfs_node_key_ptr_offset(src, 0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { @@ -2544,8 +2710,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, * Don't call btrfs_tree_mod_log_insert_move() here, key removal * was already fully logged by btrfs_tree_mod_log_eb_copy() above. */ - memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(push_items), + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), + btrfs_node_key_ptr_offset(src, push_items), (src_nritems - push_items) * sizeof(struct btrfs_key_ptr)); } @@ -2605,8 +2771,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); - memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), - btrfs_node_key_ptr_offset(0), + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), + btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); @@ -2617,8 +2783,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(src_nritems - push_items), + btrfs_node_key_ptr_offset(dst, 0), + btrfs_node_key_ptr_offset(src, src_nritems - push_items), push_items * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(src, src_nritems - push_items); @@ -2721,13 +2887,13 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); } memmove_extent_buffer(lower, - btrfs_node_key_ptr_offset(slot + 1), - btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(lower, slot + 1), + btrfs_node_key_ptr_offset(lower, slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -2804,8 +2970,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(split, c, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(mid), + btrfs_node_key_ptr_offset(split, 0), + btrfs_node_key_ptr_offset(c, mid), (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); @@ -2945,25 +3111,17 @@ static noinline int __push_leaf_right(struct btrfs_path *path, /* make room in the right data area */ data_end = leaf_data_end(right); - memmove_extent_buffer(right, - BTRFS_LEAF_DATA_OFFSET + data_end - push_space, - BTRFS_LEAF_DATA_OFFSET + data_end, - BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); + memmove_leaf_data(right, data_end - push_space, data_end, + BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - 
BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), - push_space); + copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), - btrfs_item_nr_offset(0), - right_nritems * sizeof(struct btrfs_item)); + memmove_leaf_items(right, push_items, 0, right_nritems); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(left_nritems - push_items), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ btrfs_init_map_token(&token, right); @@ -3155,19 +3313,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ - copy_extent_buffer(left, right, - btrfs_item_nr_offset(btrfs_header_nritems(left)), - btrfs_item_nr_offset(0), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_offset(right, push_items - 1); - copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(left) - push_space, - BTRFS_LEAF_DATA_OFFSET + - btrfs_item_offset(right, push_items - 1), - push_space); + copy_leaf_data(left, right, leaf_data_end(left) - push_space, + btrfs_item_offset(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); @@ -3190,15 +3342,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (push_items < right_nritems) { push_space = btrfs_item_offset(right, push_items - 1) - leaf_data_end(right); - memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(right), push_space); + memmove_leaf_data(right, + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(right), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(push_items), - (btrfs_header_nritems(right) - push_items) * - sizeof(struct btrfs_item)); + memmove_leaf_items(right, 0, push_items, + btrfs_header_nritems(right) - push_items); } btrfs_init_map_token(&token, right); @@ -3330,14 +3479,10 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), - nritems * sizeof(struct btrfs_item)); + copy_leaf_items(right, l, 0, mid, nritems); - copy_extent_buffer(right, l, - BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(l), data_copy_size); + copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, + leaf_data_end(l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); @@ -3707,9 +3852,7 @@ static noinline int split_item(struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3820,9 +3963,8 @@ void 
btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start + new_size - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; u64 offset; @@ -3847,9 +3989,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) } } - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); btrfs_set_disk_key_offset(&disk_key, offset + size_diff); @@ -3914,9 +4055,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - data_size, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + memmove_leaf_data(leaf, data_end - data_size, data_end, + old_data - data_end); data_end = old_data; old_size = btrfs_item_size(leaf, slot); @@ -3929,14 +4069,15 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/** - * setup_items_for_insert - Helper called before inserting one or more items - * to a leaf. Main purpose is to save stack depth by doing the bulk of the work - * in a function that doesn't call btrfs_search_slot +/* + * Make space in the node before inserting one or more items. * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert + * + * Main purpose is to save stack depth by doing the bulk of the work in a + * function that doesn't call btrfs_search_slot */ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch) @@ -3999,15 +4140,11 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - batch->total_data_size, - BTRFS_LEAF_DATA_OFFSET + data_end, - old_data - data_end); + memmove_leaf_data(leaf, data_end - batch->total_data_size, + data_end, old_data - data_end); data_end = old_data; } @@ -4161,13 +4298,13 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(ret < 0); } memmove_extent_buffer(parent, - btrfs_node_key_ptr_offset(slot), - btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(parent, slot), + btrfs_node_key_ptr_offset(parent, slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); } @@ -4242,10 +4379,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = 0; i < nr; i++) dsize += btrfs_item_size(leaf, slot + i); - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + dsize, - BTRFS_LEAF_DATA_OFFSET + data_end, - last_off - data_end); + 
memmove_leaf_data(leaf, data_end + dsize, data_end, + last_off - data_end); btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { @@ -4255,10 +4390,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_token_item_offset(&token, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), - btrfs_item_nr_offset(slot + nr), - sizeof(struct btrfs_item) * - (nritems - slot - nr)); + memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); } btrfs_set_header_nritems(leaf, nritems - nr); nritems -= nr; @@ -4432,6 +4564,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int ret = 1; int keep_locks = path->keep_locks; + ASSERT(!path->nowait); path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); @@ -4612,6 +4745,13 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int ret; int i; + /* + * The nowait semantics are used only for write paths, where we don't + * use the tree mod log and sequence numbers. + */ + if (time_seq) + ASSERT(!path->nowait); + nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; @@ -4630,7 +4770,14 @@ again: if (path->need_commit_sem) { path->need_commit_sem = 0; need_commit_sem = true; - down_read(&fs_info->commit_root_sem); + if (path->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) { + ret = -EAGAIN; + goto done; + } + } else { + down_read(&fs_info->commit_root_sem); + } } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } @@ -4706,7 +4853,7 @@ again: next = c; ret = read_block_for_search(root, path, &next, level, slot, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4716,6 +4863,10 @@ again: if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); + if (!ret && path->nowait) { + ret = -EAGAIN; + goto done; + } if (!ret && time_seq) { /* * If we don't get the lock, we may be racing @@ -4746,7 +4897,7 @@ again: ret = read_block_for_search(root, path, &next, level, 0, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4754,8 +4905,16 @@ again: goto done; } - if (!path->skip_locking) - btrfs_tree_read_lock(next); + if (!path->skip_locking) { + if (path->nowait) { + if (!btrfs_try_tree_read_lock(next)) { + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(next); + } + } } ret = 0; done: @@ -4773,6 +4932,14 @@ done: return ret; } +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) +{ + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + return btrfs_next_old_leaf(root, path, time_seq); + return 0; +} + /* * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps * searching until it gets past min_objectid or finds an item of 'type' @@ -4856,3 +5023,18 @@ int btrfs_previous_extent_item(struct btrfs_root *root, } return 1; } + +int __init btrfs_ctree_init(void) +{ + btrfs_path_cachep = kmem_cache_create("btrfs_path", + sizeof(struct btrfs_path), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_ctree_exit(void) +{ + kmem_cache_destroy(btrfs_path_cachep); +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index df8c99c99df9..6965703a81b6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,12 +28,15 @@ #include <linux/refcount.h> #include <linux/crc32c.h> #include <linux/iomap.h> +#include <linux/fscrypt.h> #include 
"extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" #include "locking.h" +#include "misc.h" +#include "fs.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -41,353 +44,15 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_bit_radix_cachep; -extern struct kmem_cache *btrfs_path_cachep; -extern struct kmem_cache *btrfs_free_space_cachep; -extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; struct btrfs_ioctl_encoded_io_args; - -#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ - -/* - * Maximum number of mirrors that can be available for all profiles counting - * the target device of dev-replace as one. During an active device replace - * procedure, the target device of the copy operation is a mirror for the - * filesystem data as well that can be used to read data in order to repair - * read errors on other disks. - * - * Current value is derived from RAID1C4 with 4 copies. - */ -#define BTRFS_MAX_MIRRORS (4 + 1) - -#define BTRFS_MAX_LEVEL 8 - -#define BTRFS_OLDEST_GENERATION 0ULL - -/* - * we can actually store much bigger names, but lets not confuse the rest - * of linux - */ -#define BTRFS_NAME_LEN 255 - -/* - * Theoretical limit is larger, but we keep this down to a sane - * value. That should limit greatly the possibility of collisions on - * inode ref items. - */ -#define BTRFS_LINK_MAX 65535U - -#define BTRFS_EMPTY_DIR_SIZE 0 - -/* ioprio of readahead is set to idle */ -#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) - -#define BTRFS_DIRTY_METADATA_THRESH SZ_32M - -/* - * Use large batch size to reduce overhead of metadata updates. On the reader - * side, we only read it when we are close to ENOSPC and the read overhead is - * mostly related to the number of CPUs, so it is OK to use arbitrary large - * value here. - */ -#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M - -#define BTRFS_MAX_EXTENT_SIZE SZ_128M - -/* - * Deltas are an effective way to populate global statistics. Give macro names - * to make it clear what we're doing. An example is discard_extents in - * btrfs_free_space_ctl. - */ -#define BTRFS_STAT_NR_ENTRIES 2 -#define BTRFS_STAT_CURR 0 -#define BTRFS_STAT_PREV 1 - -static inline unsigned long btrfs_chunk_item_size(int num_stripes) -{ - BUG_ON(num_stripes == 0); - return sizeof(struct btrfs_chunk) + - sizeof(struct btrfs_stripe) * (num_stripes - 1); -} - -/* - * Runtime (in-memory) states of filesystem - */ -enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, - /* - * Filesystem is being remounted, allow to skip some operations, like - * defrag - */ - BTRFS_FS_STATE_REMOUNTING, - /* Filesystem in RO mode */ - BTRFS_FS_STATE_RO, - /* Track if a transaction abort has been reported on this filesystem */ - BTRFS_FS_STATE_TRANS_ABORTED, - /* - * Bio operations should be blocked on this filesystem because a source - * or target device is being destroyed as part of a device replace - */ - BTRFS_FS_STATE_DEV_REPLACING, - /* The btrfs_fs_info created for self-tests */ - BTRFS_FS_STATE_DUMMY_FS_INFO, - - BTRFS_FS_STATE_NO_CSUMS, - - /* Indicates there was an error cleaning up a log tree. 
*/ - BTRFS_FS_STATE_LOG_CLEANUP_ERROR, - - BTRFS_FS_STATE_COUNT -}; - -#define BTRFS_BACKREF_REV_MAX 256 -#define BTRFS_BACKREF_REV_SHIFT 56 -#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ - BTRFS_BACKREF_REV_SHIFT) - -#define BTRFS_OLD_BACKREF_REV 0 -#define BTRFS_MIXED_BACKREF_REV 1 - -/* - * every tree block (leaf or node) starts with this header. - */ -struct btrfs_header { - /* these first four must match the super block */ - u8 csum[BTRFS_CSUM_SIZE]; - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - __le64 bytenr; /* which block this node is supposed to live in */ - __le64 flags; - - /* allowed to be different from the super from here on down */ - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - __le64 generation; - __le64 owner; - __le32 nritems; - u8 level; -} __attribute__ ((__packed__)); - -/* - * this is a very generous portion of the super block, giving us - * room to translate 14 chunks with 3 stripes each. - */ -#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 - -/* - * just in case we somehow lose the roots and are not able to mount, - * we store an array of the roots from previous transactions - * in the super. - */ -#define BTRFS_NUM_BACKUP_ROOTS 4 -struct btrfs_root_backup { - __le64 tree_root; - __le64 tree_root_gen; - - __le64 chunk_root; - __le64 chunk_root_gen; - - __le64 extent_root; - __le64 extent_root_gen; - - __le64 fs_root; - __le64 fs_root_gen; - - __le64 dev_root; - __le64 dev_root_gen; - - __le64 csum_root; - __le64 csum_root_gen; - - __le64 total_bytes; - __le64 bytes_used; - __le64 num_devices; - /* future */ - __le64 unused_64[4]; - - u8 tree_root_level; - u8 chunk_root_level; - u8 extent_root_level; - u8 fs_root_level; - u8 dev_root_level; - u8 csum_root_level; - /* future and to align */ - u8 unused_8[10]; -} __attribute__ ((__packed__)); - -#define BTRFS_SUPER_INFO_OFFSET SZ_64K -#define BTRFS_SUPER_INFO_SIZE 4096 - -/* - * The reserved space at the beginning of each device. - * It covers the primary super block and leaves space for potential use by other - * tools like bootloaders or to lower potential damage of accidental overwrite. - */ -#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) - -/* - * the super block basically lists the main trees of the FS - * it currently lacks any block count etc etc - */ -struct btrfs_super_block { - /* the first 4 fields must match struct btrfs_header */ - u8 csum[BTRFS_CSUM_SIZE]; - /* FS specific UUID, visible to user */ - u8 fsid[BTRFS_FSID_SIZE]; - __le64 bytenr; /* this block number */ - __le64 flags; - - /* allowed to be different from the btrfs_header from here own down */ - __le64 magic; - __le64 generation; - __le64 root; - __le64 chunk_root; - __le64 log_root; - - /* - * This member has never been utilized since the very beginning, thus - * it's always 0 regardless of kernel version. We always use - * generation + 1 to read log tree root. So here we mark it deprecated. 
- */ - __le64 __unused_log_root_transid; - __le64 total_bytes; - __le64 bytes_used; - __le64 root_dir_objectid; - __le64 num_devices; - __le32 sectorsize; - __le32 nodesize; - __le32 __unused_leafsize; - __le32 stripesize; - __le32 sys_chunk_array_size; - __le64 chunk_root_generation; - __le64 compat_flags; - __le64 compat_ro_flags; - __le64 incompat_flags; - __le16 csum_type; - u8 root_level; - u8 chunk_root_level; - u8 log_root_level; - struct btrfs_dev_item dev_item; - - char label[BTRFS_LABEL_SIZE]; - - __le64 cache_generation; - __le64 uuid_tree_generation; - - /* the UUID written into btree blocks */ - u8 metadata_uuid[BTRFS_FSID_SIZE]; - - /* Extent tree v2 */ - __le64 block_group_root; - __le64 block_group_root_generation; - u8 block_group_root_level; - - /* future expansion */ - u8 reserved8[7]; - __le64 reserved[25]; - u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; - struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; - - /* Padded to 4096 bytes */ - u8 padding[565]; -} __attribute__ ((__packed__)); -static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); - -/* - * Compat flags that we support. If any incompat flags are set other than the - * ones specified below then we will fail to mount - */ -#define BTRFS_FEATURE_COMPAT_SUPP 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL - -#define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ - BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ - BTRFS_FEATURE_COMPAT_RO_VERITY) - -#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL - -#ifdef CONFIG_BTRFS_DEBUG -/* - * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG - */ -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) -#else -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) -#endif - -#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ - (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) -#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL - -/* - * A leaf is full of items. 
offset and size tell us where to find - * the item in the leaf (relative to the start of the data area) - */ -struct btrfs_item { - struct btrfs_disk_key key; - __le32 offset; - __le32 size; -} __attribute__ ((__packed__)); - -/* - * leaves have an item area and a data area: - * [item0, item1....itemN] [free space] [dataN...data1, data0] - * - * The data is separate from the items to get the keys closer together - * during searches. - */ -struct btrfs_leaf { - struct btrfs_header header; - struct btrfs_item items[]; -} __attribute__ ((__packed__)); - -/* - * all non-leaf blocks are nodes, they hold only keys and pointers to - * other blocks - */ -struct btrfs_key_ptr { - struct btrfs_disk_key key; - __le64 blockptr; - __le64 generation; -} __attribute__ ((__packed__)); - -struct btrfs_node { - struct btrfs_header header; - struct btrfs_key_ptr ptrs[]; -} __attribute__ ((__packed__)); +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_balance_control; +struct btrfs_delayed_root; +struct reloc_control; /* Read ahead values for struct btrfs_path.reada */ enum { @@ -443,676 +108,11 @@ struct btrfs_path { * header (ie. sizeof(struct btrfs_item) is not included). */ unsigned int search_for_extension:1; -}; -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ - sizeof(struct btrfs_item)) -struct btrfs_dev_replace { - u64 replace_state; /* see #define above */ - time64_t time_started; /* seconds since 1-Jan-1970 */ - time64_t time_stopped; /* seconds since 1-Jan-1970 */ - atomic64_t num_write_errors; - atomic64_t num_uncorrectable_read_errors; - - u64 cursor_left; - u64 committed_cursor_left; - u64 cursor_left_last_write_of_item; - u64 cursor_right; - - u64 cont_reading_from_srcdev_mode; /* see #define above */ - - int is_valid; - int item_needs_writeback; - struct btrfs_device *srcdev; - struct btrfs_device *tgtdev; - - struct mutex lock_finishing_cancel_unmount; - struct rw_semaphore rwsem; - - struct btrfs_scrub_progress scrub_progress; - - struct percpu_counter bio_counter; - wait_queue_head_t replace_wait; -}; - -/* - * free clusters are used to claim free space in relatively large chunks, - * allowing us to do less seeky writes. They are used for all metadata - * allocations. In ssd_spread mode they are also used for data allocations. - */ -struct btrfs_free_cluster { - spinlock_t lock; - spinlock_t refill_lock; - struct rb_root root; - - /* largest extent in this cluster */ - u64 max_size; - - /* first extent starting offset */ - u64 window_start; - - /* We did a full search and couldn't create a cluster */ - bool fragmented; - - struct btrfs_block_group *block_group; - /* - * when a cluster is allocated from a block group, we put the - * cluster onto a list in the block group so that it can - * be freed before the block group is freed. - */ - struct list_head block_group_list; -}; - -enum btrfs_caching_type { - BTRFS_CACHE_NO, - BTRFS_CACHE_STARTED, - BTRFS_CACHE_FINISHED, - BTRFS_CACHE_ERROR, -}; - -/* - * Tree to record all locked full stripes of a RAID5/6 block group - */ -struct btrfs_full_stripe_locks_tree { - struct rb_root root; - struct mutex lock; -}; - -/* Discard control. */ -/* - * Async discard uses multiple lists to differentiate the discard filter - * parameters. Index 0 is for completely free block groups where we need to - * ensure the entire block group is trimmed without being lossy. Indices - * afterwards represent monotonically decreasing discard filter sizes to - * prioritize what should be discarded next. 
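Although these on-disk definitions are only being moved out of ctree.h, the offset/size bookkeeping they describe is exactly what leaf_data_end() and the new memmove/copy wrappers in ctree.c rely on. A worked example, assuming the default 16 KiB nodesize (the packed btrfs_header above is 101 bytes, leaving 16384 - 101 = 16283 bytes of item-plus-data area):

	/*
	 * Items grow up from the front of the data area, item data grows down
	 * from the end, and each btrfs_item records where its payload landed:
	 *
	 *	item 0 (size s0): offset = 16283 - s0
	 *	item 1 (size s1): offset = 16283 - s0 - s1
	 *
	 * The offset of the last item is therefore leaf_data_end(), and an
	 * empty leaf's data end is the full BTRFS_LEAF_DATA_SIZE (16283 here).
	 */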
- */ -#define BTRFS_NR_DISCARD_LISTS 3 -#define BTRFS_DISCARD_INDEX_UNUSED 0 -#define BTRFS_DISCARD_INDEX_START 1 - -struct btrfs_discard_ctl { - struct workqueue_struct *discard_workers; - struct delayed_work work; - spinlock_t lock; - struct btrfs_block_group *block_group; - struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; - u64 prev_discard; - u64 prev_discard_time; - atomic_t discardable_extents; - atomic64_t discardable_bytes; - u64 max_discard_size; - u64 delay_ms; - u32 iops_limit; - u32 kbps_limit; - u64 discard_extent_bytes; - u64 discard_bitmap_bytes; - atomic64_t discard_bytes_saved; -}; - -void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); - -/* fs_info */ -struct reloc_control; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; - -/* - * Block group or device which contains an active swapfile. Used for preventing - * unsafe operations while a swapfile is active. - * - * These are sorted on (ptr, inode) (note that a block group or device can - * contain more than one swapfile). We compare the pointer values because we - * don't actually care what the object is, we just need a quick check whether - * the object exists in the rbtree. - */ -struct btrfs_swapfile_pin { - struct rb_node node; - void *ptr; - struct inode *inode; - /* - * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr - * points to a struct btrfs_device. - */ - bool is_block_group; - /* - * Only used when 'is_block_group' is true and it is the number of - * extents used by a swapfile for this block group ('ptr' field). - */ - int bg_extent_count; -}; - -bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); - -enum { - BTRFS_FS_CLOSING_START, - BTRFS_FS_CLOSING_DONE, - BTRFS_FS_LOG_RECOVERING, - BTRFS_FS_OPEN, - BTRFS_FS_QUOTA_ENABLED, - BTRFS_FS_UPDATE_UUID_TREE_GEN, - BTRFS_FS_CREATING_FREE_SPACE_TREE, - BTRFS_FS_BTREE_ERR, - BTRFS_FS_LOG1_ERR, - BTRFS_FS_LOG2_ERR, - BTRFS_FS_QUOTA_OVERRIDE, - /* Used to record internally whether fs has been frozen */ - BTRFS_FS_FROZEN, - /* - * Indicate that balance has been set up from the ioctl and is in the - * main phase. The fs_info::balance_ctl is initialized. - */ - BTRFS_FS_BALANCE_RUNNING, - - /* - * Indicate that relocation of a chunk has started, it's set per chunk - * and is toggled between chunks. - */ - BTRFS_FS_RELOC_RUNNING, - - /* Indicate that the cleaner thread is awake and doing something. */ - BTRFS_FS_CLEANER_RUNNING, - - /* - * The checksumming has an optimized version and is considered fast, - * so we don't need to offload checksums to workqueues. - */ - BTRFS_FS_CSUM_IMPL_FAST, - - /* Indicate that the discard workqueue can service discards. */ - BTRFS_FS_DISCARD_RUNNING, - - /* Indicate that we need to cleanup space cache v1 */ - BTRFS_FS_CLEANUP_SPACE_CACHE_V1, - - /* Indicate that we can't trust the free space tree for caching yet */ - BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, - - /* Indicate whether there are any tree modification log users */ - BTRFS_FS_TREE_MOD_LOG_USERS, - - /* Indicate that we want the transaction kthread to commit right now. */ - BTRFS_FS_COMMIT_TRANS, - - /* Indicate we have half completed snapshot deletions pending. */ - BTRFS_FS_UNFINISHED_DROPS, - - /* Indicate we have to finish a zone to do next allocation. 
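The btrfs_next_old_item() helper added in ctree.c above factors out the advance-one-item step: bump slots[0], and hop to the next leaf when the current one is exhausted. A hypothetical forward walk using it, with time_seq = 0 for a plain search and assuming the starting key was found exactly (not code from this patch):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	while (1) {
		/* ... consume the item at path->slots[0] ... */
		ret = btrfs_next_old_item(root, path, 0);
		if (ret < 0)
			return ret;	/* error */
		if (ret > 0)
			break;		/* walked past the last leaf */
	}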
*/ - BTRFS_FS_NEED_ZONE_FINISH, - -#if BITS_PER_LONG == 32 - /* Indicate if we have error/warn message printed on 32bit systems */ - BTRFS_FS_32BIT_ERROR, - BTRFS_FS_32BIT_WARN, -#endif + /* Stop search if any locks need to be taken (for read) */ + unsigned int nowait:1; }; /* - * Exclusive operations (device replace, resize, device add/remove, balance) - */ -enum btrfs_exclusive_operation { - BTRFS_EXCLOP_NONE, - BTRFS_EXCLOP_BALANCE_PAUSED, - BTRFS_EXCLOP_BALANCE, - BTRFS_EXCLOP_DEV_ADD, - BTRFS_EXCLOP_DEV_REMOVE, - BTRFS_EXCLOP_DEV_REPLACE, - BTRFS_EXCLOP_RESIZE, - BTRFS_EXCLOP_SWAP_ACTIVATE, -}; - -/* Store data about transaction commits, exported via sysfs. */ -struct btrfs_commit_stats { - /* Total number of commits */ - u64 commit_count; - /* The maximum commit duration so far in ns */ - u64 max_commit_dur; - /* The last commit duration in ns */ - u64 last_commit_dur; - /* The total commit duration in ns */ - u64 total_commit_dur; -}; - -struct btrfs_fs_info { - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - unsigned long flags; - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *fs_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *data_reloc_root; - struct btrfs_root *block_group_root; - - /* the log root tree is a directory of all the other log roots */ - struct btrfs_root *log_root_tree; - - /* The tree that holds the global roots (csum, extent, etc) */ - rwlock_t global_root_lock; - struct rb_root global_root_tree; - - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; - - /* block group cache stuff */ - rwlock_t block_group_cache_lock; - struct rb_root_cached block_group_cache_tree; - - /* keep track of unallocated space */ - atomic64_t free_chunk_space; - - /* Track ranges which are used by log trees blocks/logged data extents */ - struct extent_io_tree excluded_extents; - - /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; - - /* - * block reservation for extent, checksum, root tree and - * delayed dir index item - */ - struct btrfs_block_rsv global_block_rsv; - /* block reservation for metadata operations */ - struct btrfs_block_rsv trans_block_rsv; - /* block reservation for chunk tree */ - struct btrfs_block_rsv chunk_block_rsv; - /* block reservation for delayed operations */ - struct btrfs_block_rsv delayed_block_rsv; - /* block reservation for delayed refs */ - struct btrfs_block_rsv delayed_refs_rsv; - - struct btrfs_block_rsv empty_block_rsv; - - u64 generation; - u64 last_trans_committed; - /* - * Generation of the last transaction used for block group relocation - * since the filesystem was last mounted (or 0 if none happened yet). - * Must be written and read while holding btrfs_fs_info::commit_root_sem. - */ - u64 last_reloc_trans; - u64 avg_delayed_ref_runtime; - - /* - * this is updated to the current trans every time a full commit - * is required instead of the faster short fsync log commits - */ - u64 last_trans_log_full_commit; - unsigned long mount_opt; - /* - * Track requests for actions that need to be done during transaction - * commit (like for some mount options). - */ - unsigned long pending_changes; - unsigned long compress_type:4; - unsigned int compress_level; - u32 commit_interval; - /* - * It is a suggestive number, the read side is safe even it gets a - * wrong number because we will write out the data into a regular - * extent. 
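The nowait bit added to btrfs_path above is what feeds the -EAGAIN plumbing in the ctree.c hunks earlier: with it set, btrfs_search_slot() and btrfs_next_old_leaf() bail out instead of blocking on tree locks, the commit_root_sem, or a read from disk. A minimal sketch of the intended calling pattern (hypothetical caller; a read-only lookup with no transaction, since nowait asserts !cow):

	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->nowait = 1;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == -EAGAIN) {
		/* Would have blocked; release and retry in blocking mode. */
		btrfs_release_path(path);
		path->nowait = 0;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}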
The write side(mount/remount) is under ->s_umount lock, - * so it is also safe. - */ - u64 max_inline; - - struct btrfs_transaction *running_transaction; - wait_queue_head_t transaction_throttle; - wait_queue_head_t transaction_wait; - wait_queue_head_t transaction_blocked_wait; - wait_queue_head_t async_submit_wait; - - /* - * Used to protect the incompat_flags, compat_flags, compat_ro_flags - * when they are updated. - * - * Because we do not clear the flags for ever, so we needn't use - * the lock on the read side. - * - * We also needn't use the lock when we mount the fs, because - * there is no other task which will update the flag. - */ - spinlock_t super_lock; - struct btrfs_super_block *super_copy; - struct btrfs_super_block *super_for_commit; - struct super_block *sb; - struct inode *btree_inode; - struct mutex tree_log_mutex; - struct mutex transaction_kthread_mutex; - struct mutex cleaner_mutex; - struct mutex chunk_mutex; - - /* - * this is taken to make sure we don't set block groups ro after - * the free space cache has been allocated on them - */ - struct mutex ro_block_group_mutex; - - /* this is used during read/modify/write to make sure - * no two ios are trying to mod the same stripe at the same - * time - */ - struct btrfs_stripe_hash_table *stripe_hash_table; - - /* - * this protects the ordered operations list only while we are - * processing all of the entries on it. This way we make - * sure the commit code doesn't find the list temporarily empty - * because another function happens to be doing non-waiting preflush - * before jumping into the main commit. - */ - struct mutex ordered_operations_mutex; - - struct rw_semaphore commit_root_sem; - - struct rw_semaphore cleanup_work_sem; - - struct rw_semaphore subvol_sem; - - spinlock_t trans_lock; - /* - * the reloc mutex goes with the trans lock, it is taken - * during commit to protect us from the relocation code - */ - struct mutex reloc_mutex; - - struct list_head trans_list; - struct list_head dead_roots; - struct list_head caching_block_groups; - - spinlock_t delayed_iput_lock; - struct list_head delayed_iputs; - atomic_t nr_delayed_iputs; - wait_queue_head_t delayed_iputs_wait; - - atomic64_t tree_mod_seq; - - /* this protects tree_mod_log and tree_mod_seq_list */ - rwlock_t tree_mod_log_lock; - struct rb_root tree_mod_log; - struct list_head tree_mod_seq_list; - - atomic_t async_delalloc_pages; - - /* - * this is used to protect the following list -- ordered_roots. - */ - spinlock_t ordered_root_lock; - - /* - * all fs/file tree roots in which there are data=ordered extents - * pending writeback are added into this list. - * - * these can span multiple transactions and basically include - * every dirty data page that isn't from nodatacow - */ - struct list_head ordered_roots; - - struct mutex delalloc_root_mutex; - spinlock_t delalloc_root_lock; - /* all fs/file tree roots that have delalloc inodes. */ - struct list_head delalloc_roots; - - /* - * there is a pool of worker threads for checksumming during writes - * and a pool for checksumming after reads. This is because readers - * can run with FS locks held, and the writers may be waiting for - * those locks. We don't want ordering in the pending list to cause - * deadlocks, and so the two are serviced separately. 
- * - * A third pool does submit_bio to avoid deadlocking with the other - * two - */ - struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; - struct btrfs_workqueue *delalloc_workers; - struct btrfs_workqueue *flush_workers; - struct workqueue_struct *endio_workers; - struct workqueue_struct *endio_meta_workers; - struct workqueue_struct *endio_raid56_workers; - struct workqueue_struct *rmw_workers; - struct workqueue_struct *compressed_write_workers; - struct btrfs_workqueue *endio_write_workers; - struct btrfs_workqueue *endio_freespace_worker; - struct btrfs_workqueue *caching_workers; - - /* - * fixup workers take dirty pages that didn't properly go through - * the cow mechanism and make them safe to write. It happens - * for the sys_munmap function call path - */ - struct btrfs_workqueue *fixup_workers; - struct btrfs_workqueue *delayed_workers; - - struct task_struct *transaction_kthread; - struct task_struct *cleaner_kthread; - u32 thread_pool_size; - - struct kobject *space_info_kobj; - struct kobject *qgroups_kobj; - - /* used to keep from writing metadata until there is a nice batch */ - struct percpu_counter dirty_metadata_bytes; - struct percpu_counter delalloc_bytes; - struct percpu_counter ordered_bytes; - s32 dirty_metadata_batch; - s32 delalloc_batch; - - struct list_head dirty_cowonly_roots; - - struct btrfs_fs_devices *fs_devices; - - /* - * The space_info list is effectively read only after initial - * setup. It is populated at mount time and cleaned up after - * all block groups are removed. RCU is used to protect it. - */ - struct list_head space_info; - - struct btrfs_space_info *data_sinfo; - - struct reloc_control *reloc_ctl; - - /* data_alloc_cluster is only used in ssd_spread mode */ - struct btrfs_free_cluster data_alloc_cluster; - - /* all metadata allocations go through this cluster */ - struct btrfs_free_cluster meta_alloc_cluster; - - /* auto defrag inodes go here */ - spinlock_t defrag_inodes_lock; - struct rb_root defrag_inodes; - atomic_t defrag_running; - - /* Used to protect avail_{data, metadata, system}_alloc_bits */ - seqlock_t profiles_lock; - /* - * these three are in extended format (availability of single - * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other - * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) - */ - u64 avail_data_alloc_bits; - u64 avail_metadata_alloc_bits; - u64 avail_system_alloc_bits; - - /* restriper state */ - spinlock_t balance_lock; - struct mutex balance_mutex; - atomic_t balance_pause_req; - atomic_t balance_cancel_req; - struct btrfs_balance_control *balance_ctl; - wait_queue_head_t balance_wait_q; - - /* Cancellation requests for chunk relocation */ - atomic_t reloc_cancel_req; - - u32 data_chunk_allocations; - u32 metadata_ratio; - - void *bdev_holder; - - /* private scrub information */ - struct mutex scrub_lock; - atomic_t scrubs_running; - atomic_t scrub_pause_req; - atomic_t scrubs_paused; - atomic_t scrub_cancel_req; - wait_queue_head_t scrub_pause_wait; - /* - * The worker pointers are NULL iff the refcount is 0, ie. scrub is not - * running. - */ - refcount_t scrub_workers_refcnt; - struct workqueue_struct *scrub_workers; - struct workqueue_struct *scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity_workers; - struct btrfs_subpage_info *subpage_info; - - struct btrfs_discard_ctl discard_ctl; - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif - /* is qgroup tracking in a consistent state? 
*/ - u64 qgroup_flags; - - /* holds configuration and tracking. Protected by qgroup_lock */ - struct rb_root qgroup_tree; - spinlock_t qgroup_lock; - - /* - * used to avoid frequently calling ulist_alloc()/ulist_free() - * when doing qgroup accounting, it must be protected by qgroup_lock. - */ - struct ulist *qgroup_ulist; - - /* - * Protect user change for quota operations. If a transaction is needed, - * it must be started before locking this lock. - */ - struct mutex qgroup_ioctl_lock; - - /* list of dirty qgroups to be written at next commit */ - struct list_head dirty_qgroups; - - /* used by qgroup for an efficient tree traversal */ - u64 qgroup_seq; - - /* qgroup rescan items */ - struct mutex qgroup_rescan_lock; /* protects the progress item */ - struct btrfs_key qgroup_rescan_progress; - struct btrfs_workqueue *qgroup_rescan_workers; - struct completion qgroup_rescan_completion; - struct btrfs_work qgroup_rescan_work; - bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ - - /* filesystem state */ - unsigned long fs_state; - - struct btrfs_delayed_root *delayed_root; - - /* Extent buffer radix tree */ - spinlock_t buffer_lock; - /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; - - /* next backup root to be overwritten */ - int backup_root_index; - - /* device replace state */ - struct btrfs_dev_replace dev_replace; - - struct semaphore uuid_tree_rescan_sem; - - /* Used to reclaim the metadata space in the background. */ - struct work_struct async_reclaim_work; - struct work_struct async_data_reclaim_work; - struct work_struct preempt_reclaim_work; - - /* Reclaim partially filled block groups in the background */ - struct work_struct reclaim_bgs_work; - struct list_head reclaim_bgs; - int bg_reclaim_threshold; - - spinlock_t unused_bgs_lock; - struct list_head unused_bgs; - struct mutex unused_bg_unpin_mutex; - /* Protect block groups that are going to be deleted */ - struct mutex reclaim_bgs_lock; - - /* Cached block sizes */ - u32 nodesize; - u32 sectorsize; - /* ilog2 of sectorsize, use to avoid 64bit division */ - u32 sectorsize_bits; - u32 csum_size; - u32 csums_per_leaf; - u32 stripesize; - - /* - * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular - * filesystem, on zoned it depends on the device constraints. - */ - u64 max_extent_size; - - /* Block groups and devices containing active swapfiles. */ - spinlock_t swapfile_pins_lock; - struct rb_root swapfile_pins; - - struct crypto_shash *csum_shash; - - /* Type of exclusive operation running, protected by super_lock */ - enum btrfs_exclusive_operation exclusive_operation; - - /* - * Zone size > 0 when in ZONED mode, otherwise it's used for a check - * if the mode is enabled - */ - u64 zone_size; - - /* Max size to emit ZONE_APPEND write command */ - u64 max_zone_append_size; - struct mutex zoned_meta_io_lock; - spinlock_t treelog_bg_lock; - u64 treelog_bg; - - /* - * Start of the dedicated data relocation block group, protected by - * relocation_bg_lock. 
- */ - spinlock_t relocation_bg_lock; - u64 data_reloc_bg; - struct mutex zoned_data_reloc_io_lock; - - u64 nr_global_roots; - - spinlock_t zone_active_bgs_lock; - struct list_head zone_active_bgs; - - /* Updates are not protected by any lock */ - struct btrfs_commit_stats commit_stats; - -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - spinlock_t ref_verify_lock; - struct rb_root block_tree; -#endif - -#ifdef CONFIG_BTRFS_DEBUG - struct kobject *debug_kobj; - struct kobject *discard_debug_kobj; - struct list_head allocated_roots; - - spinlock_t eb_leak_lock; - struct list_head allocated_ebs; -#endif -}; - -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* * The state of btrfs root */ enum { @@ -1174,11 +174,6 @@ enum { BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; -static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) -{ - clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); -} - /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. @@ -1340,6 +335,23 @@ struct btrfs_root { #endif }; +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. @@ -1431,17 +443,14 @@ struct btrfs_drop_extents_args { struct btrfs_file_private { void *filldir_buf; + struct extent_state *llseek_cached_state; }; - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return info->nodesize - sizeof(struct btrfs_header); } -#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) - static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1452,1272 +461,14 @@ static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) -{ - return BTRFS_MAX_ITEM_SIZE(info) - - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* - * Flags for mount options. 
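The "byte-swap the constant at compile time" comments on the new btrfs_root_readonly()/btrfs_root_dead() helpers deserve a word: root_item::flags is kept little-endian in memory, so converting the constant once (a no-op on LE, folded at compile time on BE) beats converting the field on every call. The equivalent but slower form the helpers avoid would look like this (the _slow name is made up for illustration):

	static inline bool btrfs_root_readonly_slow(const struct btrfs_root *root)
	{
		/* Converts the LE field on every call instead of the constant once. */
		return (le64_to_cpu(root->root_item.flags) &
			BTRFS_ROOT_SUBVOL_RDONLY) != 0;
	}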
- * - * Note: don't forget to add new options to btrfs_show_options() - */ -enum { - BTRFS_MOUNT_NODATASUM = (1UL << 0), - BTRFS_MOUNT_NODATACOW = (1UL << 1), - BTRFS_MOUNT_NOBARRIER = (1UL << 2), - BTRFS_MOUNT_SSD = (1UL << 3), - BTRFS_MOUNT_DEGRADED = (1UL << 4), - BTRFS_MOUNT_COMPRESS = (1UL << 5), - BTRFS_MOUNT_NOTREELOG = (1UL << 6), - BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), - BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), - BTRFS_MOUNT_NOSSD = (1UL << 9), - BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), - BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), - BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), - BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), - BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), - BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), - BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), - BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), - BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), -}; - -#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) -#define BTRFS_DEFAULT_MAX_INLINE (2048) - -#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) -#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) -#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ - BTRFS_MOUNT_##opt) - -#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - -/* - * Requests for changes that need to be done during transaction commit. - * - * Internal mount options that are used for special handling of the real - * mount options (eg. cannot be set during remount and have to be set during - * transaction commit) - */ - -#define BTRFS_PENDING_COMMIT (0) - -#define btrfs_test_pending(info, opt) \ - test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_set_pending(info, opt) \ - set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_clear_pending(info, opt) \ - clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) - -/* - * Helpers for setting pending mount option changes. - * - * Expects corresponding macros - * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name - */ -#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), SET_##opt); \ - btrfs_clear_pending((info), CLEAR_##opt); \ - } \ -} while(0) - -#define btrfs_clear_pending_and_info(info, opt, fmt, args...) 
\ -do { \ - if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), CLEAR_##opt); \ - btrfs_clear_pending((info), SET_##opt); \ - } \ -} while(0) - -/* - * Inode flags - */ -#define BTRFS_INODE_NODATASUM (1U << 0) -#define BTRFS_INODE_NODATACOW (1U << 1) -#define BTRFS_INODE_READONLY (1U << 2) -#define BTRFS_INODE_NOCOMPRESS (1U << 3) -#define BTRFS_INODE_PREALLOC (1U << 4) -#define BTRFS_INODE_SYNC (1U << 5) -#define BTRFS_INODE_IMMUTABLE (1U << 6) -#define BTRFS_INODE_APPEND (1U << 7) -#define BTRFS_INODE_NODUMP (1U << 8) -#define BTRFS_INODE_NOATIME (1U << 9) -#define BTRFS_INODE_DIRSYNC (1U << 10) -#define BTRFS_INODE_COMPRESS (1U << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) - -#define BTRFS_INODE_FLAG_MASK \ - (BTRFS_INODE_NODATASUM | \ - BTRFS_INODE_NODATACOW | \ - BTRFS_INODE_READONLY | \ - BTRFS_INODE_NOCOMPRESS | \ - BTRFS_INODE_PREALLOC | \ - BTRFS_INODE_SYNC | \ - BTRFS_INODE_IMMUTABLE | \ - BTRFS_INODE_APPEND | \ - BTRFS_INODE_NODUMP | \ - BTRFS_INODE_NOATIME | \ - BTRFS_INODE_DIRSYNC | \ - BTRFS_INODE_COMPRESS | \ - BTRFS_INODE_ROOT_ITEM_INIT) - -#define BTRFS_INODE_RO_VERITY (1U << 0) - -#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) - -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline void btrfs_init_map_token(struct btrfs_map_token *token, - struct extent_buffer *eb) -{ - token->eb = eb; - token->kaddr = page_address(eb->pages[0]); - token->offset = 0; -} - -/* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so let's make a simple * one for u8: */ -#define le8_to_cpu(v) (v) -#define cpu_to_le8(v) (v) -#define __le8 u8 - -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - -#define read_eb_member(eb, ptr, type, member, result) (\ - read_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define write_eb_member(eb, ptr, type, member, result) (\ - write_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off); \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val); \ -u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off); \ -void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ - unsigned long off, u##bits val); - -DECLARE_BTRFS_SETGET_BITS(8) -DECLARE_BTRFS_SETGET_BITS(16) -DECLARE_BTRFS_SETGET_BITS(32) -DECLARE_BTRFS_SETGET_BITS(64) - -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0)->member)); \ - return btrfs_get_##bits(eb, s, offsetof(type, member)); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ - u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0)->member)); \ - btrfs_set_##bits(eb, s, offsetof(type, member), val); \ -} \ -static inline
u##bits btrfs_token_##name(struct btrfs_map_token *token, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0)->member)); \ - return btrfs_get_token_##bits(token, s, offsetof(type, member));\ -} \ -static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ - type *s, u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0)->member)); \ - btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ -} - -#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ -{ \ - const type *p = page_address(eb->pages[0]) + \ - offset_in_page(eb->start); \ - return get_unaligned_le##bits(&p->member); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, \ - u##bits val) \ -{ \ - type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ - put_unaligned_le##bits(val, &p->member); \ -} - -#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const type *s) \ -{ \ - return get_unaligned_le##bits(&s->member); \ -} \ -static inline void btrfs_set_##name(type *s, u##bits val) \ -{ \ - put_unaligned_le##bits(val, &s->member); \ -} - -static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0)->total_bytes)); - return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, - total_bytes)); -} -static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s, - u64 val) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0)->total_bytes)); - WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); - btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); -} - - -BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); -BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); -BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); -BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, - start_offset, 64); -BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); -BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); -BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); -BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); -BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, - dev_group, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, - seek_speed, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct
btrfs_dev_item, - bandwidth, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, - generation, 64); - -static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); -} - -static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); -} - -BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); -BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); -BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); -BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); -BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); -BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); -BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); - -static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) -{ - return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); -} - -BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, - stripe_len, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, - num_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, - sub_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); - -static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, - int nr) -{ - unsigned long offset = (unsigned long)c; - offset += offsetof(struct btrfs_chunk, stripe); - offset += nr * sizeof(struct btrfs_stripe); - return (struct btrfs_stripe *)offset; -} - -static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); -} - -/* struct btrfs_block_group_item */ -BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); - -BTRFS_SETGET_FUNCS(block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(block_group_flags, - struct btrfs_block_group_item, flags, 64); 
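To make the generated API concrete, here is a hand-expanded sketch of what a single BTRFS_SETGET_FUNCS invocation from the runs above produces, following the macro definition earlier in this hunk; the expansion is illustrative only and the two token-based variants are elided:

static inline u64 btrfs_device_type(const struct extent_buffer *eb,
				    const struct btrfs_dev_item *s)
{
	/* Compile-time check that the accessor width matches the field. */
	static_assert(sizeof(u64) == sizeof(((struct btrfs_dev_item *)0)->type));
	/* Read the little-endian on-disk field through the extent buffer. */
	return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, type));
}

static inline void btrfs_set_device_type(const struct extent_buffer *eb,
					 struct btrfs_dev_item *s, u64 val)
{
	static_assert(sizeof(u64) == sizeof(((struct btrfs_dev_item *)0)->type));
	btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, type), val);
}

btrfs_token_device_type() and btrfs_set_token_device_type() are generated the same way, but route through btrfs_get_token_64()/btrfs_set_token_64() so repeated accesses can reuse the page mapping cached in struct btrfs_map_token.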
-BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, - struct btrfs_block_group_item, flags, 64); - -/* struct btrfs_free_space_info */ -BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, - extent_count, 32); -BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); - -/* struct btrfs_inode_ref */ -BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); -BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); - -/* struct btrfs_inode_extref */ -BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, - parent_objectid, 64); -BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, - name_len, 16); -BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); - -/* struct btrfs_inode_item */ -BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); -BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); -BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); -BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); -BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); -BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, - sequence, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, - transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, - nbytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, - block_group, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); -BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); - -/* struct btrfs_dev_extent */ -BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, - chunk_tree, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, - chunk_objectid, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, - chunk_offset, 64); -BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); -BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, - generation, 64); 
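The plain accessors above dereference fields in place through the extent buffer, while the _STACK_ variants operate on an ordinary in-memory copy of the structure. A minimal usage sketch, assuming leaf and slot reference a valid inode item; demo_read_inode_size() is a hypothetical helper, not part of the header, and btrfs_item_ptr() is defined further down in this hunk:

static u64 demo_read_inode_size(struct extent_buffer *leaf, int slot)
{
	struct btrfs_inode_item *item;
	struct btrfs_inode_item copy;

	/* In place: the generated accessor reads the LE field via the eb. */
	item = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
	if (btrfs_inode_size(leaf, item) == 0)
		return 0;

	/* On the stack: copy the item out, then use the _stack_ variants. */
	read_extent_buffer(leaf, &copy, btrfs_item_ptr_offset(leaf, slot),
			   sizeof(copy));
	return btrfs_stack_inode_size(&copy);
}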
-BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); - -BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); - -static inline void btrfs_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, - root, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, - objectid, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, - offset, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, - type, 8); -BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, - offset, 64); - -static inline u32 btrfs_extent_inline_ref_size(int type) -{ - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) - return sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_SHARED_DATA_REF_KEY) - return sizeof(struct btrfs_shared_data_ref) + - sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_EXTENT_DATA_REF_KEY) - return sizeof(struct btrfs_extent_data_ref) + - offsetof(struct btrfs_extent_inline_ref, offset); - return 0; -} - -/* struct btrfs_node */ -BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); -BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, - blockptr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, - generation, 64); - -static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline unsigned long btrfs_node_key_ptr_offset(int nr) -{ - return offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; -} - -void btrfs_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr); - -static inline void btrfs_set_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); - write_eb_member(eb, 
(struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} - -/* struct btrfs_item */ -BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); - -static inline unsigned long btrfs_item_nr_offset(int nr) -{ - return offsetof(struct btrfs_leaf, items) + - sizeof(struct btrfs_item) * nr; -} - -static inline struct btrfs_item *btrfs_item_nr(int nr) -{ - return (struct btrfs_item *)btrfs_item_nr_offset(nr); -} - -#define BTRFS_ITEM_SETGET_FUNCS(member) \ -static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ - int slot) \ -{ \ - return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ -} \ -static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ - int slot, u32 val) \ -{ \ - btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ -} \ -static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ - int slot) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - return btrfs_token_raw_item_##member(token, item); \ -} \ -static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ - int slot, u32 val) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - btrfs_set_token_raw_item_##member(token, item, val); \ -} - -BTRFS_ITEM_SETGET_FUNCS(offset) -BTRFS_ITEM_SETGET_FUNCS(size); - -static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); -} - -static inline void btrfs_item_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - read_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -static inline void btrfs_set_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - write_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); - -/* - * struct btrfs_root_ref - */ -BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); -BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); -BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); - -/* struct btrfs_dir_item */ -BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); -BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, - data_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, - name_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, - transid, 64); - -static inline void btrfs_dir_item_key(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, - struct btrfs_dir_item *item, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - 
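The struct btrfs_item helpers above encode the leaf layout: item headers grow forward from the leaf header while item payloads grow backward from the end of the block, and btrfs_item_data_end() is simply offset plus size. A sketch of the addressing math implied by those definitions (the demo_ helpers are illustrative, not part of the header):

/* Byte offset, within the leaf, of the item header for @slot. */
static unsigned long demo_item_header_offset(int slot)
{
	/* Same math as btrfs_item_nr_offset(slot). */
	return offsetof(struct btrfs_leaf, items) +
	       sizeof(struct btrfs_item) * slot;
}

/* Payload byte range of @slot, relative to the leaf data area. */
static void demo_item_payload_range(struct extent_buffer *leaf, int slot,
				    u32 *start, u32 *end)
{
	*start = btrfs_item_offset(leaf, slot);
	*end = btrfs_item_data_end(leaf, slot);	/* offset + size */
}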
-BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, - num_entries, 64); -BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, - num_bitmaps, 64); -BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, - generation, 64); - -static inline void btrfs_free_space_key(const struct extent_buffer *eb, - const struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -static inline void btrfs_set_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -/* struct btrfs_disk_key */ -BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); - -#ifdef __LITTLE_ENDIAN - -/* - * Optimized helpers for little-endian architectures where CPU and on-disk - * structures have the same endianness and we can skip conversions. - */ - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, - const struct btrfs_disk_key *disk_key) -{ - memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, - const struct btrfs_key *cpu_key) -{ - memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_node_key(eb, disk_key, nr); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_item_key(eb, disk_key, nr); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *cpu_key) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_dir_item_key(eb, item, disk_key); -} - -#else - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, - const struct btrfs_disk_key *disk) -{ - cpu->offset = le64_to_cpu(disk->offset); - cpu->type = disk->type; - cpu->objectid = le64_to_cpu(disk->objectid); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, - const struct btrfs_key *cpu) -{ - disk->offset = cpu_to_le64(cpu->offset); - disk->type = cpu->type; - disk->objectid = cpu_to_le64(cpu->objectid); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_node_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_item_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *key) -{ - struct btrfs_disk_key disk_key; - btrfs_dir_item_key(eb, item, &disk_key); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -#endif - -/* struct btrfs_header */ -BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); 
-BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); -BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); -BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); -BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, - nritems, 32); -BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); - -static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) -{ - return (btrfs_header_flags(eb) & flag) == flag; -} - -static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags | flag); -} - -static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags & ~flag); -} - -static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) -{ - u64 flags = btrfs_header_flags(eb); - return flags >> BTRFS_BACKREF_REV_SHIFT; -} - -static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, - int rev) -{ - u64 flags = btrfs_header_flags(eb); - flags &= ~BTRFS_BACKREF_REV_MASK; - flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; - btrfs_set_header_flags(eb, flags); -} - -static inline int btrfs_is_leaf(const struct extent_buffer *eb) -{ - return btrfs_header_level(eb) == 0; -} - -/* struct btrfs_root_item */ -BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); - -BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); -BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); -BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); -BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); -BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, - last_snapshot, 64); -BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, - generation_v2, 64); -BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, - ctransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, - otransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, - stransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, - rtransid, 64); - -static inline bool btrfs_root_readonly(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -static inline bool btrfs_root_dead(const struct 
btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; -} - -static inline u64 btrfs_root_id(const struct btrfs_root *root) -{ - return root->root_key.objectid; -} - -/* struct btrfs_root_backup */ -BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, - tree_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, - tree_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, - tree_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, - chunk_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, - chunk_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, - extent_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, - extent_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, - extent_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, - fs_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, - fs_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, - fs_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, - dev_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, - dev_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, - dev_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, - csum_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, - csum_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, - csum_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, - num_devices, 64); - -/* - * For extent tree v2 we overload the extent root with the block group root, as - * we will have multiple extent roots. 
- */ -BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup, - extent_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup, - extent_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level, - struct btrfs_root_backup, extent_root_level, 8); - -/* struct btrfs_balance_item */ -BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); - -static inline void btrfs_balance_data(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_balance_meta(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_balance_sys(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void -btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - const struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); - cpu->limit = le64_to_cpu(disk->limit); - cpu->stripes_min = le32_to_cpu(disk->stripes_min); - cpu->stripes_max = le32_to_cpu(disk->stripes_max); -} - -static inline void -btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - const struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); - disk->limit = cpu_to_le64(cpu->limit); - disk->stripes_min = cpu_to_le32(cpu->stripes_min); - disk->stripes_max = cpu_to_le32(cpu->stripes_max); -} - -/* struct btrfs_super_block */ -BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); -BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, - struct btrfs_super_block, 
sys_chunk_array_size, 32); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, - struct btrfs_super_block, chunk_root_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, - root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, - chunk_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, - log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, - log_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, - sectorsize, 32); -BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, - nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, - stripesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, - root_dir_objectid, 64); -BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, - num_devices, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, - compat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, - compat_ro_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, - incompat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, - csum_type, 16); -BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, - cache_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); -BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, - uuid_tree_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block, - block_group_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation, - struct btrfs_super_block, - block_group_root_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block, - block_group_root_level, 8); - -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - - -/* - * The leaf data grows from end-to-front in the node. 
- * this returns the address of the start of the last item, - * which is the stop of the leaf data stack - */ -static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset(leaf, nr - 1); -} - -/* struct btrfs_file_extent_item */ -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, - type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, - struct btrfs_file_extent_item, disk_bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, - struct btrfs_file_extent_item, offset, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, - struct btrfs_file_extent_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, - struct btrfs_file_extent_item, num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, - struct btrfs_file_extent_item, ram_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, - struct btrfs_file_extent_item, disk_num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, - struct btrfs_file_extent_item, compression, 8); - -static inline unsigned long -btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) -{ - return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; -} - -BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); -BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, - disk_bytenr, 64); -BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, - disk_num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, - offset, 64); -BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, - num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, - ram_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, - compression, 8); -BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, - encryption, 8); -BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, - other_encoding, 16); - -/* - * this returns the number of bytes used by the item on disk, minus the - * size of any extent headers. 
If a file is compressed on disk, this is - * the compressed size - */ -static inline u32 btrfs_file_extent_inline_item_len( - const struct extent_buffer *eb, - int nr) -{ - return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -/* btrfs_qgroup_status_item */ -BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, - version, 64); -BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, - rescan, 64); - -/* btrfs_qgroup_info_item */ -BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, - rfer_cmpr, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, - excl_cmpr, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, - struct btrfs_qgroup_info_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, - rfer, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, - struct btrfs_qgroup_info_item, rfer_cmpr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, - excl, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, - struct btrfs_qgroup_info_item, excl_cmpr, 64); - -/* btrfs_qgroup_limit_item */ -BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, - max_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, - max_excl, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, - rsv_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, - rsv_excl, 64); - -/* btrfs_dev_replace_item */ -BTRFS_SETGET_FUNCS(dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, - 64); -BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, - replace_state, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, - time_started, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, - time_stopped, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, - num_write_errors, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, num_uncorrectable_read_errors, - 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, - cursor_left, 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, - cursor_right, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, - cont_reading_from_srcdev_mode, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, - struct btrfs_dev_replace_item, replace_state, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, - struct btrfs_dev_replace_item, 
time_started, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, - struct btrfs_dev_replace_item, time_stopped, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, - struct btrfs_dev_replace_item, num_write_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, - num_uncorrectable_read_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, - struct btrfs_dev_replace_item, cursor_left, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, - struct btrfs_dev_replace_item, cursor_right, 64); - -/* helper function to cast into the data area of the leaf. */ -#define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - -#define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { return crc32c(crc, address, length); @@ -2747,202 +498,15 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -/* extent-tree.c */ - -enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID, - BTRFS_REF_TYPE_BLOCK, - BTRFS_REF_TYPE_DATA, - BTRFS_REF_TYPE_ANY, -}; - -int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, - enum btrfs_inline_ref_type is_data); -u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); - -static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, - u64 offset) -{ - u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; - - return csums + offset_in_sectors * fs_info->csum_size; -} - -/* - * Take the number of bytes to be checksummed and figure out how many leaves - * it would require to store the csums for that many bytes. - */ -static inline u64 btrfs_csum_bytes_to_leaves( - const struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; - - return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); -} - -/* - * Use this if we would be adding new items, as we could split nodes as we cow - * down the tree. - */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; -} - -/* - * Doing a truncate or a modification won't result in new nodes or leaves, just - * what we need for COW. 
- */ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; -} - -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group *cache); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); -int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, - int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); -int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, - struct btrfs_path *path); -struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - const struct btrfs_disk_key *key, - int level, u64 hint, - u64 empty_size, - enum btrfs_lock_nesting nest); -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref); -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 owner, - u64 offset, u64 ram_bytes, - struct btrfs_key *ins); -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, - u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc); -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); - -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len); -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_ref *generic_ref); - -void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - -/* - * Different levels to flush space when doing space reservations. - * - * The higher the level, the more methods we try to reclaim space.
- */ -enum btrfs_reserve_flush_enum { - /* If we are in the transaction, we can't flush anything.*/ - BTRFS_RESERVE_NO_FLUSH, - - /* - * Flush space by: - * - Running delayed inode items - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_LIMIT, - - /* - * Flush space by: - * - Running delayed inode items - * - Running delayed refs - * - Running delalloc and waiting for ordered extents - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_EVICT, - - /* - * Flush space by above mentioned methods and by: - * - Running delayed iputs - * - Committing transaction - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_DATA, - BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, - BTRFS_RESERVE_FLUSH_ALL, - - /* - * Pretty much the same as FLUSH_ALL, but can also steal space from - * global rsv. - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_ALL_STEAL, -}; - -enum btrfs_flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELAYED_REFS_NR = 3, - FLUSH_DELAYED_REFS = 4, - FLUSH_DELALLOC = 5, - FLUSH_DELALLOC_WAIT = 6, - FLUSH_DELALLOC_FULL = 7, - ALLOC_CHUNK = 8, - ALLOC_CHUNK_FORCE = 9, - RUN_DELAYED_IPUTS = 10, - COMMIT_TRANS = 11, -}; - -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); - -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, - u64 disk_num_bytes, bool noflush); -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); -int btrfs_init_space_info(struct btrfs_fs_info *fs_info); -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_start_write_no_snapshotting(struct btrfs_root *root); -void btrfs_end_write_no_snapshotting(struct btrfs_root *root); -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); - /* ctree.c */ +int __init btrfs_ctree_init(void); +void __cold btrfs_ctree_exit(void); int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); @@ -3103,14 +667,7 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, (path)->slots[0]++ \ ) -static inline int btrfs_next_old_item(struct btrfs_root *root, - struct btrfs_path *p, u64 time_seq) -{ - ++p->slots[0]; - if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_old_leaf(root, p, time_seq); - return 0; -} +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); /* * Search the tree again to find a leaf with greater keys. 
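btrfs_next_old_item(), un-inlined by this hunk, just bumps path->slots[0] and falls through to btrfs_next_old_leaf() once the slot runs past the leaf's nritems. A typical forward walk built on the btrfs_next_item() wrapper shown in the next hunk looks like this sketch; demo_walk_items() is hypothetical, and setup via btrfs_search_slot() plus path cleanup are elided:

static int demo_walk_items(struct btrfs_root *root, struct btrfs_path *path)
{
	int ret = 0;

	while (ret == 0) {
		struct btrfs_key key;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		/* ... inspect the item behind (key, path->slots[0]) ... */

		/* 0 = advanced, 1 = no more items, < 0 = error. */
		ret = btrfs_next_item(root, path);
	}
	return ret < 0 ? ret : 0;
}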
@@ -3128,878 +685,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); -int btrfs_drop_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *node, - struct extent_buffer *parent); -static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) -{ - /* - * Do it this way so we only ever do one test_bit in the normal case. - */ - if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { - if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) - return 2; - return 1; - } - return 0; -} - -/* - * If we remount the fs to be R/O or umount the fs, the cleaner needn't do - * anything except sleeping. This function is used to check the status of - * the fs. - * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, - * since setting and checking for SB_RDONLY in the superblock's flags is not - * atomic. - */ -static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || - btrfs_fs_closing(fs_info); -} - -static inline void btrfs_set_sb_rdonly(struct super_block *sb) -{ - sb->s_flags |= SB_RDONLY; - set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -static inline void btrfs_clear_sb_rdonly(struct super_block *sb) -{ - sb->s_flags &= ~SB_RDONLY; - clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -/* root-item.c */ -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len); -int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len); -int btrfs_del_root(struct btrfs_trans_handle *trans, - const struct btrfs_key *key); -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct btrfs_key *key, - struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); -int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, - struct btrfs_path *path, struct btrfs_root_item *root_item, - struct btrfs_key *root_key); -int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); -void btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); -void btrfs_check_and_init_root_item(struct btrfs_root_item *item); -void btrfs_update_root_times(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - -/* uuid-tree.c */ -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); - -/* dir-item.c */ -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index); -struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, int name_len, - int mod); -struct btrfs_dir_item * 
-btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod); -struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len); -int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_dir_item *di); -int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - const char *name, u16 name_len, - const void *data, u16 data_len); -struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, u16 name_len, - int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - const char *name, - int name_len); - -/* orphan.c */ -int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); - -/* file-item.c */ -int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, - u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset, u64 ram_bytes, - u8 compression, u8 encryption, u16 other_encoding); -int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - u64 bytenr, int mod); -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit); -void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, - const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, - const bool new_inline, - struct extent_map *em); -int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); -u64 btrfs_file_extent_end(const struct btrfs_path *path); - -/* inode.c */ -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -struct extent_map 
*btrfs_get_extent_fiemap(struct btrfs_inode *inode, - u64 start, u64 len); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool strict); - -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode); -struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); -int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len); -int btrfs_add_link(struct btrfs_trans_handle *trans, - struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index); -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); - -int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, - bool in_reclaim_context); -int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - unsigned int extra_bits, - struct extent_state **cached_state); -struct btrfs_new_inode_args { - /* Input */ - struct inode *dir; - struct dentry *dentry; - struct inode *inode; - bool orphan; - bool subvol; - - /* - * Output from btrfs_new_inode_prepare(), input to - * btrfs_create_new_inode(). - */ - struct posix_acl *default_acl; - struct posix_acl *acl; -}; -int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, - unsigned int *trans_num_items); -int btrfs_create_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_new_inode_args *args); -void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, - struct inode *dir); - void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - u32 bits); -void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, u32 bits); -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, - struct extent_state *other); -void btrfs_split_delalloc_extent(struct inode *inode, - struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -void btrfs_evict_inode(struct inode *inode); -int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -struct inode *btrfs_alloc_inode(struct super_block *sb); -void btrfs_destroy_inode(struct inode *inode); -void btrfs_free_inode(struct inode *inode); -int btrfs_drop_inode(struct inode *inode); -int __init btrfs_init_cachep(void); -void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path); -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); -struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end); -int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_orphan_add(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); -int btrfs_orphan_cleanup(struct 
btrfs_root *root); -int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); -void btrfs_add_delayed_iput(struct inode *inode); -void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_prealloc_file_range_trans(struct inode *inode, - struct btrfs_trans_handle *trans, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); -int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, - int compress_type); -int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); -ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); - -ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); - -extern const struct dentry_operations btrfs_dentry_operations; - -/* Inode locking type flags, by default the exclusive lock is taken */ -#define BTRFS_ILOCK_SHARED (1U << 0) -#define BTRFS_ILOCK_TRY (1U << 1) -#define BTRFS_ILOCK_MMAP (1U << 2) - -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); -void btrfs_update_inode_bytes(struct btrfs_inode *inode, - const u64 add_bytes, - const u64 del_bytes); -void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); - -/* ioctl.c */ -long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int btrfs_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa); -int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(u8 *uuid); -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag); -void btrfs_get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space); -void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_balance_args *bargs); -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op); - - -/* file.c */ -int __init btrfs_auto_defrag_init(void); -void __cold 
btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); -int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, - int skip_pinned); -extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_drop_extents_args *args); -int btrfs_replace_file_extents(struct btrfs_inode *inode, - struct btrfs_path *path, const u64 start, - const u64 end, - struct btrfs_replace_extent_info *extent_info, - struct btrfs_trans_handle **trans_out); -int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u64 start, u64 end); -ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); -int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes); -void btrfs_check_nocow_unlock(struct btrfs_inode *inode); - -/* tree-defrag.c */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - -/* super.c */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); -int btrfs_sync_fs(struct super_block *sb, int wait); -char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid); - -static inline __printf(2, 3) __cold -void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ -} - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_printk(fs_info, fmt, args...) \ -do { \ - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ - _btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#elif defined(CONFIG_PRINTK) - -#define btrfs_printk(fs_info, fmt, args...) \ - _btrfs_printk(fs_info, fmt, ##args) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#else - -#define btrfs_printk(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, fmt, ##args) -#endif - -#define btrfs_emerg(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use printk_in_rcu - */ -#define btrfs_emerg_in_rcu(fs_info, fmt, args...) 
\ - btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk_in_rcu - */ -#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk - */ -#define btrfs_emerg_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) -#define btrfs_debug(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ - fs_info, KERN_DEBUG fmt, ##args) -#elif defined(DEBUG) -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) -#else -#define btrfs_debug(fs_info, fmt, args...) 
\ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#endif - -#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_no_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - if (__ratelimit(&_rs)) \ - btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk_ratelimited(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#ifdef CONFIG_BTRFS_ASSERT -__cold __noreturn -static inline void assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} - -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__)) - -#else -static inline void assertfail(const char *expr, const char* file, int line) { } -#define ASSERT(expr) (void)(expr) -#endif - -#if BITS_PER_LONG == 32 -#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) -/* - * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical - * addresses of extents. - * - * For 4K page size it's about 10T, for 64K it's 160T. - */ -#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) -void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); -void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); -#endif - -/* - * Get the correct offset inside the page of extent buffer. - * - * @eb: target extent buffer - * @start: offset inside the extent buffer - * - * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. - */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. - * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. - */ - return offset_in_page(offset + eb->start); -} - -static inline unsigned long get_eb_page_index(unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. - * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. - */ - return offset >> PAGE_SHIFT; -} - -/* - * Use that for functions that are conditionally exported for sanity tests but - * otherwise static - */ -#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define EXPORT_FOR_TESTS static -#else -#define EXPORT_FOR_TESTS -#endif - -__cold -static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. 
Please re-create your filesystem with a newer kernel"); -} - -__printf(5, 6) -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -const char * __attribute_const__ btrfs_decode_error(int errno); - -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact line number is reported. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - if ((errno) != -EIO && (errno) != -EROFS) { \ - WARN(1, KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ - } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno)); \ -} while (0) - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - printk_index_subsys_emit( \ - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ - KERN_CRIT, fmt); \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -#else - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) - -#endif - -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) -#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ - (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ - &(fs_info)->fs_state))) - -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) 
\ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - - -/* compatibility and incompatibility defines */ - -#define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "setting incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "clearing incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_incompat(fs_info, opt) \ - __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) - -static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_incompat_flags(disk_super) & flag); -} - -#define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "setting compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "clearing compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_compat_ro(fs_info, opt) \ - 
__btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) - -static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_compat_ro_flags(disk_super) & flag); -} - -/* acl.c */ -#ifdef CONFIG_BTRFS_FS_POSIX_ACL -struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, - struct posix_acl *acl, int type); -int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, - struct posix_acl *acl, int type); -#else -#define btrfs_get_acl NULL -#define btrfs_set_acl NULL -static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct posix_acl *acl, - int type) -{ - return -EOPNOTSUPP; -} -#endif - -/* relocation.c */ -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); -int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); -struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, - u64 bytenr); -int btrfs_should_ignore_reloc_root(struct btrfs_root *root); - -/* scrub.c */ -int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, - u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); -void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); -void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); -int btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel_dev(struct btrfs_device *dev); -int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, - struct btrfs_scrub_progress *progress); -static inline void btrfs_init_full_stripe_locks_tree( - struct btrfs_full_stripe_locks_tree *locks_root) -{ - locks_root->root = RB_ROOT; - mutex_init(&locks_root->lock); -} - -/* dev-replace.c */ -void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); - -static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) -{ - btrfs_bio_counter_sub(fs_info, 1); -} static inline int is_fstree(u64 rootid) { @@ -4010,72 +695,16 @@ static inline int is_fstree(u64 rootid) return 0; } -static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) -{ - return signal_pending(current); -} - -/* verity.c */ -#ifdef CONFIG_FS_VERITY - -extern const struct fsverity_operations btrfs_verityops; -int btrfs_drop_verity_items(struct btrfs_inode *inode); - -BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, - encryption, 8); -BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, - size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, - struct 
btrfs_verity_descriptor_item, encryption, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
- struct btrfs_verity_descriptor_item, size, 64);
-
-#else
-
-static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
-{
- return 0;
-}
-
-#endif
-
-/* Sanity test specific functions */
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_destroy_inode(struct inode *inode);
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
-{
- return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
-}
-#else
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
-{
- return 0;
-}
-#endif
-
-static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
-{
- return fs_info->zone_size > 0;
-}
-
-/*
- * Count how many fs_info->max_extent_size cover the @size
- */
-static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
-{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (!fs_info)
- return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-#endif
-
- return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
-}
-
 static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
 {
 return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
 }
+int btrfs_super_csum_size(const struct btrfs_super_block *s);
+const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __attribute_const__ btrfs_get_num_csums(void);
+
 /*
  * We use page status Private2 to indicate there is an ordered extent with
  * unfinished IO.
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
new file mode 100644
index 000000000000..0a3c261b69c9
--- /dev/null
+++ b/fs/btrfs/defrag.c
@@ -0,0 +1,1376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+#include "accessors.h"
+#include "messages.h"
+#include "delalloc-space.h"
+#include "subpage.h"
+#include "defrag.h"
+#include "file-item.h"
+#include "super.h"
+
+static struct kmem_cache *btrfs_inode_defrag_cachep;
+
+/*
+ * When auto defrag is enabled we queue up these defrag structs to remember
+ * which inodes need defragging passes.
+ */
+struct inode_defrag {
+ struct rb_node rb_node;
+ /* Inode number */
+ u64 ino;
+ /*
+ * Transid where the defrag was added, we search for extents newer than
+ * this.
+ */
+ u64 transid;
+
+ /* Root objectid */
+ u64 root;
+
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents, thus
+ * needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
+};
+
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
+/*
+ * Put a record for an inode into the defrag tree. The lock must be held
+ * already.
+ *
+ * If you're inserting a record for an older transid than an existing record,
+ * the transid already in the tree is lowered.
+ *
+ * If an existing record is found the defrag item you pass in is freed.
+ */
+static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct inode_defrag *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ p = &fs_info->defrag_inodes.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
+ p = &parent->rb_left;
+ else if (ret > 0)
+ p = &parent->rb_right;
+ else {
+ /*
+ * If we're reinserting an entry for an old defrag run,
+ * make sure to lower the transid of our existing
+ * record.
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
+ return -EEXIST;
+ }
+ }
+ set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
+ rb_link_node(&defrag->rb_node, parent, p);
+ rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
+ return 0;
+}
+
+static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
+ return 0;
+
+ if (btrfs_fs_closing(fs_info))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Insert a defrag record for this inode if auto defrag is enabled.
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u32 extent_thresh)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode_defrag *defrag;
+ u64 transid;
+ int ret;
+
+ if (!__need_auto_defrag(fs_info))
+ return 0;
+
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
+ return 0;
+
+ if (trans)
+ transid = trans->transid;
+ else
+ transid = inode->root->last_trans;
+
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
+ if (!defrag)
+ return -ENOMEM;
+
+ defrag->ino = btrfs_ino(inode);
+ defrag->transid = transid;
+ defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
+ /*
+ * If we set the IN_DEFRAG flag and evict the inode from memory,
+ * and then re-read this inode, this new inode doesn't have the
+ * IN_DEFRAG flag. In that case, we may find an existing defrag
+ * record.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return 0;
+}
+
+/*
+ * Pick the defraggable inode that we want; if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *btrfs_pick_defrag_inode(
+ struct btrfs_fs_info *fs_info, u64 root, u64 ino)
+{
+ struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
+ struct rb_node *p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
+ while (p) {
+ parent = p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
+ p = parent->rb_left;
+ else if (ret > 0)
+ p = parent->rb_right;
+ else
+ goto out;
+ }
+
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+ else
+ entry = NULL;
+ }
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
+}
+
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ cond_resched_lock(&fs_info->defrag_inodes_lock);
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *inode_root;
+ struct inode *inode;
+ struct btrfs_ioctl_defrag_range_args range;
+ int ret = 0;
+ u64 cur = 0;
+
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
+
+ /* Get the inode */
+ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
+ if (IS_ERR(inode_root)) {
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+
+ inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ btrfs_put_root(inode_root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto cleanup;
+ }
+
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
+
+ /* Do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ memset(&range, 0, sizeof(range));
+ range.len = (u64)-1;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
+
+ sb_start_write(fs_info->sb);
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ iput(inode);
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
+cleanup:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
+}
+
+/*
+ * Run through the list of inodes in the FS that need defragging.
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
+
+ atomic_inc(&fs_info->defrag_running);
+ while (1) {
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ break;
+
+ if (!__need_auto_defrag(fs_info))
+ break;
+
+ /* find an inode to defrag */
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
+ if (!defrag) {
+ if (root_objectid || first_ino) {
+ root_objectid = 0;
+ first_ino = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
+
+ __btrfs_run_defrag_inode(fs_info, defrag);
+ }
+ atomic_dec(&fs_info->defrag_running);
+
+ /*
+ * During unmount, we use the transaction_wait queue to wait for the
+ * defragger to stop.
+ */
+ wake_up(&fs_info->transaction_wait);
+ return 0;
+}
+
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
+ * better reflect disk order.
+ */
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ int ret = 0;
+ int wret;
+ int level;
+ int next_key_ret = 0;
+ u64 last_ret = 0;
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(root->node);
+
+ if (level == 0)
+ goto out;
+
+ if (root->defrag_progress.objectid == 0) {
+ struct extent_buffer *root_node;
+ u32 nritems;
+
+ root_node = btrfs_lock_root_node(root);
+ nritems = btrfs_header_nritems(root_node);
+ root->defrag_max.objectid = 0;
+ /* from above we know this is not a leaf */
+ btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+ nritems - 1);
+ btrfs_tree_unlock(root_node);
+ free_extent_buffer(root_node);
+ memset(&key, 0, sizeof(key));
+ } else {
+ memcpy(&key, &root->defrag_progress, sizeof(key));
+ }
+
+ path->keep_locks = 1;
+
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(path);
+ /*
+ * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+ * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
+ * deadlock later (attempting to write lock an already write locked
+ * leaf).
+ */
+ path->lowest_level = 1;
+ wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ if (!path->nodes[1]) {
+ ret = 0;
+ goto out;
+ }
+ /*
+ * The node at level 1 must always be locked when our path has
+ * keep_locks set and lowest_level is 1, regardless of the value of
+ * path->slots[1].
+ */
+ BUG_ON(path->locks[1] == 0);
+ ret = btrfs_realloc_node(trans, root,
+ path->nodes[1], 0,
+ &last_ret,
+ &root->defrag_progress);
+ if (ret) {
+ WARN_ON(ret == -EAGAIN);
+ goto out;
+ }
+ /*
+ * Now that we reallocated the node we can find the next key. Note that
+ * btrfs_find_next_key() can release our path and do another search
+ * without COWing, this is because even with path->keep_locks = 1,
+ * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
+ * node when path->slots[node_level - 1] does not point to the last
+ * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+ * we search for the next key after reallocating our node.
+ */
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ BTRFS_OLDEST_GENERATION);
+ if (next_key_ret == 0) {
+ memcpy(&root->defrag_progress, &key, sizeof(key));
+ ret = -EAGAIN;
+ }
+out:
+ btrfs_free_path(path);
+ if (ret == -EAGAIN) {
+ if (root->defrag_max.objectid > root->defrag_progress.objectid)
+ goto done;
+ if (root->defrag_max.type > root->defrag_progress.type)
+ goto done;
+ if (root->defrag_max.offset > root->defrag_progress.offset)
+ goto done;
+ ret = 0;
+ }
done:
+ if (ret != -EAGAIN)
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+
+ return ret;
+}
+
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than the given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return a valid em if we find a file extent matching the requirement.
+ * Return NULL if we cannot find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
+ }
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes the path point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
+
+ /* We didn't find a perfect match, need to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
+
iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
+
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+ /*
+ * We may go one slot back to an INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still have the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+
+ /* It's beyond our target range, definitely no extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
+ }
+
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
+
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, em);
+ break;
next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
+ }
+ btrfs_release_path(&path);
+ return em;
+
not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+}
+
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ u64 newer_than, bool locked)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
+
+ /*
+ * Hopefully we have this extent in the tree already, try without the
+ * full extent lock.
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
+ read_unlock(&em_tree->lock);
+
+ /*
+ * We can get a merged extent, in that case, we need to re-search the
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
+ if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + sectorsize - 1;
+
+ /* Get the big lock and read metadata off disk. */
+ if (!locked)
+ lock_extent(io_tree, start, end, &cached);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
+ if (!locked)
+ unlock_extent(io_tree, start, end, &cached);
+
+ if (IS_ERR(em))
+ return NULL;
+ }
+
+ return em;
+}
+
+static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
+ const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return fs_info->max_extent_size;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ u32 extent_thresh, u64 newer_than, bool locked)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *next;
+ bool ret = false;
+
+ /* This is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+ /*
+ * Here we need to pass @newer_than when checking the next extent, or
+ * we will hit a case where we mark the current extent for defrag, but
+ * the next one will not be a target.
+ * This will just cause extra IO without really reducing the fragments.
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
+ /* No more em or hole */
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(fs_info, em))
+ goto out;
+ /* Skip older extent */
+ if (next->generation < newer_than)
+ goto out;
+ /* Also check extent size */
+ if (next->len >= extent_thresh)
+ goto out;
+
+ ret = true;
out:
+ free_extent_map(next);
+ return ret;
+}
+
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: The caller should also wait for page writeback after the cluster is
+ * prepared; we don't wait for writeback on each page here.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
+{
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
+ */
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
+
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
+
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need to check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
+
+ /*
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
+ */
+ if (!PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Collect all valid target extents.
+ *
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold; any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: if the extent lock is already held on the range
+ * @target_list: list of target file extents
+ */
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ bool last_is_target = false;
+ u64 cur = start;
+ int ret = 0;
+
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
+
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
+ if (!em)
+ break;
+
+ /*
+ * If the file extent is an inlined one, we may still want to
+ * defrag it (fallthrough) if defragging it will produce a
+ * regular extent. This is for users who want to convert inline
+ * extents to regular ones through the max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
+
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
+
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
+
+ /* This em is under writeback, no need to defrag */
+ if (em->generation == (u64)-1)
+ goto next;
+
+ /*
+ * Our start offset might be in the middle of an existing extent
+ * map, so take that into account.
+ */
+ range_len = em->len - (cur - em->start);
+ /*
+ * If this range of the extent map is already flagged for delalloc,
+ * skip it, because:
+ *
+ * 1) We could deadlock later, when trying to reserve space for
+ * delalloc, because in case we can't immediately reserve space
+ * the flusher can start delalloc and wait for the respective
+ * ordered extents to complete. The deadlock would happen
+ * because we do the space reservation while holding the range
+ * locked, and starting writeback, or finishing an ordered
+ * extent, requires locking the range;
+ *
+ * 2) If there's delalloc there, it means there's dirty pages for
+ * which writeback has not started yet (we clean the delalloc
+ * flag when starting writeback and after creating an ordered
+ * extent). If we mark pages in an adjacent range for defrag,
+ * then we will have a larger contiguous range for delalloc,
+ * very likely resulting in a larger extent after writeback is
+ * triggered (except in a case of free space fragmentation).
+ */
+ if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC, 0, NULL))
+ goto next;
+
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (range_len >= extent_thresh)
+ goto next;
+
+ /*
+ * Skip extents already at their max capacity, this is mostly for
+ * compressed extents, whose max capacity is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(fs_info, em))
+ goto next;
+
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable will normally be false and the extent will
+ * not be defragged. So if an inline extent passes all the above
+ * checks, just add it for defrag, so it can be converted to a
+ * regular extent.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ extent_thresh, newer_than, locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
+ }
+
add:
+ last_is_target = true;
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
+ }
+ /* Fall through to allocate a new entry */
+ }
+
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
+ }
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
+
next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ }
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go to the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
+ return ret;
+}
+
+#define CLUSTER_SIZE (SZ_256K)
+static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the page range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
+
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ }
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
+
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+
+ /* Lock the pages range */
+ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+ /*
+ * Now we have a consistent view of the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * And this time we have the extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause a deadlock.
+ */ + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, true, + &target_list, last_scanned_ret); + if (ret < 0) + goto unlock_extent; + + list_for_each_entry(entry, &target_list, list) { + ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + &cached_state); + if (ret < 0) + break; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } +unlock_extent: + unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); +free_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + kfree(pages); + return ret; +} + +static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, + u64 start, u32 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + unsigned long *sectors_defragged, + unsigned long max_sectors, + u64 *last_scanned_ret) +{ + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + int ret; + + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, false, + &target_list, NULL); + if (ret < 0) + goto out; + + list_for_each_entry(entry, &target_list, list) { + u32 range_len = entry->len; + + /* Reached or beyond the limit */ + if (max_sectors && *sectors_defragged >= max_sectors) { + ret = 1; + break; + } + + if (max_sectors) + range_len = min_t(u32, range_len, + (max_sectors - *sectors_defragged) * sectorsize); + + /* + * If defrag_one_range() has updated last_scanned_ret, + * our range may already be invalid (e.g. hole punched). + * Skip if our range is before last_scanned_ret, as there is + * no need to defrag the range anymore. + */ + if (entry->start + range_len <= *last_scanned_ret) + continue; + + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + (entry->start >> PAGE_SHIFT) + 1); + /* + * Here we may not defrag any range if holes are punched before + * we locked the pages. + * But that's fine, it only affects the @sectors_defragged + * accounting. + */ + ret = defrag_one_range(inode, entry->start, range_len, + extent_thresh, newer_than, do_compress, + last_scanned_ret); + if (ret < 0) + break; + *sectors_defragged += range_len >> + inode->root->fs_info->sectorsize_bits; + } +out: + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + if (ret >= 0) + *last_scanned_ret = max(*last_scanned_ret, start + len); + return ret; +} + +/* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NULL) + * @range: defrag options including range and flags + * @newer_than: minimum transid to defrag + * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode + * will be defragged. + * + * Return <0 for error. + * Return >=0 for the number of sectors defragged, and range->start will be updated + * to indicate the file offset where next defrag should be started at. + * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without + * defragging all the range).
+ */ +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long sectors_defragged = 0; + u64 isize = i_size_read(inode); + u64 cur; + u64 last_byte; + bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); + bool ra_allocated = false; + int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; + u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; + + if (isize == 0) + return 0; + + if (range->start >= isize) + return -EINVAL; + + if (do_compress) { + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } + + if (extent_thresh == 0) + extent_thresh = SZ_256K; + + if (range->start + range->len > range->start) { + /* Got a specific range */ + last_byte = min(isize, range->start + range->len); + } else { + /* Defrag until file end */ + last_byte = isize; + } + + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + + /* + * If we were not given a ra, allocate a readahead context. As + * readahead is just an optimization, defrag will work without it so + * we don't error out. + */ + if (!ra) { + ra_allocated = true; + ra = kzalloc(sizeof(*ra), GFP_KERNEL); + if (ra) + file_ra_state_init(ra, inode->i_mapping); + } + + /* + * Make writeback start from the beginning of the range, so that the + * defrag range can be written sequentially. + */ + start_index = cur >> PAGE_SHIFT; + if (start_index < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = start_index; + + while (cur < last_byte) { + const unsigned long prev_sectors_defragged = sectors_defragged; + u64 last_scanned = cur; + u64 cluster_end; + + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; + } + + /* We want the cluster end at page boundary when possible */ + cluster_end = (((cur >> PAGE_SHIFT) + + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; + cluster_end = min(cluster_end, last_byte); + + btrfs_inode_lock(BTRFS_I(inode), 0); + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + btrfs_inode_unlock(BTRFS_I(inode), 0); + break; + } + if (!(inode->i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(BTRFS_I(inode), 0); + break; + } + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + cluster_end + 1 - cur, extent_thresh, + newer_than, do_compress, &sectors_defragged, + max_to_defrag, &last_scanned); + + if (sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + + btrfs_inode_unlock(BTRFS_I(inode), 0); + if (ret < 0) + break; + cur = max(cluster_end + 1, last_scanned); + if (ret > 0) { + ret = 0; + break; + } + cond_resched(); + } + + if (ra_allocated) + kfree(ra); + /* + * Update range.start for autodefrag, this will indicate where to start + * in next run. + */ + range->start = cur; + if (sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately.
+ */ + if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + if (range->compress_type == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + else if (range->compress_type == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + ret = sectors_defragged; + } + if (do_compress) { + btrfs_inode_lock(BTRFS_I(inode), 0); + BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(BTRFS_I(inode), 0); + } + return ret; +} + +void __cold btrfs_auto_defrag_exit(void) +{ + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int __init btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h new file mode 100644 index 000000000000..5305f2283b5e --- /dev/null +++ b/fs/btrfs/defrag.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DEFRAG_H +#define BTRFS_DEFRAG_H + +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag); +int __init btrfs_auto_defrag_init(void); +void __cold btrfs_auto_defrag_exit(void); +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + +#endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 1e8f17ff829e..7ddb1d104e8e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "ctree.h" #include "delalloc-space.h" #include "block-rsv.h" @@ -8,6 +9,7 @@ #include "transaction.h" #include "qgroup.h" #include "block-group.h" +#include "fs.h" /* * HOW DOES THIS WORK @@ -127,9 +129,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) } int btrfs_check_data_free_space(struct btrfs_inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) + struct extent_changeset **reserved, u64 start, + u64 len, bool noflush) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; int ret; /* align the range */ @@ -137,7 +141,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (noflush) + flush = BTRFS_RESERVE_NO_FLUSH; + else if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; + + ret = btrfs_reserve_data_bytes(fs_info, len, flush); if (ret < 0) return ret; @@ -193,8 +202,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, reserved, start, len); } -/** - * Release any excessive reservation +/* + * Release any excessive reservations for an inode. 
* * @inode: the inode we need to release from * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup @@ -368,12 +377,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, return 0; } -/** - * Release a metadata reservation for an inode +/* + * Release a metadata reservation for an inode. * - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata @@ -396,10 +405,11 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, btrfs_inode_rsv_release(inode, qgroup_free); } -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. - * @num_bytes: the number of bytes we originally reserved with +/* + * Release our outstanding_extents for an inode. + * + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -424,37 +434,37 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. +/* + * Reserve data and metadata space for delalloc + * + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. * * This will do the following things * - * - reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space + * - reserve space in data space info for num bytes and reserve precious + * corresponding qgroup space * (Done in check_data_free_space) * * - reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. + * extents and how much csums will be needed also reserve metadata space in a + * per root over-reserve method. * - add to the inodes->delalloc_bytes * - add it to the fs_info's delalloc inodes list. 
* (Above 3 all done in delalloc_reserve_metadata) * * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) + * Return <0 for error(-ENOSPC or -EDQUOT) */ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, reserved, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len, false); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); @@ -466,7 +476,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, return ret; } -/** +/* * Release data and metadata space for delalloc * * @inode: inode we're releasing space for @@ -475,10 +485,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, * @len: length of the space already reserved * @qgroup_free: should qgroup reserved-space also be freed * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. + * Release the metadata space that was not used and will decrement + * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if + * there are no delalloc bytes left. Also it will handle the qgroup reserved + * space. */ void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 28bf5c3ef430..c5d573f2366e 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -7,7 +7,8 @@ struct extent_changeset; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); + struct extent_changeset **reserved, u64 start, u64 len, + bool noflush); void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct btrfs_inode *inode, @@ -19,5 +20,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); #endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index e7f34871a132..0095c6e4c3d1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,14 +6,19 @@ #include <linux/slab.h> #include <linux/iversion.h> +#include "ctree.h" +#include "fs.h" +#include "messages.h" #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" -#include "ctree.h" #include "qgroup.h" #include "locking.h" #include "inode-item.h" +#include "space-info.h" +#include "accessors.h" +#include "file-item.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -302,15 +307,21 @@ static inline void btrfs_release_prepared_delayed_node( __btrfs_release_delayed_node(node, 1); } -static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, + struct btrfs_delayed_node *node, + enum 
btrfs_delayed_item_type type) { struct btrfs_delayed_item *item; + item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); if (item) { item->data_len = data_len; - item->ins_or_del = 0; + item->type = type; item->bytes_reserved = 0; - item->delayed_node = NULL; + item->delayed_node = node; + RB_CLEAR_NODE(&item->rb_node); + INIT_LIST_HEAD(&item->log_list); + item->logged = false; refcount_set(&item->refs, 1); } return item; @@ -319,72 +330,32 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) /* * __btrfs_lookup_delayed_item - look up the delayed item by key * @delayed_node: pointer to the delayed node - * @key: the key to look up - * @prev: used to store the prev item if the right item isn't found - * @next: used to store the next item if the right item isn't found + * @index: the dir index value to lookup (offset of a dir index key) * * Note: if we don't find the right item, we will return the prev item and * the next item. */ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( struct rb_root *root, - struct btrfs_key *key, - struct btrfs_delayed_item **prev, - struct btrfs_delayed_item **next) + u64 index) { - struct rb_node *node, *prev_node = NULL; + struct rb_node *node = root->rb_node; struct btrfs_delayed_item *delayed_item = NULL; - int ret = 0; - - node = root->rb_node; while (node) { delayed_item = rb_entry(node, struct btrfs_delayed_item, rb_node); - prev_node = node; - ret = btrfs_comp_cpu_keys(&delayed_item->key, key); - if (ret < 0) + if (delayed_item->index < index) node = node->rb_right; - else if (ret > 0) + else if (delayed_item->index > index) node = node->rb_left; else return delayed_item; } - if (prev) { - if (!prev_node) - *prev = NULL; - else if (ret < 0) - *prev = delayed_item; - else if ((node = rb_prev(prev_node)) != NULL) { - *prev = rb_entry(node, struct btrfs_delayed_item, - rb_node); - } else - *prev = NULL; - } - - if (next) { - if (!prev_node) - *next = NULL; - else if (ret > 0) - *next = delayed_item; - else if ((node = rb_next(prev_node)) != NULL) { - *next = rb_entry(node, struct btrfs_delayed_item, - rb_node); - } else - *next = NULL; - } return NULL; } -static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key, - NULL, NULL); -} - static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins) { @@ -392,15 +363,13 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct rb_node *parent_node = NULL; struct rb_root_cached *root; struct btrfs_delayed_item *item; - int cmp; bool leftmost = true; - if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; - else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) - root = &delayed_node->del_root; else - BUG(); + root = &delayed_node->del_root; + p = &root->rb_root.rb_node; node = &ins->rb_node; @@ -409,11 +378,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, item = rb_entry(parent_node, struct btrfs_delayed_item, rb_node); - cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); - if (cmp < 0) { + if (item->index < ins->index) { p = &(*p)->rb_right; leftmost = false; - } else if (cmp > 0) { + } else if (item->index > ins->index) { p = &(*p)->rb_left; } else { return -EEXIST; @@ -422,14 +390,10 @@ static int __btrfs_add_delayed_item(struct 
btrfs_delayed_node *delayed_node, rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, leftmost); - ins->delayed_node = delayed_node; - - /* Delayed items are always for dir index items. */ - ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY); - if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM && - ins->key.offset >= delayed_node->index_cnt) - delayed_node->index_cnt = ins->key.offset + 1; + if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && + ins->index >= delayed_node->index_cnt) + delayed_node->index_cnt = ins->index + 1; delayed_node->count++; atomic_inc(&delayed_node->root->fs_info->delayed_root->items); @@ -451,21 +415,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; - /* Not associated with any delayed_node */ - if (!delayed_item->delayed_node) + /* Not inserted, ignore it. */ + if (RB_EMPTY_NODE(&delayed_item->rb_node)) return; + delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); - BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM && - delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM); - if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_item->delayed_node->ins_root; else root = &delayed_item->delayed_node->del_root; rb_erase_cached(&delayed_item->rb_node, root); + RB_CLEAR_NODE(&delayed_item->rb_node); delayed_item->delayed_node->count--; finish_one_item(delayed_root); @@ -520,12 +484,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; u64 num_bytes; int ret; @@ -545,14 +508,14 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", - item->key.objectid, + item->delayed_node->inode_id, num_bytes, 1); /* * For insertions we track reserved metadata space by accounting * for the number of leaves that will be used, based on the delayed * node's index_items_size field. */ - if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) + if (item->type == BTRFS_DELAYED_DELETION_ITEM) item->bytes_reserved = num_bytes; } @@ -574,8 +537,8 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, * to release/reserve qgroup space. 
*/ trace_btrfs_space_reservation(fs_info, "delayed_item", - item->key.objectid, item->bytes_reserved, - 0); + item->delayed_node->inode_id, + item->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } @@ -688,6 +651,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *next; const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_item_batch batch; + struct btrfs_key first_key; + const u32 first_data_size = first_item->data_len; int total_size; char *ins_data = NULL; int ret; @@ -716,9 +681,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, ASSERT(first_item->bytes_reserved == 0); list_add_tail(&first_item->tree_list, &item_list); - batch.total_data_size = first_item->data_len; + batch.total_data_size = first_data_size; batch.nr = 1; - total_size = first_item->data_len + sizeof(struct btrfs_item); + total_size = first_data_size + sizeof(struct btrfs_item); curr = first_item; while (true) { @@ -732,8 +697,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, * We cannot allow gaps in the key space if we're doing log * replay. */ - if (continuous_keys_only && - (next->key.offset != curr->key.offset + 1)) + if (continuous_keys_only && (next->index != curr->index + 1)) break; ASSERT(next->bytes_reserved == 0); @@ -750,8 +714,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, } if (batch.nr == 1) { - batch.keys = &first_item->key; - batch.data_sizes = &first_item->data_len; + first_key.objectid = node->inode_id; + first_key.type = BTRFS_DIR_INDEX_KEY; + first_key.offset = first_item->index; + batch.keys = &first_key; + batch.data_sizes = &first_data_size; } else { struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -768,7 +735,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, batch.keys = ins_keys; batch.data_sizes = ins_sizes; list_for_each_entry(curr, &item_list, tree_list) { - ins_keys[i] = curr->key; + ins_keys[i].objectid = node->inode_id; + ins_keys[i].type = BTRFS_DIR_INDEX_KEY; + ins_keys[i].offset = curr->index; ins_sizes[i] = curr->data_len; i++; } @@ -864,6 +833,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { + const u64 ino = item->delayed_node->inode_id; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; struct extent_buffer *leaf = path->nodes[0]; @@ -902,7 +872,9 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, slot++; btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_comp_cpu_keys(&next->key, &key) != 0) + if (key.objectid != ino || + key.type != BTRFS_DIR_INDEX_KEY || + key.offset != next->index) break; nitems++; curr = next; @@ -920,9 +892,8 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, * Check btrfs_delayed_item_reserve_metadata() to see why we * don't need to release/reserve qgroup space. 
*/ - trace_btrfs_space_reservation(fs_info, "delayed_item", - item->key.objectid, total_reserved_size, - 0); + trace_btrfs_space_reservation(fs_info, "delayed_item", ino, + total_reserved_size, 0); btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, total_reserved_size, NULL); } @@ -940,8 +911,12 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_node *node) { + struct btrfs_key key; int ret = 0; + key.objectid = node->inode_id; + key.type = BTRFS_DIR_INDEX_KEY; + while (ret == 0) { struct btrfs_delayed_item *item; @@ -952,7 +927,8 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, break; } - ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1); + key.offset = item->index; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { /* * There's no matching item in the leaf. This means we @@ -1441,7 +1417,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1457,23 +1433,22 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); - delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len); + delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len, + delayed_node, + BTRFS_DELAYED_INSERTION_ITEM); if (!delayed_item) { ret = -ENOMEM; goto release_node; } - delayed_item->key.objectid = btrfs_ino(dir); - delayed_item->key.type = BTRFS_DIR_INDEX_KEY; - delayed_item->key.offset = index; - delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM; + delayed_item->index = index; dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; btrfs_set_stack_dir_transid(dir_item, trans->transid); btrfs_set_stack_dir_data_len(dir_item, 0); btrfs_set_stack_dir_name_len(dir_item, name_len); - btrfs_set_stack_dir_type(dir_item, type); + btrfs_set_stack_dir_flags(dir_item, flags); memcpy((char *)(dir_item + 1), name, name_len); data_len = delayed_item->data_len + sizeof(struct btrfs_item); @@ -1490,8 +1465,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, } if (reserve_leaf_space) { - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, - delayed_item); + ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item); /* * Space was reserved for a dir index item insertion when we * started the transaction, so getting a failure here should be @@ -1538,12 +1512,12 @@ release_node: static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, struct btrfs_delayed_node *node, - struct btrfs_key *key) + u64 index) { struct btrfs_delayed_item *item; mutex_lock(&node->mutex); - item = __btrfs_lookup_delayed_insertion_item(node, key); + item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); if (!item) { mutex_unlock(&node->mutex); return 1; @@ -1589,32 +1563,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; - struct btrfs_key item_key; int ret; node = btrfs_get_or_create_delayed_node(dir); if (IS_ERR(node)) return PTR_ERR(node); - item_key.objectid = btrfs_ino(dir); - item_key.type = BTRFS_DIR_INDEX_KEY; - item_key.offset = index; - 
- ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, - &item_key); + ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); if (!ret) goto end; - item = btrfs_alloc_delayed_item(0); + item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); if (!item) { ret = -ENOMEM; goto end; } - item->key = item_key; - item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM; + item->index = index; - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); + ret = btrfs_delayed_item_reserve_metadata(trans, item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. @@ -1679,8 +1646,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. */ - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - btrfs_inode_lock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1743,9 +1710,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, int ret = 0; list_for_each_entry(curr, del_list, readdir_list) { - if (curr->key.offset > index) + if (curr->index > index) break; - if (curr->key.offset == index) { + if (curr->index == index) { ret = 1; break; } @@ -1779,19 +1746,19 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (curr->key.offset < ctx->pos) { + if (curr->index < ctx->pos) { if (refcount_dec_and_test(&curr->refs)) kfree(curr); continue; } - ctx->pos = curr->key.offset; + ctx->pos = curr->index; di = (struct btrfs_dir_item *)curr->data; name = (char *)(di + 1); name_len = btrfs_stack_dir_name_len(di); - d_type = fs_ftype_to_dtype(di->type); + d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); btrfs_disk_key_to_cpu(&location, &di->location); over = !dir_emit(ctx, name, name_len, @@ -2085,3 +2052,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) } } +void btrfs_log_get_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + + node = btrfs_get_delayed_node(inode); + if (!node) + return; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_insertion_item(node); + while (item) { + /* + * It's possible that the item is already in a log list. This + * can happen in case two tasks are trying to log the same + * directory. For example if we have tasks A and task B: + * + * Task A collected the delayed items into a log list while + * under the inode's log_mutex (at btrfs_log_inode()), but it + * only releases the items after logging the inodes they point + * to (if they are new inodes), which happens after unlocking + * the log mutex; + * + * Task B enters btrfs_log_inode() and acquires the log_mutex + * of the same directory inode, before task B releases the + * delayed items. This can happen for example when logging some + * inode we need to trigger logging of its parent directory, so + * logging two files that have the same parent directory can + * lead to this. + * + * If this happens, just ignore delayed items already in a log + * list. 
All the tasks logging the directory are under a log + * transaction and whichever finishes first can not sync the log + * before the other completes and leaves the log transaction. + */ + if (!item->logged && list_empty(&item->log_list)) { + refcount_inc(&item->refs); + list_add_tail(&item->log_list, ins_list); + } + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(node); + while (item) { + /* It may be non-empty, for the same reason mentioned above. */ + if (!item->logged && list_empty(&item->log_list)) { + refcount_inc(&item->refs); + list_add_tail(&item->log_list, del_list); + } + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&node->mutex); + + /* + * We are called during inode logging, which means the inode is in use + * and can not be evicted before we finish logging the inode. So we never + * have the last reference on the delayed inode. + * Also, we don't use btrfs_release_delayed_node() because that would + * requeue the delayed inode (change its order in the list of prepared + * nodes) and we don't want to do such change because we don't create or + * delete delayed items. + */ + ASSERT(refcount_read(&node->refs) > 1); + refcount_dec(&node->refs); +} + +void btrfs_log_put_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + struct btrfs_delayed_item *next; + + node = btrfs_get_delayed_node(inode); + if (!node) + return; + + mutex_lock(&node->mutex); + + list_for_each_entry_safe(item, next, ins_list, log_list) { + item->logged = true; + list_del_init(&item->log_list); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } + + list_for_each_entry_safe(item, next, del_list, log_list) { + item->logged = true; + list_del_init(&item->log_list); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } + + mutex_unlock(&node->mutex); + + /* + * We are called during inode logging, which means the inode is in use + * and can not be evicted before we finish logging the inode. So we never + * have the last reference on the delayed inode. + * Also, we don't use btrfs_release_delayed_node() because that would + * requeue the delayed inode (change its order in the list of prepared + * nodes) and we don't want to do such change because we don't create or + * delete delayed items. + */ + ASSERT(refcount_read(&node->refs) > 1); + refcount_dec(&node->refs); +} diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 9795dc295a18..4f21daa3dbc7 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -16,9 +16,10 @@ #include <linux/refcount.h> #include "ctree.h" -/* types of the delayed item */ -#define BTRFS_DELAYED_INSERTION_ITEM 1 -#define BTRFS_DELAYED_DELETION_ITEM 2 +enum btrfs_delayed_item_type { + BTRFS_DELAYED_INSERTION_ITEM, + BTRFS_DELAYED_DELETION_ITEM +}; struct btrfs_delayed_root { spinlock_t lock; @@ -73,14 +74,27 @@ struct btrfs_delayed_node { struct btrfs_delayed_item { struct rb_node rb_node; - struct btrfs_key key; + /* Offset value of the corresponding dir index key. */ + u64 index; struct list_head tree_list; /* used for batch insert/delete items */ struct list_head readdir_list; /* used for readdir items */ + /* + * Used when logging a directory. + * Insertions and deletions to this list are protected by the parent + * delayed node's mutex. 
+ */ + struct list_head log_list; u64 bytes_reserved; struct btrfs_delayed_node *delayed_node; refcount_t refs; - int ins_or_del; - u32 data_len; + enum btrfs_delayed_item_type type:8; + /* + * Track if this delayed item was already logged. + * Protected by the mutex of the parent delayed inode. + */ + bool logged; + /* The maximum leaf size is 64K, so u16 is more than enough. */ + u16 data_len; char data[]; }; @@ -99,7 +113,7 @@ static inline void btrfs_init_delayed_root( int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, @@ -144,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list); +/* Used during directory logging. */ +void btrfs_log_get_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_log_put_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list); + /* for init */ int __init btrfs_delayed_inode_init(void); void __cold btrfs_delayed_inode_exit(void); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 36a3debe9493..573ebab886e2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -6,12 +6,14 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/sort.h> +#include "messages.h" #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" #include "tree-mod-log.h" +#include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -69,14 +71,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) return btrfs_check_space_for_delayed_refs(trans->fs_info); } -/** - * Release a ref head's reservation +/* + * Release a ref head's reservation. * * @fs_info: the filesystem * @nr: number of items to drop * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. + * Drops the delayed ref head's count from the delayed refs rsv and free any + * excess reservation we had. */ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { @@ -102,8 +104,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) } /* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs + * Adjust the size of the delayed refs rsv. * * This is to be called anytime we may have adjusted trans->delayed_ref_updates, * it'll calculate the additional size and add it to the delayed_refs_rsv. @@ -137,8 +138,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) trans->delayed_ref_updates = 0; } -/** - * Transfer bytes to our delayed refs rsv +/* + * Transfer bytes to our delayed refs rsv. * * @fs_info: the filesystem * @src: source block rsv to transfer from @@ -186,8 +187,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, delayed_refs_rsv->space_info, to_free); } -/** - * Refill based on our delayed refs usage +/* + * Refill based on our delayed refs usage. * * @fs_info: the filesystem * @flush: control how we can flush for this reservation. 
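The delayed-inode.h hunk above shrinks struct btrfs_delayed_item by replacing the embedded btrfs_key with a bare u64 index. A minimal standalone sketch of why a single u64 is enough (the demo_* names are invented for illustration, not kernel API): every delayed item of a given node shares the node's inode number as objectid and BTRFS_DIR_INDEX_KEY (96) as type, so the old three-field comparison always reduces to comparing offsets. This is also why the rb-tree insert and lookup in the delayed-inode.c hunks above can key on ->index alone.

/*
 * Illustrative only: models the delayed item key comparison before and
 * after the change. Compiles with any C99 compiler.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_key {
	uint64_t objectid;	/* inode number of the directory */
	uint8_t type;		/* always BTRFS_DIR_INDEX_KEY (96) here */
	uint64_t offset;	/* the dir index value */
};

/* Old style: compare the full (objectid, type, offset) tuple. */
static int demo_comp_keys(const struct demo_key *a, const struct demo_key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

/* New style: objectid and type are implied by the owning delayed node. */
static int demo_comp_index(uint64_t a, uint64_t b)
{
	return a < b ? -1 : (a > b ? 1 : 0);
}

int main(void)
{
	struct demo_key a = { 257, 96, 5 };
	struct demo_key b = { 257, 96, 9 };

	/* Both comparisons agree, since objectid and type always match. */
	printf("%d %d\n", demo_comp_keys(&a, &b),
	       demo_comp_index(a.offset, b.offset));
	return 0;
}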
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 41cddd3ff059..78696d331639 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -18,11 +18,13 @@ #include "volumes.h" #include "async-thread.h" #include "check-integrity.h" -#include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" #include "block-group.h" +#include "fs.h" +#include "accessors.h" +#include "scrub.h" /* * Device replace overview @@ -246,7 +248,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; - struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -290,19 +291,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - device = btrfs_alloc_device(NULL, &devid, NULL); + device = btrfs_alloc_device(NULL, &devid, NULL, device_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error; @@ -456,14 +450,6 @@ out: return ret; } -static char* btrfs_dev_name(struct btrfs_device *device) -{ - if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - return "<missing disk>"; - else - return rcu_str_deref(device->name); -} - static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_device *src_dev) { @@ -545,10 +531,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, if (!cache) continue; - spin_lock(&cache->lock); - cache->to_copy = 1; - spin_unlock(&cache->lock); - + set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); btrfs_put_block_group(cache); } if (iter_ret < 0) @@ -577,7 +560,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, return true; spin_lock(&cache->lock); - if (cache->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { spin_unlock(&cache->lock); return true; } @@ -610,9 +593,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, } /* Last stripe on this device */ - spin_lock(&cache->lock); - cache->to_copy = 0; - spin_unlock(&cache->lock); + clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); return true; } @@ -684,7 +665,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, "dev_replace from %s (devid %llu) to %s started", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); /* * from now on, the writes to the srcdev are all duplicated to @@ -943,7 +924,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_name(tgt_device), scrub_ret); error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); @@ -961,7 +942,7 @@ error: "dev_replace from %s (devid %llu) to %s finished", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; @@ -1288,11 +1269,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } 
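The mark_block_group_to_copy() hunks above drop the spinlock-protected to_copy field in favor of set_bit()/clear_bit()/test_bit() on the block group's runtime_flags word. A rough userspace model of that pattern using C11 atomics (the demo_* names are invented; the kernel uses its own set_bit() family on an unsigned long):

/*
 * Illustrative sketch, not kernel code: replacing a lock-protected
 * boolean with an atomic flag bit, as with BLOCK_GROUP_FLAG_TO_COPY.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define DEMO_FLAG_TO_COPY 0	/* bit number within runtime_flags */

struct demo_block_group {
	atomic_ulong runtime_flags;
};

static void demo_set_to_copy(struct demo_block_group *bg)
{
	atomic_fetch_or(&bg->runtime_flags, 1UL << DEMO_FLAG_TO_COPY);
}

static void demo_clear_to_copy(struct demo_block_group *bg)
{
	atomic_fetch_and(&bg->runtime_flags, ~(1UL << DEMO_FLAG_TO_COPY));
}

static bool demo_test_to_copy(struct demo_block_group *bg)
{
	return atomic_load(&bg->runtime_flags) & (1UL << DEMO_FLAG_TO_COPY);
}

Each helper is a single atomic read-modify-write or load, which is why the hunks above can delete the spin_lock()/spin_unlock() pairs around the flag updates entirely.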
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) -{ - percpu_counter_inc(&fs_info->dev_replace.bio_counter); -} - void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 3911049a5f23..675082ccec89 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -7,6 +7,10 @@ #define BTRFS_DEV_REPLACE_H struct btrfs_ioctl_dev_replace_args; +struct btrfs_fs_info; +struct btrfs_trans_handle; +struct btrfs_dev_replace; +struct btrfs_block_group; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); @@ -21,5 +25,13 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} + #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 72fb2c518a2b..082eb0e19598 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -3,9 +3,12 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "accessors.h" +#include "dir-item.h" /* * insert a name into a directory, doing overflow properly if there is a hash @@ -81,7 +84,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, &location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); @@ -103,8 +106,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * to use for the second index (if one is created). 
* Will return 0 or -ENOMEM */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -120,7 +123,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -128,9 +131,9 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, btrfs_cpu_key_to_disk(&disk_key, location); - data_size = sizeof(*dir_item) + name_len; + data_size = sizeof(*dir_item) + name->len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, - name, name_len); + name->name, name->len); if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); if (ret == -EEXIST) @@ -138,15 +141,18 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, goto out_free; } + if (IS_ENCRYPTED(&dir->vfs_inode)) + type |= BTRFS_FT_ENCRYPTED; + leaf = path->nodes[0]; btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_flags(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); - btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); - write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, name->name, name_ptr, name->len); btrfs_mark_buffer_dirty(leaf); second_insert: @@ -157,7 +163,7 @@ second_insert: } btrfs_release_path(path); - ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir, + ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); @@ -206,7 +212,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const char *name, int name_len, + const struct fscrypt_str *name, int mod) { struct btrfs_key key; @@ -214,9 +220,10 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; @@ -224,7 +231,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len) + const struct fscrypt_str *name) { int ret; struct btrfs_key key; @@ -240,9 +247,10 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0); + di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, + name->len, 0); if (IS_ERR(di)) { ret = 
PTR_ERR(di); /* Nothing found, we're safe */ @@ -262,11 +270,8 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, goto out; } - /* - * see if there is room in the item to insert this - * name - */ - data_size = sizeof(*di) + name_len; + /* See if there is room in the item to insert this name. */ + data_size = sizeof(*di) + name->len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + @@ -303,8 +308,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod) + u64 index, const struct fscrypt_str *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -313,7 +317,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DIR_INDEX_KEY; key.offset = index; - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (di == ERR_PTR(-ENOENT)) return NULL; @@ -321,9 +326,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, } struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len) +btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, + u64 dirid, const struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -338,7 +342,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, break; di = btrfs_match_dir_item_name(root->fs_info, path, - name, name_len); + name->name, name->len); if (di) return di; } diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h new file mode 100644 index 000000000000..aab4b7cc7fa0 --- /dev/null +++ b/fs/btrfs/dir-item.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DIR_ITEM_H +#define BTRFS_DIR_ITEM_H + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const struct fscrypt_str *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_lookup_dir_index_item( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 index, const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const struct fscrypt_str *name); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + const char *name, + int name_len); + +#endif diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 
e1b7bd927d69..ff2e524d9937 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -11,6 +11,7 @@ #include "block-group.h" #include "discard.h" #include "free-space-cache.h" +#include "fs.h" /* * This contains the logic to handle async discard. @@ -61,7 +62,7 @@ #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) #define BTRFS_DISCARD_MAX_IOPS (10U) -/* Montonically decreasing minimum length filters after index 0 */ +/* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { 0, BTRFS_ASYNC_DISCARD_MAX_FILTER, @@ -146,10 +147,11 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, return running; } -/** - * find_next_block_group - find block_group that's up next for discarding - * @discard_ctl: discard control - * @now: current time +/* + * Find block_group that's up next for discarding. + * + * @discard_ctl: discard control + * @now: current time * * Iterate over the discard lists to find the next block_group up for * discarding checking the discard_eligible_time of block_group. @@ -184,17 +186,17 @@ static struct btrfs_block_group *find_next_block_group( return ret_block_group; } -/** - * Wrap find_next_block_group() +/* + * Look up next block group and set it for use. * * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management * @now: time when discard was invoked, in ns * - * This wraps find_next_block_group() and sets the block_group to be in use. - * discard_state's control flow is managed here. Variables related to - * discard_state are reset here as needed (eg discard_cursor). @discard_state + * Wrap find_next_block_group() and set the block_group to be in use. + * @discard_state's control flow is managed here. Variables related to + * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state * and @discard_index are remembered as it may change while we're discarding, * but we want the discard to execute in the context determined here. */ @@ -233,10 +235,11 @@ again: return block_group; } -/** - * btrfs_discard_check_filter - updates a block groups filters - * @block_group: block group of interest - * @bytes: recently freed region size after coalescing +/* + * Update a block group's filters. + * + * @block_group: block group of interest + * @bytes: recently freed region size after coalescing * * Async discard maintains multiple lists with progressively smaller filters * to prioritize discarding based on size. Should a free space that matches @@ -271,8 +274,9 @@ void btrfs_discard_check_filter(struct btrfs_block_group *block_group, } } -/** - * btrfs_update_discard_index - moves a block group along the discard lists +/* + * Move a block group along the discard lists. + * * @discard_ctl: discard control * @block_group: block_group of interest * @@ -291,13 +295,14 @@ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, add_to_discard_list(discard_ctl, block_group); } -/** - * btrfs_discard_cancel_work - remove a block_group from the discard lists +/* + * Remove a block_group from the discard lists. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This removes @block_group from the discard lists. If necessary, it waits on - * the current work and then reschedules the delayed work. + * Remove @block_group from the discard lists. 
If necessary, wait on the + * current work and then reschedule the delayed work. */ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -308,12 +313,13 @@ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_queue_work - handles queuing the block_groups +/* + * Handles queuing the block_groups. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This maintains the LRU order of the discard lists. + * Maintain the LRU order of the discard lists. */ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -383,7 +389,8 @@ static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, } /* - * btrfs_discard_schedule_work - responsible for scheduling the discard work + * Responsible for scheduling the discard work. + * * @discard_ctl: discard control * @override: override the current timer * @@ -401,15 +408,16 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, spin_unlock(&discard_ctl->lock); } -/** - * btrfs_finish_discard_pass - determine next step of a block_group +/* + * Determine next step of a block_group. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This determines the next step for a block group after it's finished going - * through a pass on a discard list. If it is unused and fully trimmed, we can - * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto - * the appropriate filter list or let it fall off. + * Determine the next step for a block group after it's finished going through + * a pass on a discard list. If it is unused and fully trimmed, we can mark it + * unused and send it to the unused_bgs path. Otherwise, pass it onto the + * appropriate filter list or let it fall off. */ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -426,12 +434,13 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_workfn - discard work function +/* + * Discard work queue callback + * * @work: work * - * This finds the next block_group to start discarding and then discards a - * single region. It does this in a two-pass fashion: first extents and second + * Find the next block_group to start discarding and then discard a single + * region. It does this in a two-pass fashion: first extents and second * bitmaps. Completely discarded block groups are sent to the unused_bgs path. */ static void btrfs_discard_workfn(struct work_struct *work) @@ -507,11 +516,12 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_run_discard_work - determines if async discard should be running +/* + * Determine if async discard should be running. + * * @discard_ctl: discard control * - * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. */ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) { @@ -523,8 +533,9 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); } -/** - * btrfs_discard_calc_delay - recalculate the base delay +/* + * Recalculate the base delay. 
+ * * @discard_ctl: discard control * * Recalculate the base delay which is based off the total number of @@ -545,7 +556,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_lock(&discard_ctl->lock); /* - * The following is to fix a potential -1 discrepenancy that we're not + * The following is to fix a potential -1 discrepancy that we're not * sure how to reproduce. But given that this is the only place that * utilizes these numbers and this is only called from * btrfs_finish_extent_commit() which is synchronized, we can correct @@ -578,13 +589,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_discard_update_discardable - propagate discard counters +/* + * Propagate discard counters. + * * @block_group: block_group of interest * - * This propagates deltas of counters up to the discard_ctl. It maintains a - * current counter and a previous counter passing the delta up to the global - * stat. Then the current counter value becomes the previous counter value. + * Propagate deltas of counters up to the discard_ctl. It maintains a current + * counter and a previous counter passing the delta up to the global stat. + * Then the current counter value becomes the previous counter value. */ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { @@ -619,8 +631,9 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) } } -/** - * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists +/* + * Punt unused_bgs list to discard lists. + * * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the @@ -644,8 +657,9 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->unused_bgs_lock); } -/** - * btrfs_discard_purge_list - purge discard lists +/* + * Purge discard lists. + * * @discard_ctl: discard control * * If we are disabling async discard, we may have intercepted block groups that diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2633137c3e9f..0888d484df80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -23,7 +23,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "locking.h" #include "tree-log.h" @@ -43,6 +43,15 @@ #include "space-info.h" #include "zoned.h" #include "subpage.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "uuid-tree.h" +#include "relocation.h" +#include "scrub.h" +#include "super.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -75,12 +84,12 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) * just before they are sent down the IO stack.
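 * * (Editor's note: with this series the async helper no longer carries a * function pointer; run_one_async_start() below dispatches on the enum * btrfs_wq_submit_cmd stored in the work item, and run_one_async_done() * submits the finished bio.)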
*/ struct async_submit_bio { - struct inode *inode; + struct btrfs_inode *inode; struct bio *bio; - extent_submit_bio_start_t *submit_bio_start; + enum btrfs_wq_submit_cmd submit_cmd; int mirror_num; - /* Optional parameter for submit_bio_start used by direct io */ + /* Optional parameter used by direct io */ u64 dio_file_offset; struct btrfs_work work; blk_status_t status; @@ -131,8 +140,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; @@ -145,8 +153,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 1; clear_extent_buffer_uptodate(eb); out: - unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); return ret; } @@ -167,11 +175,9 @@ static bool btrfs_supported_super_csum(u16 csum_type) * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. */ -static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, - char *raw_disk_sb) +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb) { - struct btrfs_super_block *disk_sb = - (struct btrfs_super_block *)raw_disk_sb; char result[BTRFS_CSUM_SIZE]; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -182,7 +188,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ - crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) @@ -249,40 +255,54 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, return ret; } +static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + u64 start = eb->start; + int i, num_pages = num_extent_pages(eb); + int ret = 0; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, + start, p, start - page_offset(p), mirror_num); + if (ret) + break; + start += PAGE_SIZE; + } + + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. * - * @parent_transid: expected transid, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key of first slot, skip check if NULL + * @check: expected tree parentness check, see the comments of the + * structure for details.
*/ int btrfs_read_extent_buffer(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *io_tree; int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; int failed_mirror = 0; - io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + ASSERT(check); + while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); - if (!ret) { - if (verify_parent_transid(io_tree, eb, - parent_transid, 0)) - ret = -EIO; - else if (btrfs_verify_level_key(eb, level, - first_key, parent_transid)) - ret = -EUCLEAN; - else - break; - } + ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + if (!ret) + break; num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); @@ -458,7 +478,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb) +static int validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -468,6 +489,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) const u8 *header_csum; int ret = 0; + ASSERT(check); + found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, @@ -506,6 +529,45 @@ static int validate_extent_buffer(struct extent_buffer *eb) goto out; } + if (found_level != check->level) { + ret = -EIO; + goto out; + } + if (unlikely(check->transid && + btrfs_header_generation(eb) != check->transid)) { + btrfs_err_rl(eb->fs_info, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, check->transid, + btrfs_header_generation(eb)); + ret = -EIO; + goto out; + } + if (check->has_first_key) { + struct btrfs_key *expect_key = &check->first_key; + struct btrfs_key found_key; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) { + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, check->transid, + expect_key->objectid, + expect_key->type, expect_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + ret = -EUCLEAN; + goto out; + } + } + if (check->owner_root) { + ret = btrfs_check_eb_owner(eb, check->owner_root); + if (ret < 0) + goto out; + } + /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just @@ -530,13 +592,15 @@ out: } static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror) + int mirror, struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct extent_buffer *eb; bool reads_done; int ret = 0; + ASSERT(check); + /* * We don't allow bio merge for subpage metadata read, so we should * only get one eb for each endio hook. 
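Editor's sketch (not part of the patch): how a caller is expected to fill the consolidated btrfs_tree_parent_check before reading a tree block. Field names follow the hunks above; fs_info, bytenr, level, parent_transid, root_objectid and first_key stand for values the caller already has.

	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	check.level = level;			/* mandatory level check */
	check.transid = parent_transid;		/* 0 skips the transid check */
	check.owner_root = root_objectid;	/* 0 skips the owner check */
	if (first_key) {			/* NULL used to mean "skip" */
		check.has_first_key = true;
		check.first_key = *first_key;
	}
	eb = read_tree_block(fs_info, bytenr, &check);
	if (IS_ERR(eb))
		return PTR_ERR(eb);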
@@ -560,7 +624,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, check); if (ret < 0) goto err; @@ -590,7 +654,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror); + return validate_subpage_buffer(page, start, end, mirror, + &bbio->parent_check); eb = (struct extent_buffer *)page->private; @@ -609,7 +674,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, &bbio->parent_check); err: if (ret) { /* @@ -631,8 +696,18 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, - async->dio_file_offset); + switch (async->submit_cmd) { + case WQ_SUBMIT_METADATA: + ret = btree_submit_bio_start(async->bio); + break; + case WQ_SUBMIT_DATA: + ret = btrfs_submit_bio_start(async->inode, async->bio); + break; + case WQ_SUBMIT_DATA_DIO: + ret = btrfs_submit_bio_start_direct_io(async->inode, + async->bio, async->dio_file_offset); + break; + } if (ret) async->status = ret; } @@ -647,16 +722,14 @@ static void run_one_async_start(struct btrfs_work *work) */ static void run_one_async_done(struct btrfs_work *work) { - struct async_submit_bio *async; - struct inode *inode; - - async = container_of(work, struct async_submit_bio, work); - inode = async->inode; + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct btrfs_inode *inode = async->inode; + struct btrfs_bio *bbio = btrfs_bio(async->bio); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { - async->bio->bi_status = async->status; - bio_endio(async->bio); + btrfs_bio_end_io(bbio, async->status); return; } @@ -666,7 +739,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. 
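 * (Editor's note: REQ_CGROUP_PUNT roughly means the block layer punts the * actual submission to a per-cgroup helper, so the IO is issued from, and * charged to, the owning cgroup rather than the checksumming worker.)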
*/ async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); + btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -684,11 +757,10 @@ static void run_one_async_free(struct btrfs_work *work) * - true if the work has been successfully submitted * - false in case of error */ -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); @@ -698,7 +770,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; - async->submit_bio_start = submit_bio_start; + async->submit_cmd = cmd; btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); @@ -732,8 +804,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btree_submit_bio_start(struct bio *bio) { /* * when we're called for a write, we're already in the async @@ -754,12 +825,14 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t ret; bio->bi_opf |= REQ_META; + bbio->is_metadata = 1; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { btrfs_submit_bio(fs_info, bio, mirror_num); @@ -770,14 +843,13 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * Kthread helpers are used to submit writes so that checksumming can * happen in parallel across all CPUs. */ - if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + if (should_async_write(fs_info, inode) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(bbio, ret); return; } @@ -924,28 +996,28 @@ struct extent_buffer *btrfs_find_create_tree_block( * Read tree block at logical address @bytenr and do various basic but critical * verification. * - * @owner_root: the objectid of the root owner for this block. - * @parent_transid: expected transid of this tree block, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key in slot 0, skip check if NULL + * @check: expected tree parentness check, see comments of the + * structure for details.
*/ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); + ASSERT(check); + + buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root, + check->level); if (IS_ERR(buf)) return buf; - ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); + ret = btrfs_read_extent_buffer(buf, check); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, owner_root)) { + if (btrfs_check_eb_owner(buf, check->owner_root)) { free_extent_buffer_stale(buf); return ERR_PTR(-EUCLEAN); } @@ -1032,9 +1104,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); + IO_TREE_ROOT_DIRTY_LOG_PAGES); extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE, NULL); + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -1172,6 +1244,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; + return btrfs_extent_root(fs_info, 0); +} + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -1202,7 +1281,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; - goto fail_unlock; + goto fail; } root->node = leaf; @@ -1237,9 +1316,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; -fail_unlock: - if (leaf) - btrfs_tree_unlock(leaf); fail: btrfs_put_root(root); @@ -1357,6 +1433,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_key *key) { struct btrfs_root *root; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_fs_info *fs_info = tree_root->fs_info; u64 generation; int ret; @@ -1376,9 +1453,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); - root->node = read_tree_block(fs_info, - btrfs_root_bytenr(&root->root_item), - key->objectid, generation, level, NULL); + check.level = level; + check.transid = generation; + check.owner_root = key->objectid; + root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), + &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -1524,6 +1603,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, if (objectid == BTRFS_UUID_TREE_OBJECTID) return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) + return btrfs_grab_root(fs_info->block_group_root) ? 
+ fs_info->block_group_root : ERR_PTR(-ENOENT); if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { struct btrfs_root *root = btrfs_global_root(fs_info, &key); @@ -1980,14 +2062,7 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { - btrfs_set_backup_block_group_root(root_backup, - info->block_group_root->node->start); - btrfs_set_backup_block_group_root_gen(root_backup, - btrfs_header_generation(info->block_group_root->node)); - btrfs_set_backup_block_group_root_level(root_backup, - btrfs_header_level(info->block_group_root->node)); - } else { + if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) { struct btrfs_root *extent_root = btrfs_extent_root(info, 0); struct btrfs_root *csum_root = btrfs_csum_root(info, 0); @@ -2093,8 +2168,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); - if (fs_info->endio_raid56_workers) - destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); if (fs_info->compressed_write_workers) @@ -2225,6 +2298,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) { struct inode *inode = fs_info->btree_inode; + unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, + fs_info->tree_root); inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; set_nlink(inode, 1); @@ -2238,8 +2313,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO, inode); - BTRFS_I(inode)->io_tree.track_uptodate = false; + IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); @@ -2247,7 +2321,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->location.type = 0; BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); - btrfs_insert_inode_hash(inode); + __insert_inode_hash(inode, hash); } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) @@ -2266,6 +2340,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; + fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2298,8 +2373,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = alloc_workqueue("btrfs-endio-meta", flags, max_active); - fs_info->endio_raid56_workers = - alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, @@ -2321,7 +2394,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && 
fs_info->caching_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->qgroup_rescan_workers && @@ -2357,6 +2430,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { int ret; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); @@ -2372,10 +2446,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (!log_tree_root) return -ENOMEM; - log_tree_root->node = read_tree_block(fs_info, bytenr, - BTRFS_TREE_LOG_OBJECTID, - fs_info->generation + 1, level, - NULL); + check.level = level; + check.transid = fs_info->generation + 1; + check.owner_root = BTRFS_TREE_LOG_OBJECTID; + log_tree_root->node = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2529,10 +2603,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (ret) return ret; - location.objectid = BTRFS_DEV_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->block_group_root = root; + } + } + + location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { @@ -2544,7 +2632,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->dev_root = root; } /* Initialize fs_info for all devices in any case */ - btrfs_init_devices_late(fs_info); + ret = btrfs_init_devices_late(fs_info); + if (ret) + goto out; /* * This tree can share blocks with some other fs tree during relocation @@ -2600,8 +2690,8 @@ out: * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ -static int validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num) +int btrfs_validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); @@ -2703,6 +2793,18 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + /* + * Artificial requirement for block-group-tree to force newer features + * (free-space-tree, no-holes) so the test matrix is smaller. 
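+ * + * For example, a superblock with the block-group-tree compat_ro bit set + * but missing free-space-tree or no-holes fails the check below and + * btrfs_validate_super() returns -EINVAL.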
+ */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES))) { + btrfs_err(fs_info, + "block-group-tree feature requires free-space-tree and no-holes"); + ret = -EINVAL; + } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, @@ -2785,7 +2887,7 @@ static int validate_super(struct btrfs_fs_info *fs_info, */ static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) { - return validate_super(fs_info, fs_info->super_copy, 0); + return btrfs_validate_super(fs_info, fs_info->super_copy, 0); } /* @@ -2799,7 +2901,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, { int ret; - ret = validate_super(fs_info, sb, -1); + ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { @@ -2825,10 +2927,14 @@ out: static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen, + .owner_root = root->root_key.objectid + }; int ret = 0; - root->node = read_tree_block(root->fs_info, bytenr, - root->root_key.objectid, gen, level, NULL); + root->node = read_tree_block(root->fs_info, bytenr, &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -2860,17 +2966,7 @@ static int load_important_roots(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "couldn't read tree root"); return ret; } - - if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) - return 0; - - bytenr = btrfs_super_block_group_root(sb); - gen = btrfs_super_block_group_root_generation(sb); - level = btrfs_super_block_group_root_level(sb); - ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); - if (ret) - btrfs_warn(fs_info, "couldn't read block group root"); - return ret; + return 0; } static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) @@ -2882,16 +2978,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int ret = 0; int i; - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - struct btrfs_root *root; - - root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, - GFP_KERNEL); - if (!root) - return -ENOMEM; - fs_info->block_group_root = root; - } - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) @@ -2990,6 +3076,19 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); + btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); + btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start, + BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, + BTRFS_LOCKDEP_TRANS_UNBLOCKED); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed, + BTRFS_LOCKDEP_TRANS_COMPLETED); + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -3043,7 +3142,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS, NULL); + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -3279,6 +3378,112 @@ out: return ret; } +/* + * Do various sanity and dependency checks of different features. + * + * This is the place for less strict checks (like for subpage or artificial + * feature dependencies). + * + * For strict checks or possible corruption detection, see + * btrfs_validate_super(). + * + * This should be called after btrfs_parse_options(), as some mount options + * (space cache related) can modify on-disk format like free space tree and + * screw up certain feature dependencies. + */ +int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 incompat = btrfs_super_incompat_flags(disk_super); + const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super); + const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP); + + if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + btrfs_err(fs_info, + "cannot mount because of unknown incompat features (0x%llx)", + incompat); + return -EINVAL; + } + + /* Runtime limitation for mixed block groups. */ + if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (fs_info->sectorsize != fs_info->nodesize)) { + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + fs_info->nodesize, fs_info->sectorsize); + return -EINVAL; + } + + /* Mixed backref is an always-enabled feature. */ + incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + + /* Set compression related flags just in case. */ + if (fs_info->compress_type == BTRFS_COMPRESS_LZO) + incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) + incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; + + /* + * An ancient flag, which should really be marked deprecated. + * Such a runtime limitation doesn't really need an incompat flag. + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) + incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + + if (compat_ro_unsupp && !sb_rdonly(sb)) { + btrfs_err(fs_info, + "cannot mount read-write because of unknown compat_ro features (0x%llx)", + compat_ro); + return -EINVAL; + } + + /* + * If we have unsupported RO compat features, then even though we are + * RO mounted we should not cause any metadata writes, including log + * replay. Otherwise we could screw up whatever the new feature + * requires. + */ + if (compat_ro_unsupp && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + compat_ro); + return -EINVAL; + } + + /* + * Artificial limitations for block group tree, to force + * block-group-tree to rely on no-holes and free-space-tree. + */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) { + btrfs_err(fs_info, +"block-group-tree feature requires no-holes and free-space-tree features"); + return -EINVAL; + } + + /* + * Subpage runtime limitation on v1 cache. + * + * V1 space cache still has some hard coded PAGE_SIZE usage, and since + * we already default to the v2 cache there is no need to bother with + * v1 as it's going to be deprecated anyway.
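+ * + * For example, a 4K sectorsize filesystem mounted with space_cache=v1 on + * a 64K page size machine is rejected by the check below, while the v2 + * cache (the free space tree) is not affected by it.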
+ */ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } + + /* This can be called by remount, we need to protect the super block. */ + spin_lock(&fs_info->super_lock); + btrfs_set_super_incompat_flags(disk_super, incompat); + spin_unlock(&fs_info->super_lock); + + return 0; +} + int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { @@ -3359,7 +3564,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ - if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { + if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; btrfs_release_disk_super(disk_super); @@ -3428,72 +3633,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - features = btrfs_super_incompat_flags(disk_super) & - ~BTRFS_FEATURE_INCOMPAT_SUPP; - if (features) { - btrfs_err(fs_info, - "cannot mount because of unsupported optional features (0x%llx)", - features); - err = -EINVAL; - goto fail_alloc; - } - - features = btrfs_super_incompat_flags(disk_super); - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - if (fs_info->compress_type == BTRFS_COMPRESS_LZO) - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; - else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; - - /* - * Flag our filesystem as having big metadata blocks if they are bigger - * than the page size. - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - - /* - * mixed block groups end up with duplicate but slightly offset - * extent buffers for the same range. It leads to corruptions - */ - if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (sectorsize != nodesize)) { - btrfs_err(fs_info, -"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", - nodesize, sectorsize); - goto fail_alloc; - } - - /* - * Needn't use the lock because there is no other task which will - * update the flag. - */ - btrfs_set_super_incompat_flags(disk_super, features); - - features = btrfs_super_compat_ro_flags(disk_super) & - ~BTRFS_FEATURE_COMPAT_RO_SUPP; - if (!sb_rdonly(sb) && features) { - btrfs_err(fs_info, - "cannot mount read-write because of unsupported optional features (0x%llx)", - features); - err = -EINVAL; - goto fail_alloc; - } - /* - * We have unsupported RO compat features, although RO mounted, we - * should not cause any metadata write, including log replay. - * Or we could screw up whatever the new feature requires. 
- */ - if (unlikely(features && btrfs_super_log_root(disk_super) && - !btrfs_test_opt(fs_info, NOLOGREPLAY))) { - btrfs_err(fs_info, -"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", - features); - err = -EINVAL; + ret = btrfs_check_features(fs_info, sb); + if (ret < 0) { + err = ret; goto fail_alloc; } - if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; @@ -3683,10 +3828,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Mount does not set all options immediately, we can do it now and do - * not have to wait for transaction commit + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. */ - btrfs_apply_pending_changes(fs_info); + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); + btrfs_clear_opt(fs_info->mount_opt, NODISCARD); + } #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { @@ -3815,7 +3968,7 @@ static void btrfs_end_super_write(struct bio *bio) if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, "lost page write due to IO error on %s (%d)", - rcu_str_deref(device->name), + btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); ClearPageUptodate(page); SetPageError(page); @@ -3833,7 +3986,7 @@ static void btrfs_end_super_write(struct bio *bio) } struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num) + int copy_num, bool drop_cache) { struct btrfs_super_block *super; struct page *page; @@ -3851,6 +4004,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); + if (drop_cache) { + /* This should only be called with the primary sb. */ + ASSERT(copy_num == 0); + + /* + * Drop the page of the primary superblock, so later read will + * always read from the device. 
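+ * + * (Editor's note: a write to the block device from user space, e.g. by a + * mkfs or rescue tool, can leave a stale copy of the superblock in the + * page cache; invalidating the range forces the read_cache_page_gfp() + * call below to fetch it from the device.)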
+ */ + invalidate_inode_pages2_range(mapping, + bytenr >> PAGE_SHIFT, + (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); + } + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page)) return ERR_CAST(page); @@ -3882,7 +4048,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - super = btrfs_read_dev_one_super(bdev, i); + super = btrfs_read_dev_one_super(bdev, i, false); if (IS_ERR(super)) continue; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 47ad8e0a2d33..363935cfc084 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -27,14 +27,14 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_tree_parent_check; void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key); + struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, @@ -42,14 +42,19 @@ struct extent_buffer *btrfs_find_create_tree_block( void btrfs_clean_tree_block(struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); void __cold close_ctree(struct btrfs_fs_info *fs_info); +int btrfs_validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num); +int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num); + int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_key *key); @@ -70,6 +75,7 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); @@ -80,7 +86,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -101,24 +107,22 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) 
return NULL; } -static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, - int level, struct btrfs_key *first_key); -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num); +int btrfs_read_extent_buffer(struct extent_buffer *buf, + struct btrfs_tree_parent_check *check); + +enum btrfs_wq_submit_cmd { + WQ_SUBMIT_METADATA, + WQ_SUBMIT_DATA, + WQ_SUBMIT_DATA_DIO, +}; + +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); +blk_status_t btree_submit_bio_start(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -131,8 +135,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); -int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1d4c2397d0d6..744a02b7fd67 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -7,6 +7,8 @@ #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" +#include "accessors.h" +#include "super.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) @@ -57,9 +59,20 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } +/* + * Read dentry of inode with @objectid from filesystem root @root_objectid. + * + * @sb: the filesystem super block + * @objectid: inode objectid + * @root_objectid: object id of the subvolume root where to look up the inode + * @generation: optional, if not zero, verify that the found inode + * generation matches + * + * Return dentry alias for the inode, otherwise an error. In case the + * generation does not match return ESTALE. 
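+ * + * For example, btrfs_fh_to_dentry() and btrfs_fh_to_parent() below pass + * the generation stored in the file handle, while btrfs_get_parent() + * passes 0 to skip the check.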
+ */ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, - int check_generation) + u64 root_objectid, u64 generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; @@ -77,7 +90,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (check_generation && generation != inode->i_generation) { + if (generation != 0 && generation != inode->i_generation) { iput(inode); return ERR_PTR(-ESTALE); } @@ -106,7 +119,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, objectid = fid->parent_objectid; generation = fid->parent_gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, @@ -128,7 +141,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, root_objectid = fid->root_objectid; generation = fid->gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } struct dentry *btrfs_get_parent(struct dentry *child) @@ -188,7 +201,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { return btrfs_get_dentry(fs_info->sb, key.objectid, - found_key.offset, 0, 0); + found_key.offset, 0); } return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index f32f4113c976..eba6bc4f5a61 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,8 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, - int check_generation); + u64 root_objectid, u64 generation); struct dentry *btrfs_get_parent(struct dentry *child); #endif diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c new file mode 100644 index 000000000000..9ae9cd1e7035 --- /dev/null +++ b/fs/btrfs/extent-io-tree.c @@ -0,0 +1,1766 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/slab.h> +#include <trace/events/btrfs.h> +#include "messages.h" +#include "ctree.h" +#include "extent-io-tree.h" +#include "btrfs_inode.h" +#include "misc.h" + +static struct kmem_cache *extent_state_cache; + +static inline bool extent_state_in_tree(const struct extent_state *state) +{ + return !RB_EMPTY_NODE(&state->rb_node); +} + +#ifdef CONFIG_BTRFS_DEBUG +static LIST_HEAD(states); +static DEFINE_SPINLOCK(leak_lock); + +static inline void btrfs_leak_debug_add_state(struct extent_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline void btrfs_leak_debug_del_state(struct extent_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline void btrfs_extent_state_leak_debug_check(void) +{ + struct extent_state *state; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", + state->start, state->end, state->state, + extent_state_in_tree(state), + refcount_read(&state->refs)); + list_del(&state->leak_list); + 
kmem_cache_free(extent_state_cache, state); + } +} + +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct btrfs_inode *inode = tree->inode; + u64 isize; + + if (!inode) + return; + + isize = i_size_read(&inode->vfs_inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(inode->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(inode), isize, start, end); + } +} +#else +#define btrfs_leak_debug_add_state(state) do {} while (0) +#define btrfs_leak_debug_del_state(state) do {} while (0) +#define btrfs_extent_state_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#endif + +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because, for the io_tree, + * we hold the tree lock and then take the inode lock when setting delalloc. + * These two things are unrelated, so make a class for the file_extent_tree so + * we don't get the two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner) +{ + tree->fs_info = fs_info; + tree->state = RB_ROOT; + spin_lock_init(&tree->lock); + tree->inode = NULL; + tree->owner = owner; + if (owner == IO_TREE_INODE_FILE_EXTENT) + lockdep_set_class(&tree->lock, &file_extent_tree_class); +} + +void extent_io_tree_release(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here; the state + * of the waitqueue should not change once extent_io_tree_release is + * called. + */ + smp_mb(); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever.
+ */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + + cond_resched_lock(&tree->lock); + } + spin_unlock(&tree->lock); +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; + + /* + * The given mask might not be appropriate for the slab allocator, + * drop the unsupported bits + */ + mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + RB_CLEAR_NODE(&state->rb_node); + btrfs_leak_debug_add_state(state); + refcount_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); + return state; +} + +static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc) +{ + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + + return prealloc; +} + +void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (refcount_dec_and_test(&state->refs)) { + WARN_ON(extent_state_in_tree(state)); + btrfs_leak_debug_del_state(state); + trace_free_extent_state(state, _RET_IP_); + kmem_cache_free(extent_state_cache, state); + } +} + +static int add_extent_changeset(struct extent_state *state, u32 bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return 0; + if (set && (state->state & bits) == bits) + return 0; + if (!set && (state->state & bits) == 0) + return 0; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(&changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + return ret; +} + +static inline struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +static inline struct extent_state *prev_state(struct extent_state *state) +{ + struct rb_node *next = rb_prev(&state->rb_node); + + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +/* + * Search @tree for an entry that contains @offset. Such an entry would have + * entry->start <= offset && entry->end >= offset. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @node_ret: pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret: points to entry which would have been the parent of the entry, + * containing @offset + * + * Return a pointer to the entry that contains the @offset byte address, + * without changing @node_ret and @parent_ret. + * + * If no such entry exists, fill @node_ret and @parent_ret with the insertion + * point and return a pointer to the first entry that ends after @offset (the + * next entry), or NULL if there is none.
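+ * + * For example, with entries [0, 4095] and [8192, 12287] in the tree, an + * @offset of 6000 is contained in neither; the insertion point between the + * two is stored in @node_ret/@parent_ret and the [8192, 12287] entry, the + * first one ending after 6000, is returned.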
+ */ +static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***node_ret, + struct rb_node **parent_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct rb_node *prev = NULL; + struct extent_state *entry = NULL; + + while (*node) { + prev = *node; + entry = rb_entry(prev, struct extent_state, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return entry; + } + + if (node_ret) + *node_ret = node; + if (parent_ret) + *parent_ret = prev; + + /* Search neighbors until we find the first one past the end */ + while (entry && offset > entry->end) + entry = next_state(entry); + + return entry; +} + +/* + * Search offset in the tree or fill neighbor rbtree node pointers. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * + * Return a pointer to the entry that contains @offset byte address. If no + * such entry exists, then return NULL and fill @prev_ret and @next_ret. + * Otherwise return the found entry and other pointers are left untouched. + */ +static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree, + u64 offset, + struct extent_state **prev_ret, + struct extent_state **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct extent_state *orig_prev; + struct extent_state *entry = NULL; + + ASSERT(prev_ret); + ASSERT(next_ret); + + while (*node) { + entry = rb_entry(*node, struct extent_state, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return entry; + } + + orig_prev = entry; + while (entry && offset > entry->end) + entry = next_state(entry); + *next_ret = entry; + entry = orig_prev; + + while (entry && offset < entry->start) + entry = prev_state(entry); + *prev_ret = entry; + + return NULL; +} + +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree->fs_info, err, + "locking error: extent tree was modified by another thread while locked"); +} + +/* + * Utility function to look for merge candidates inside a given range. Any + * extents with matching state are merged together into a single extent in the + * tree. Extents with EXTENT_IO in their state field are not merged because + * the end_io handlers need to be able to do operations on them without + * sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. 
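+ * + * For example, adjacent entries [0, 4095] and [4096, 8191] with identical + * state bits are collapsed into a single [0, 8191] entry; when the tree + * belongs to an inode, btrfs_merge_delalloc_extent() is called first so the + * delalloc accounting stays correct.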
+ */ +static void merge_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *other; + + if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + return; + + other = prev_state(state); + if (other && other->end == state->start - 1 && + other->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, other); + state->start = other->start; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } + other = next_state(state); + if (other && other->start == state->end + 1 && + other->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, other); + state->end = other->end; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) +{ + u32 bits_to_set = bits & ~EXTENT_CTLBITS; + int ret; + + if (tree->inode) + btrfs_set_delalloc_extent(tree->inode, state, bits); + + ret = add_extent_changeset(state, bits_to_set, changeset, 1); + BUG_ON(ret < 0); + state->state |= bits_to_set; +} + +/* + * Insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) +{ + struct rb_node **node; + struct rb_node *parent = NULL; + const u64 end = state->end; + + set_state_bits(tree, state, bits, changeset); + + node = &tree->state.rb_node; + while (*node) { + struct extent_state *entry; + + parent = *node; + entry = rb_entry(parent, struct extent_state, rb_node); + + if (end < entry->start) { + node = &(*node)->rb_left; + } else if (end > entry->end) { + node = &(*node)->rb_right; + } else { + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", + entry->start, entry->end, state->start, end); + return -EEXIST; + } + } + + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + + merge_state(tree, state); + return 0; +} + +/* + * Insert state to @tree to the location given by @node and @parent. + */ +static void insert_state_fast(struct extent_io_tree *tree, + struct extent_state *state, struct rb_node **node, + struct rb_node *parent, unsigned bits, + struct extent_changeset *changeset) +{ + set_state_bits(tree, state, bits, changeset); + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); +} + +/* + * Split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. 
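+ * + * For example, splitting [0, 8191] at @split == 4096 leaves 'prealloc' as + * [0, 4095] and 'orig' as [4096, 8191], with both linked into the tree.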
+ */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *parent = NULL; + struct rb_node **node; + + if (tree->inode) + btrfs_split_delalloc_extent(tree->inode, orig, split); + + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + parent = &orig->rb_node; + node = &parent; + while (*node) { + struct extent_state *entry; + + parent = *node; + entry = rb_entry(parent, struct extent_state, rb_node); + + if (prealloc->end < entry->start) { + node = &(*node)->rb_left; + } else if (prealloc->end > entry->end) { + node = &(*node)->rb_right; + } else { + free_extent_state(prealloc); + return -EEXIST; + } + } + + rb_link_node(&prealloc->rb_node, parent, node); + rb_insert_color(&prealloc->rb_node, &tree->state); + + return 0; +} + +/* + * Utility function to clear some bits in an extent state struct. It will + * optionally wake up anyone waiting on this state (wake == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree. + */ +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, int wake, + struct extent_changeset *changeset) +{ + struct extent_state *next; + u32 bits_to_clear = bits & ~EXTENT_CTLBITS; + int ret; + + if (tree->inode) + btrfs_clear_delalloc_extent(tree->inode, state, bits); + + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); + BUG_ON(ret < 0); + state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (state->state == 0) { + next = next_state(state); + if (extent_state_in_tree(state)) { + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + next = next_state(state); + } + return next; +} + +/* + * Clear some bits on a range in the tree. This may require splitting or + * inserting elements in the tree, so the gfp mask is used to indicate which + * allocations or sleeping are allowed. + * + * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given + * range from the tree regardless of state (ie for truncate). + * + * The range [start, end] is inclusive. + * + * This takes the tree lock, and returns 0 on success and < 0 on error. + */ +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) +{ + struct extent_state *state; + struct extent_state *cached; + struct extent_state *prealloc = NULL; + u64 last_end; + int err; + int clear = 0; + int wake; + int delete = (bits & EXTENT_CLEAR_ALL_BITS); + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); + + if (delete) + bits |= ~EXTENT_CTLBITS; + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + + wake = (bits & EXTENT_LOCKED) ? 1 : 0; + if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + clear = 1; +again: + if (!prealloc) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover any other range. + * If we end up needing a new extent state we allocate it later. 
+ */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + + if (cached && extent_state_in_tree(cached) && + cached->start <= start && cached->end > start) { + if (clear) + refcount_dec(&cached->refs); + state = cached; + goto hit_next; + } + if (clear) + free_extent_state(cached); + } + + /* This search will find the extents that end after our range starts. */ + state = tree_search(tree, start); + if (!state) + goto out; +hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); + last_end = state->end; + + /* The state doesn't have the wanted bits, go ahead. */ + if (!(state->state & bits)) { + state = next_state(state); + goto next; + } + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we clear the desired bit + * on it. + */ + + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + state = clear_state_bit(tree, state, bits, wake, changeset); + goto next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit on the first half. + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + if (wake) + wake_up(&state->wq); + + clear_state_bit(tree, prealloc, bits, wake, changeset); + + prealloc = NULL; + goto out; + } + + state = clear_state_bit(tree, state, bits, wake, changeset); +next: + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && state && !need_resched()) + goto hit_next; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + +} + +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); +} + +/* + * Wait for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + + btrfs_debug_check_extent_io_range(tree, start, end); + + spin_lock(&tree->lock); +again: + /* + * Maintain cached_state, as we may not remove it from the tree if there + * are more bits than the bits we're waiting on set on this state. 
+ */ + if (cached_state && *cached_state) { + state = *cached_state; + if (extent_state_in_tree(state) && + state->start <= start && start < state->end) + goto process_node; + } + while (1) { + /* + * This search will find all the extents that end after our + * range starts. + */ + state = tree_search(tree, start); +process_node: + if (!state) + break; + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + refcount_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (!cond_resched_lock(&tree->lock)) { + state = next_state(state); + goto process_node; + } + } +out: + /* This state is no longer useful, clear it and free it up. */ + if (cached_state && *cached_state) { + state = *cached_state; + *cached_state = NULL; + free_extent_state(state); + } + spin_unlock(&tree->lock); +} + +static void cache_state_if_flags(struct extent_state *state, + struct extent_state **cached_ptr, + unsigned flags) +{ + if (cached_ptr && !(*cached_ptr)) { + if (!flags || (state->state & flags)) { + *cached_ptr = state; + refcount_inc(&state->refs); + } + } +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_LOCKED | EXTENT_BOUNDARY); +} + +/* + * Find the first state struct with 'bits' set after 'start', and return it. + * tree->lock must be held. NULL will be returned if nothing was found after + * 'start'. + */ +static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, u32 bits) +{ + struct extent_state *state; + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, start); + while (state) { + if (state->end >= start && (state->state & bits)) + return state; + state = next_state(state); + } + return NULL; +} + +/* + * Find the first offset in the io tree with one or more @bits set. + * + * Note: If there are multiple bits set in @bits, any of them will match. + * + * Return 0 if we find something, and update @start_ret and @end_ret. + * Return 1 if we found nothing. + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && extent_state_in_tree(state)) { + while ((state = next_state(state)) != NULL) { + if (state->state & bits) + goto got_it; + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + + state = find_first_extent_bit_state(tree, start, bits); +got_it: + if (state) { + cache_state_if_flags(state, cached_state, 0); + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * Find a contiguous area of bits + * + * @tree: io tree to check + * @start: offset to start the search from + * @start_ret: the first offset we found with the bits set + * @end_ret: the final contiguous range of the bits that were set + * @bits: bits to look for + * + * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges + * to set bits appropriately, and then merge them again. 
During this time it + * will drop the tree->lock, so use this helper if you want to find the actual + * contiguous area for given bits. We will search to the first bit we find, and + * then walk down the tree until we find a non-contiguous area. The area + * returned will be the full contiguous area with the bits set. + */ +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + while ((state = next_state(state)) != NULL) { + if (state->start > (*end_ret + 1)) + break; + *end_ret = state->end; + } + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * Find a contiguous range of bytes in the file marked as delalloc, not more + * than 'max_bytes'. start and end are used to return the range. + * + * True is returned if we find something, false if nothing was in the tree. + */ +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state) +{ + struct extent_state *state; + u64 cur_start = *start; + bool found = false; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, cur_start); + if (!state) { + *end = (u64)-1; + goto out; + } + + while (state) { + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) { + *start = state->start; + *cached_state = state; + refcount_inc(&state->refs); + } + found = true; + *end = state->end; + cur_start = state->end + 1; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + state = next_state(state); + } +out: + spin_unlock(&tree->lock); + return found; +} + +/* + * Set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. + * + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The extent_state of the + * existing range is returned in failed_state in this case, and the start of the + * existing range is returned in failed_start. failed_state is used as an + * optimization for wait_extent_bit; failed_start must be used as the source of + * truth as failed_state may have changed since we returned. + * + * [start, end] is inclusive. This takes the tree lock. 
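The failed_start contract just described can be modeled with a byte-granular sketch (editorial illustration only; model_set_exclusive() is a made-up name, and unlike this model the real code may already have set bits on an earlier part of the range before hitting the conflict, which is why try_lock_extent() further below backs out [start, failed_start - 1]):

#include <errno.h>
#include <stdio.h>
#include <stddef.h>

/* Try to set an exclusive bit on [start, end]; report the first conflict. */
static int model_set_exclusive(unsigned char *locked, size_t start,
			       size_t end, size_t *failed_start)
{
	for (size_t i = start; i <= end; i++) {
		if (locked[i]) {
			*failed_start = i;
			return -EEXIST;
		}
	}
	for (size_t i = start; i <= end; i++)
		locked[i] = 1;
	return 0;
}

int main(void)
{
	unsigned char locked[16] = { 0 };
	size_t failed;

	locked[8] = 1; /* another holder owns [8, 8] */
	if (model_set_exclusive(locked, 4, 12, &failed) == -EEXIST)
		printf("conflict at %zu\n", failed); /* conflict at 8 */
	return 0;
}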
+ */ +static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u64 *failed_start, + struct extent_state **failed_state, + struct extent_state **cached_state, + struct extent_changeset *changeset, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + u32 exclusive_bits = (bits & EXTENT_LOCKED); + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); + + if (exclusive_bits) + ASSERT(failed_start); + else + ASSERT(failed_start == NULL && failed_state == NULL); +again: + if (!prealloc) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover any other range. + * If we end up needing a new extent state we allocate it later. + */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) + goto hit_next; + } + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search_for_insert(tree, start, &p, &parent); + if (!state) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, changeset); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + if (state->state & exclusive_bits) { + *failed_start = state->start; + cache_state(state, failed_state); + err = -EEXIST; + goto out; + } + + set_state_bits(tree, state, bits, changeset); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we set the desired bit + * on it. + */ + if (state->start < start) { + if (state->state & exclusive_bits) { + *failed_start = start; + cache_state(state, failed_state); + err = -EEXIST; + goto out; + } + + /* + * If this extent already has all the bits we want set, then + * skip it, not necessary to split it or do anything with it. 
+ */ + if ((state->state & bits) == bits) { + start = state->end + 1; + cache_state(state, cached_state); + goto search_again; + } + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits, changeset); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and ignore the + * extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + + /* + * Avoid freeing 'prealloc' if it can be merged with the later + * extent. + */ + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, changeset); + if (err) + extent_io_tree_panic(tree, err); + + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * + * We need to split the extent, and set the bit on the first half. + */ + if (state->start <= end && state->end > end) { + if (state->state & exclusive_bits) { + *failed_start = start; + cache_state(state, failed_state); + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + goto search_again; + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, bits, changeset); + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +} + +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, NULL, NULL, + cached_state, NULL, mask); +} + +/* + * Convert all bits in a given range from one bit to another + * + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie. + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + * + * All allocations are done with GFP_NOFS. 
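A sketch of a typical caller, following the DELALLOC-to-DIRTY example the comment above gives (illustrative only; tree, start and end are assumed to come from the caller's context):

	struct extent_state *cached = NULL;
	int ret;

	/* Set EXTENT_DIRTY and clear EXTENT_DELALLOC over [start, end]. */
	ret = convert_extent_bit(tree, start, end, EXTENT_DIRTY,
				 EXTENT_DELALLOC, &cached);
	free_extent_state(cached);	/* a NULL cached state is handled */
	if (ret < 0)
		return ret;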
+ */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + bool first_iteration = true; + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, + clear_bits); + +again: + if (!prealloc) { + /* + * Best effort, don't worry if extent state allocation fails + * here for the first iteration. We might have a cached state + * that matches exactly the target range, in which case no + * extent state allocations are needed. We'll only know this + * after locking the tree. + */ + prealloc = alloc_extent_state(GFP_NOFS); + if (!prealloc && !first_iteration) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) + goto hit_next; + } + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search_for_insert(tree, start, &p, &parent); + if (!state) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, NULL); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going. + */ + if (state->start == start && state->end <= end) { + set_state_bits(tree, state, bits, NULL); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we set the desired bit + * on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits, NULL); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and ignore the + * extent we found. 
+ */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid freeing 'prealloc' if it can be merged with the later + * extent. + */ + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, NULL); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * + * We need to split the extent, and set the bit on the first half. + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, bits, NULL); + cache_state(prealloc, cached_state); + clear_state_bit(tree, prealloc, clear_bits, 0, NULL); + prealloc = NULL; + goto out; + } + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; +} + +/* + * Find the first range that has @bits not set. This range could start before + * @start. + * + * @tree: the tree to search + * @start: offset at/after which the found extent should start + * @start_ret: records the beginning of the range + * @end_ret: records the end of the range (inclusive) + * @bits: the set of bits which must be unset + * + * Since unallocated range is also considered one which doesn't have the bits + * set, it's possible that @end_ret contains -1; this happens in case the range + * spans (last_range_end, end of device]. In this case it's up to the caller to + * trim @end_ret to the appropriate size. + */ +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) +{ + struct extent_state *state; + struct extent_state *prev = NULL, *next = NULL; + + spin_lock(&tree->lock); + + /* Find first extent with bits cleared */ + while (1) { + state = tree_search_prev_next(tree, start, &prev, &next); + if (!state && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!state && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. 
+ */ + *start_ret = prev->end + 1; + *end_ret = -1; + goto out; + } else if (!state) { + state = next; + } + + /* + * At this point 'state' either contains 'start' or start is + * before 'state'. + */ + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits set--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as the + * beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } + } else { + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) + *start_ret = prev->end + 1; + else + *start_ret = 0; + break; + } + } + + /* + * Find the longest stretch from start until an entry which has the + * bits set + */ + while (state) { + if (state->end >= start && !(state->state & bits)) { + *end_ret = state->end; + } else { + *end_ret = state->start - 1; + break; + } + state = next_state(state); + } +out: + spin_unlock(&tree->lock); +} + +/* + * Count the number of bytes in the tree that have a given bit(s) set for a + * given range. + * + * @tree: The io tree to search. + * @start: The start offset of the range. This value is updated to the + * offset of the first byte found with the given bit(s), so it + * can end up being bigger than the initial value. + * @search_end: The end offset (inclusive value) of the search range. + * @max_bytes: The maximum byte count we are interested in. The search stops + * once it reaches this count. + * @bits: The bits the range must have in order to be accounted for. + * If multiple bits are set, then only subranges that have all + * the bits set are accounted for. + * @contig: Indicate if we should ignore holes in the range or not. If + * this is true, then stop once we find a hole. + * @cached_state: A cached state to be used across multiple calls to this + * function in order to speedup searches. Use NULL if this is + * called only once or if each call does not start where the + * previous one ended. + * + * Returns the total number of bytes found within the given range that have + * all given bits set. If the returned number of bytes is greater than zero + * then @start is updated with the offset of the first byte with the bits set. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + u32 bits, int contig, + struct extent_state **cached_state) +{ + struct extent_state *state = NULL; + struct extent_state *cached; + u64 cur_start = *start; + u64 total_bytes = 0; + u64 last = 0; + int found = 0; + + if (WARN_ON(search_end <= cur_start)) + return 0; + + spin_lock(&tree->lock); + + if (!cached_state || !*cached_state) + goto search; + + cached = *cached_state; + + if (!extent_state_in_tree(cached)) + goto search; + + if (cached->start <= cur_start && cur_start <= cached->end) { + state = cached; + } else if (cached->start > cur_start) { + struct extent_state *prev; + + /* + * The cached state starts after our search range's start. Check + * if the previous state record starts at or before the range we + * are looking for, and if so, use it - this is a common case + * when there are holes between records in the tree. If there is + * no previous state record, we can start from our cached state. 
+ */ + prev = prev_state(cached); + if (!prev) + state = cached; + else if (prev->start <= cur_start && cur_start <= prev->end) + state = prev; + } + + /* + * This search will find all the extents that end after our range + * starts. + */ +search: + if (!state) + state = tree_search(tree, cur_start); + + while (state) { + if (state->start > search_end) + break; + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = max(cur_start, state->start); + found = 1; + } + last = state->end; + } else if (contig && found) { + break; + } + state = next_state(state); + } + + if (cached_state) { + free_extent_state(*cached_state); + *cached_state = state; + if (state) + refcount_inc(&state->refs); + } + + spin_unlock(&tree->lock); + + return total_bytes; +} + +/* + * Search a range in the state tree for a given mask. If 'filled' == 1, this + * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 + * is returned if any bit in the range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, int filled, struct extent_state *cached) +{ + struct extent_state *state = NULL; + int bitset = 0; + + spin_lock(&tree->lock); + if (cached && extent_state_in_tree(cached) && cached->start <= start && + cached->end > start) + state = cached; + else + state = tree_search(tree, start); + while (state && start <= end) { + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + + if (state->end == (u64)-1) + break; + + start = state->end + 1; + if (start > end) + break; + state = next_state(state); + } + + /* We ran out of states and were still inside of our range. */ + if (filled && !state) + bitset = 0; + spin_unlock(&tree->lock); + return bitset; +} + +/* Wrappers around set/clear extent bit */ +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) +{ + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. + */ + ASSERT(!(bits & EXTENT_LOCKED)); + + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, + changeset, GFP_NOFS); +} + +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) +{ + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + ASSERT(!(bits & EXTENT_LOCKED)); + + return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS, + changeset); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + NULL, cached, NULL, GFP_NOFS); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, cached); + return 0; + } + return 1; +} + +/* + * Either insert or lock the state struct between start and end. Use mask to + * tell us if waiting is desired. 
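Taken together with try_lock_extent() above, the typical caller pattern under the reworked locking API looks roughly like this (illustrative sketch; io_tree, start and end are assumed to come from the caller):

	struct extent_state *cached = NULL;

	if (!try_lock_extent(io_tree, start, end, &cached)) {
		/* Contended: block until [start, end] can be locked. */
		lock_extent(io_tree, start, end, &cached);
	}
	/* ... operate on [start, end] while it is exclusively locked ... */
	unlock_extent(io_tree, start, end, &cached);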
+ */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state) +{ + struct extent_state *failed_state = NULL; + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + &failed_state, cached_state, NULL, GFP_NOFS); + while (err == -EEXIST) { + if (failed_start != start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, cached_state); + + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, + &failed_state); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + &failed_start, &failed_state, + cached_state, NULL, GFP_NOFS); + } + return err; +} + +void __cold extent_state_free_cachep(void) +{ + btrfs_extent_state_leak_debug_check(); + kmem_cache_destroy(extent_state_cache); +} + +int __init extent_state_init_cachep(void) +{ + extent_state_cache = kmem_cache_create("btrfs_extent_state", + sizeof(struct extent_state), 0, + SLAB_MEM_SPREAD, NULL); + if (!extent_state_cache) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index c3eb52dbe61c..e3eeec380844 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,42 +3,54 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include "misc.h" + struct extent_changeset; struct io_failure_record; /* Bits for the extent state */ -#define EXTENT_DIRTY (1U << 0) -#define EXTENT_UPTODATE (1U << 1) -#define EXTENT_LOCKED (1U << 2) -#define EXTENT_NEW (1U << 3) -#define EXTENT_DELALLOC (1U << 4) -#define EXTENT_DEFRAG (1U << 5) -#define EXTENT_BOUNDARY (1U << 6) -#define EXTENT_NODATASUM (1U << 7) -#define EXTENT_CLEAR_META_RESV (1U << 8) -#define EXTENT_NEED_WAIT (1U << 9) -#define EXTENT_DAMAGED (1U << 10) -#define EXTENT_NORESERVE (1U << 11) -#define EXTENT_QGROUP_RESERVED (1U << 12) -#define EXTENT_CLEAR_DATA_RESV (1U << 13) -/* - * Must be cleared only during ordered extent completion or on error paths if we - * did not manage to submit bios and create the ordered extents for the range. - * Should not be cleared during page release and page invalidation (if there is - * an ordered extent in flight), that is left for the ordered extent completion. - */ -#define EXTENT_DELALLOC_NEW (1U << 14) -/* - * When an ordered extent successfully completes for a region marked as a new - * delalloc range, use this flag when clearing a new delalloc range to indicate - * that the VFS' inode number of bytes should be incremented and the inode's new - * delalloc bytes decremented, in an atomic way to prevent races with stat(2). - */ -#define EXTENT_ADD_INODE_BYTES (1U << 15) +enum { + ENUM_BIT(EXTENT_DIRTY), + ENUM_BIT(EXTENT_UPTODATE), + ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DELALLOC), + ENUM_BIT(EXTENT_DEFRAG), + ENUM_BIT(EXTENT_BOUNDARY), + ENUM_BIT(EXTENT_NODATASUM), + ENUM_BIT(EXTENT_CLEAR_META_RESV), + ENUM_BIT(EXTENT_NEED_WAIT), + ENUM_BIT(EXTENT_NORESERVE), + ENUM_BIT(EXTENT_QGROUP_RESERVED), + ENUM_BIT(EXTENT_CLEAR_DATA_RESV), + /* + * Must be cleared only during ordered extent completion or on error + * paths if we did not manage to submit bios and create the ordered + * extents for the range. Should not be cleared during page release + * and page invalidation (if there is an ordered extent in flight), + * that is left for the ordered extent completion. 
+ */ + ENUM_BIT(EXTENT_DELALLOC_NEW), + /* + * When an ordered extent successfully completes for a region marked as + * a new delalloc range, use this flag when clearing a new delalloc + * range to indicate that the VFS' inode number of bytes should be + * incremented and the inode's new delalloc bytes decremented, in an + * atomic way to prevent races with stat(2). + */ + ENUM_BIT(EXTENT_ADD_INODE_BYTES), + /* + * Set during truncate when we're clearing an entire range and we just + * want the extent states to go away. + */ + ENUM_BIT(EXTENT_CLEAR_ALL_BITS), +}; + #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \ - EXTENT_ADD_INODE_BYTES) + EXTENT_ADD_INODE_BYTES | \ + EXTENT_CLEAR_ALL_BITS) /* * Redefined bits above which are used only in the device allocation tree, @@ -56,7 +68,6 @@ enum { IO_TREE_FS_EXCLUDED_EXTENTS, IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, - IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, IO_TREE_TRANS_DIRTY_PAGES, IO_TREE_ROOT_DIRTY_LOG_PAGES, @@ -69,9 +80,8 @@ enum { struct extent_io_tree { struct rb_root state; struct btrfs_fs_info *fs_info; - void *private_data; - u64 dirty_bytes; - bool track_uptodate; + /* Inode associated with this tree, or NULL. */ + struct btrfs_inode *inode; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; @@ -89,109 +99,87 @@ struct extent_state { refcount_t refs; u32 state; - struct io_failure_record *failrec; - #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif }; -int __init extent_state_cache_init(void); -void __cold extent_state_cache_exit(void); - void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data); + struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); - -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, NULL); -} +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); -int __init extent_io_init(void); -void __cold extent_io_exit(void); +int __init extent_state_init_cachep(void); +void __cold extent_state_free_cachep(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig); + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached, gfp_t mask, - struct extent_changeset *changeset); + u32 bits, struct extent_state **cached, gfp_t mask, + struct extent_changeset *changeset); -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) 
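The ENUM_BIT() helper used in the new enum comes from misc.h and removes the hand-maintained shift counts: each use burns one enum slot for a bit index and defines the named constant as the next power of two. The idiom can be demonstrated stand-alone (a sketch of the pattern; the macro below is written to match the misc.h shape but may differ in detail, and the DEMO_* names are invented):

#include <stdio.h>

#define ENUM_BIT(name)					\
	__ ## name ## _BIT,				\
	name = (1U << __ ## name ## _BIT),		\
	__ ## name ## _SEQ = __ ## name ## _BIT

enum {
	ENUM_BIT(DEMO_DIRTY),		/* 1U << 0 */
	ENUM_BIT(DEMO_UPTODATE),	/* 1U << 1 */
	ENUM_BIT(DEMO_LOCKED),		/* 1U << 2 */
};

int main(void)
{
	/* Prints: 1 2 4 */
	printf("%u %u %u\n", DEMO_DIRTY, DEMO_UPTODATE, DEMO_LOCKED);
	return 0;
}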
+static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits, + struct extent_state **cached) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); + return __clear_extent_bit(tree, start, end, bits, cached, + GFP_NOFS, NULL); } -static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_NOFS, NULL); -} - -static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, - u64 start, u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_ATOMIC, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, + GFP_NOFS, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); + return clear_extent_bit(tree, start, end, bits, NULL); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, unsigned exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask, - struct extent_changeset *changeset); -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits); + u32 bits, struct extent_state **cached_state, gfp_t mask); + +static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits) +{ + return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT); +} static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - NULL); + return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, + cached_state, GFP_NOFS, NULL); } static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL, - mask, NULL); + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, cached); + EXTENT_DO_ACCOUNTING, cached); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -211,30 +199,22 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, - 0, NULL, cached_state, GFP_NOFS, NULL); + EXTENT_DELALLOC | extra_bits, + cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state 
**cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - 0, NULL, cached_state, GFP_NOFS, NULL); + EXTENT_DELALLOC | EXTENT_DEFRAG, + cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL, - GFP_NOFS, NULL); -} - -static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - cached_state, mask, NULL); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); } int find_first_extent_bit(struct extent_io_tree *tree, u64 start, @@ -244,24 +224,10 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); -int extent_invalidate_folio(struct extent_io_tree *tree, - struct folio *folio, size_t offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); - -/* This should be reworked in the future and put elsewhere. */ -struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start); -int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec); -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, - u64 end); -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec); -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6914cd8024ba..892d78c1853c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -36,6 +36,13 @@ #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "file-item.h" +#include "orphan.h" +#include "tree-checker.h" #undef SCRAMBLE_DELAYED_REFS @@ -2220,6 +2227,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { + if (path->nowait) { + spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); + return -EAGAIN; + } + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); @@ -2686,13 +2699,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len = cache->start + cache->length - start; len = min(len, end + 1 - start); - down_read(&fs_info->commit_root_sem); - if (start < cache->last_byte_to_unpin && return_free_space) { - u64 add_len = min(len, cache->last_byte_to_unpin - start); - - btrfs_add_free_space(cache, start, add_len); - } - up_read(&fs_info->commit_root_sem); + if (return_free_space) + btrfs_add_free_space(cache, start, len); start += len; total_unpinned += len; @@ -3294,21 +3302,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } /* - * If this is a leaf and there are tree mod log users, we may - * have recorded mod log operations that point to this leaf. 
- * So we must make sure no one reuses this leaf's extent before - * mod log operations are applied to a node, otherwise after - * rewinding a node using the mod log operations we get an - * inconsistent btree, as the leaf's extent may now be used as - * a node or leaf for another different btree. + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * * We are safe from races here because at this point no other * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins, it will not be able to - * find a node pointing to this leaf and record operations that - * point to this leaf. + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. */ - if (btrfs_header_level(buf) == 0 && - test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) must_pin = true; if (must_pin || btrfs_is_zoned(fs_info)) { @@ -3804,7 +3813,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro || block_group->zoned_data_reloc_ongoing) { + if (block_group->ro || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { ret = 1; goto out; } @@ -3881,7 +3891,7 @@ out: * regular extents) at the same time to the same zone, which * easily break the write pointer. */ - block_group->zoned_data_reloc_ongoing = 1; + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); fs_info->data_reloc_bg = 0; } spin_unlock(&fs_info->relocation_bg_lock); @@ -4888,6 +4898,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) lockdep_owner = BTRFS_FS_TREE_OBJECTID; + /* btrfs_clean_tree_block() accesses generation field. 
*/ + btrfs_set_header_generation(buf, trans->transid); + /* * This needs to stay, because we could allocate a freed block from an * old tree into a new tree, so we need to make sure this new block is @@ -5249,8 +5262,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; - struct btrfs_key first_key; struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; @@ -5272,7 +5285,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - btrfs_node_key_to_cpu(path->nodes[level], &first_key, + + check.level = level - 1; + check.transid = generation; + check.owner_root = root->root_key.objectid; + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); next = find_extent_buffer(fs_info, bytenr); @@ -5334,8 +5352,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, root->root_key.objectid, - generation, level - 1, &first_key); + next = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { @@ -5639,6 +5656,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { + const bool is_reloc_root = (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5798,6 +5817,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; } + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + btrfs_end_transaction_throttle(trans); if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, @@ -5832,7 +5854,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; } - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { @@ -5864,6 +5886,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_put_root(root); root_dropped = true; out_end_trans: + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + btrfs_end_transaction_throttle(trans); out_free: kfree(wc); @@ -5959,40 +5984,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. 
- */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - struct btrfs_block_group *block_group; - u64 free_bytes = 0; - int factor; - - /* It's df, we don't care if it's racy */ - if (list_empty(&sinfo->ro_bgs)) - return 0; - - spin_lock(&sinfo->lock); - list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { - spin_lock(&block_group->lock); - - if (!block_group->ro) { - spin_unlock(&block_group->lock); - continue; - } - - factor = btrfs_bg_type_to_factor(block_group->flags); - free_bytes += (block_group->length - - block_group->used) * factor; - - spin_unlock(&block_group->lock); - } - spin_unlock(&sinfo->lock); - - return free_bytes; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { @@ -6058,7 +6049,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, - rcu_str_deref(device->name), + btrfs_dev_name(device), device->total_bytes); mutex_unlock(&fs_info->chunk_mutex); ret = 0; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..ae5425253603 --- /dev/null +++ b/fs/btrfs/extent-tree.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_TREE_H +#define BTRFS_EXTENT_TREE_H + +enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, +}; + +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); + +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes); +void btrfs_free_excluded_extents(struct btrfs_block_group *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct extent_buffer *eb); +int btrfs_cross_ref_exist(struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size, + enum btrfs_lock_nesting nest); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 owner, + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_root *root, u64 
ram_bytes, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct extent_buffer *eb, u64 flags, int level); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); + +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); + +#endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf4f19e80e2f..83dd3aa59663 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,7 +20,7 @@ #include "extent_map.h" #include "ctree.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "check-integrity.h" #include "locking.h" #include "rcu-string.h" @@ -30,39 +30,34 @@ #include "zoned.h" #include "block-group.h" #include "compression.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "dev-replace.h" +#include "super.h" -static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set btrfs_bioset; - -static inline bool extent_state_in_tree(const struct extent_state *state) -{ - return !RB_EMPTY_NODE(&state->rb_node); -} #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(states); -static DEFINE_SPINLOCK(leak_lock); - -static inline void btrfs_leak_debug_add(spinlock_t *lock, - struct list_head *new, - struct list_head *head) +static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_add(new, head); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_add(&eb->leak_list, &fs_info->allocated_ebs); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } -static inline void btrfs_leak_debug_del(spinlock_t *lock, - struct list_head *entry) +static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_del(entry); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) @@ -91,53 +86,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } - -static inline void btrfs_extent_state_leak_debug_check(void) -{ - struct extent_state *state; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %u in 
tree %d refs %d\n", - state->start, state->end, state->state, - extent_state_in_tree(state), - refcount_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - } -} - -#define btrfs_debug_check_extent_io_range(tree, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) -static inline void __btrfs_debug_check_extent_io_range(const char *caller, - struct extent_io_tree *tree, u64 start, u64 end) -{ - struct inode *inode = tree->private_data; - u64 isize; - - if (!inode || !is_data_inode(inode)) - return; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} #else -#define btrfs_leak_debug_add(lock, new, head) do {} while (0) -#define btrfs_leak_debug_del(lock, entry) do {} while (0) -#define btrfs_extent_state_leak_debug_check() do {} while (0) -#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#define btrfs_leak_debug_add_eb(eb) do {} while (0) +#define btrfs_leak_debug_del_eb(eb) do {} while (0) #endif -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; -}; - /* * Structure to record info about the bio being assembled, and other info like * how many bytes are there before stripe/ordered extent boundary. @@ -148,42 +101,23 @@ struct btrfs_bio_ctrl { enum btrfs_compression_type compress_type; u32 len_to_stripe_boundary; u32 len_to_oe_boundary; -}; + btrfs_bio_end_io_t end_io_func; -struct extent_page_data { - struct btrfs_bio_ctrl bio_ctrl; - /* tells writepage not to lock the state bits for this range - * it still does the unlocking + /* + * Tell writepage not to lock the state bits for this range, it still + * does the unlocking. 
*/ - unsigned int extent_locked:1; + bool extent_locked; - /* tells the submit_bio code to use REQ_SYNC */ - unsigned int sync_io:1; + /* Tell the submit_bio code to use REQ_SYNC */ + bool sync_io; }; -static int add_extent_changeset(struct extent_state *state, u32 bits, - struct extent_changeset *changeset, - int set) -{ - int ret; - - if (!changeset) - return 0; - if (set && (state->state & bits) == bits) - return 0; - if (!set && (state->state & bits) == 0) - return 0; - changeset->bytes_changed += state->end - state->start + 1; - ret = ulist_add(&changeset->range_changed, state->start, state->end, - GFP_ATOMIC); - return ret; -} - static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct inode *inode; + struct btrfs_inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -191,7 +125,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = bv->bv_page->mapping->host; + inode = BTRFS_I(bv->bv_page->mapping->host); mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ @@ -199,7 +133,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - if (!is_data_inode(inode)) + if (!is_data_inode(&inode->vfs_inode)) btrfs_submit_metadata_bio(inode, bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(inode, bio, mirror_num); @@ -207,42 +141,31 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); - /* The bio is owned by the bi_end_io handler now */ + /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; } /* - * Submit or fail the current bio in an extent_page_data structure. + * Submit or fail the current bio in the bio_ctrl structure. 
*/ -static void submit_write_bio(struct extent_page_data *epd, int ret) +static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) { - struct bio *bio = epd->bio_ctrl.bio; + struct bio *bio = bio_ctrl->bio; if (!bio) return; if (ret) { ASSERT(ret < 0); - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - /* The bio is owned by the bi_end_io handler now */ - epd->bio_ctrl.bio = NULL; + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + /* The bio is owned by the end_io handler now */ + bio_ctrl->bio = NULL; } else { - submit_one_bio(&epd->bio_ctrl); + submit_one_bio(bio_ctrl); } } -int __init extent_state_cache_init(void) -{ - extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); - if (!extent_state_cache) - return -ENOMEM; - return 0; -} - -int __init extent_io_init(void) +int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, @@ -250,32 +173,10 @@ int __init extent_io_init(void) if (!extent_buffer_cache) return -ENOMEM; - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - goto free_buffer_cache; - - if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) - goto free_bioset; - return 0; - -free_bioset: - bioset_exit(&btrfs_bioset); - -free_buffer_cache: - kmem_cache_destroy(extent_buffer_cache); - extent_buffer_cache = NULL; - return -ENOMEM; -} - -void __cold extent_state_cache_exit(void) -{ - btrfs_extent_state_leak_debug_check(); - kmem_cache_destroy(extent_state_cache); } -void __cold extent_io_exit(void) +void __cold extent_buffer_free_cachep(void) { /* * Make sure all delayed rcu free are flushed before we @@ -283,1244 +184,6 @@ void __cold extent_io_exit(void) */ rcu_barrier(); kmem_cache_destroy(extent_buffer_cache); - bioset_exit(&btrfs_bioset); -} - -/* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. - */ -static struct lock_class_key file_extent_tree_class; - -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) -{ - tree->fs_info = fs_info; - tree->state = RB_ROOT; - tree->dirty_bytes = 0; - spin_lock_init(&tree->lock); - tree->private_data = private_data; - tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); -} - -void extent_io_tree_release(struct extent_io_tree *tree) -{ - spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. 
- */ - ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); - - cond_resched_lock(&tree->lock); - } - spin_unlock(&tree->lock); -} - -static struct extent_state *alloc_extent_state(gfp_t mask) -{ - struct extent_state *state; - - /* - * The given mask might not be appropriate for the slab allocator, - * drop the unsupported bits - */ - mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state) - return state; - state->state = 0; - state->failrec = NULL; - RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); - refcount_set(&state->refs, 1); - init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); - return state; -} - -void free_extent_state(struct extent_state *state) -{ - if (!state) - return; - if (refcount_dec_and_test(&state->refs)) { - WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&leak_lock, &state->leak_list); - trace_free_extent_state(state, _RET_IP_); - kmem_cache_free(extent_state_cache, state); - } -} - -/** - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. - * - * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree - * @node_ret: pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret: points to entry which would have been the parent of the entry, - * containing @offset - * - * Return a pointer to the entry that contains @offset byte address and don't change - * @node_ret and @parent_ret. - * - * If no such entry exists, return pointer to entry that ends before @offset - * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. - */ -static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***node_ret, - struct rb_node **parent_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node **node = &root->rb_node; - struct rb_node *prev = NULL; - struct tree_entry *entry; - - while (*node) { - prev = *node; - entry = rb_entry(prev, struct tree_entry, rb_node); - - if (offset < entry->start) - node = &(*node)->rb_left; - else if (offset > entry->end) - node = &(*node)->rb_right; - else - return *node; - } - - if (node_ret) - *node_ret = node; - if (parent_ret) - *parent_ret = prev; - - /* Search neighbors until we find the first one past the end */ - while (prev && offset > entry->end) { - prev = rb_next(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - - return prev; -} - -/* - * Inexact rb-tree search, return the next entry if @offset is not found - */ -static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) -{ - return tree_search_for_insert(tree, offset, NULL, NULL); -} - -/** - * Search offset in the tree or fill neighbor rbtree node pointers. - * - * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * - * Return a pointer to the entry that contains @offset byte address. If no - * such entry exists, then return NULL and fill @prev_ret and @next_ret. - * Otherwise return the found entry and other pointers are left untouched.
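
The inexact lookup that tree_search()/tree_search_for_insert() implement over the rb-tree can be modelled over a plain sorted array: return the entry containing the offset, otherwise the first entry that ends at or after it. The sketch below is illustrative only, with a linear scan instead of an rb-tree and invented names.

/* Illustrative model of the inexact search semantics, not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct entry { uint64_t start, end; };

/* Return the index of the entry containing 'offset', or of the first entry
 * ending at/after it; -1 plays the role of the NULL result. */
static int tree_search_model(const struct entry *e, int n, uint64_t offset)
{
        for (int i = 0; i < n; i++) {
                if (offset <= e[i].end) /* containing entry or the next one */
                        return i;
        }
        return -1;                      /* nothing at or after 'offset' */
}

int main(void)
{
        const struct entry tree[] = { {0, 99}, {200, 299}, {400, 499} };

        printf("%d\n", tree_search_model(tree, 3, 250)); /* 1: contains it */
        printf("%d\n", tree_search_model(tree, 3, 150)); /* 1: next entry */
        printf("%d\n", tree_search_model(tree, 3, 600)); /* -1: past the end */
        return 0;
}
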
- */ -static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, - u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node **node = &root->rb_node; - struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; - struct tree_entry *entry; - - ASSERT(prev_ret); - ASSERT(next_ret); - - while (*node) { - prev = *node; - entry = rb_entry(prev, struct tree_entry, rb_node); - - if (offset < entry->start) - node = &(*node)->rb_left; - else if (offset > entry->end) - node = &(*node)->rb_right; - else - return *node; - } - - orig_prev = prev; - while (prev && offset > entry->end) { - prev = rb_next(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; - - entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < entry->start) { - prev = rb_prev(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - - return NULL; -} - -/* - * utility function to look for merge candidates inside a given range. - * Any extents with matching state are merged together into a single - * extent in the tree. Extents with EXTENT_IO in their state field - * are not merged because the end_io handlers need to be able to do - * operations on them without sleeping (or doing allocations/splits). - * - * This should be called with the tree lock held. - */ -static void merge_state(struct extent_io_tree *tree, - struct extent_state *state) -{ - struct extent_state *other; - struct rb_node *other_node; - - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - return; - - other_node = rb_prev(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->end == state->start - 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } - other_node = rb_next(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->start == state->end + 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 bits, - struct extent_changeset *changeset); - -/* - * insert an extent_state struct into the tree. 'bits' are set on the - * struct before it is inserted. - * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. - * - * The tree lock is not taken internally. This is a utility function and - * probably isn't what you want to call (see set/clear_extent_bit). 
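
The coalescing rule in merge_state() above, join an entry with a neighbour that touches it (prev->end == cur->start - 1) and carries exactly the same bits, can be sketched in userspace like this. An array stands in for the rb-tree and all names are invented; locked/boundary bits, which inhibit merging in the real code, are omitted.

/* Illustrative model of neighbour merging, not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct entry { uint64_t start, end; unsigned int bits; };

/* Merge entry i with touching, identical neighbours; returns the new count. */
static int merge_at(struct entry *e, int n, int i)
{
        if (i > 0 && e[i - 1].end + 1 == e[i].start && e[i - 1].bits == e[i].bits) {
                e[i - 1].end = e[i].end;
                for (int j = i; j < n - 1; j++)
                        e[j] = e[j + 1];
                n--; i--;
        }
        if (i < n - 1 && e[i].end + 1 == e[i + 1].start && e[i].bits == e[i + 1].bits) {
                e[i].end = e[i + 1].end;
                for (int j = i + 1; j < n - 1; j++)
                        e[j] = e[j + 1];
                n--;
        }
        return n;
}

int main(void)
{
        struct entry e[] = { {0, 99, 1}, {100, 199, 1}, {200, 299, 2} };
        int n = merge_at(e, 3, 1);      /* merges left, bits differ on right */

        for (int i = 0; i < n; i++)
                printf("[%llu,%llu] bits=%u\n", (unsigned long long)e[i].start,
                       (unsigned long long)e[i].end, e[i].bits);
        return 0;
}
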
- */ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) -{ - struct rb_node **node; - struct rb_node *parent; - const u64 end = state->end; - - set_state_bits(tree, state, bits, changeset); - - node = &tree->state.rb_node; - while (*node) { - struct tree_entry *entry; - - parent = *node; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (end < entry->start) { - node = &(*node)->rb_left; - } else if (end > entry->end) { - node = &(*node)->rb_right; - } else { - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, end); - return -EEXIST; - } - } - - rb_link_node(&state->rb_node, parent, node); - rb_insert_color(&state->rb_node, &tree->state); - - merge_state(tree, state); - return 0; -} - -/* - * Insert state to @tree to the location given by @node and @parent. - */ -static void insert_state_fast(struct extent_io_tree *tree, - struct extent_state *state, struct rb_node **node, - struct rb_node *parent, unsigned bits, - struct extent_changeset *changeset) -{ - set_state_bits(tree, state, bits, changeset); - rb_link_node(&state->rb_node, parent, node); - rb_insert_color(&state->rb_node, &tree->state); - merge_state(tree, state); -} - -/* - * split a given extent state struct in two, inserting the preallocated - * struct 'prealloc' as the newly created second half. 'split' indicates an - * offset inside 'orig' where it should be split. - * - * Before calling, - * the tree has 'orig' at [orig->start, orig->end]. After calling, there - * are two extent state structs in the tree: - * prealloc: [orig->start, split - 1] - * orig: [ split, orig->end ] - * - * The tree locks are not taken by this function. They need to be held - * by the caller. - */ -static int split_state(struct extent_io_tree *tree, struct extent_state *orig, - struct extent_state *prealloc, u64 split) -{ - struct rb_node *parent = NULL; - struct rb_node **node; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_split_delalloc_extent(tree->private_data, orig, split); - - prealloc->start = orig->start; - prealloc->end = split - 1; - prealloc->state = orig->state; - orig->start = split; - - parent = &orig->rb_node; - node = &parent; - while (*node) { - struct tree_entry *entry; - - parent = *node; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (prealloc->end < entry->start) { - node = &(*node)->rb_left; - } else if (prealloc->end > entry->end) { - node = &(*node)->rb_right; - } else { - free_extent_state(prealloc); - return -EEXIST; - } - } - - rb_link_node(&prealloc->rb_node, parent, node); - rb_insert_color(&prealloc->rb_node, &tree->state); - - return 0; -} - -static struct extent_state *next_state(struct extent_state *state) -{ - struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; -} - -/* - * utility function to clear some bits in an extent state struct. - * it will optionally wake up anyone waiting on this state (wake == 1). 
- * - * If no bits are set on the state struct after clearing things, the - * struct is freed and removed from the tree - */ -static struct extent_state *clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, int wake, - struct extent_changeset *changeset) -{ - struct extent_state *next; - u32 bits_to_clear = bits & ~EXTENT_CTLBITS; - int ret; - - if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - WARN_ON(range > tree->dirty_bytes); - tree->dirty_bytes -= range; - } - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); - - ret = add_extent_changeset(state, bits_to_clear, changeset, 0); - BUG_ON(ret < 0); - state->state &= ~bits_to_clear; - if (wake) - wake_up(&state->wq); - if (state->state == 0) { - next = next_state(state); - if (extent_state_in_tree(state)) { - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); - } else { - WARN_ON(1); - } - } else { - merge_state(tree, state); - next = next_state(state); - } - return next; -} - -static struct extent_state * -alloc_extent_state_atomic(struct extent_state *prealloc) -{ - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); - - return prealloc; -} - -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) -{ - btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); -} - -/* - * clear some bits on a range in the tree. This may require splitting - * or inserting elements in the tree, so the gfp mask is used to - * indicate which allocations or sleeping are allowed. - * - * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove - * the given range from the tree regardless of state (ie for truncate). - * - * the range [start, end] is inclusive. - * - * This takes the tree lock, and returns 0 on success and < 0 on error. - */ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *cached; - struct extent_state *prealloc = NULL; - struct rb_node *node; - u64 last_end; - int err; - int clear = 0; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); - - if (bits & EXTENT_DELALLOC) - bits |= EXTENT_NORESERVE; - - if (delete) - bits |= ~EXTENT_CTLBITS; - - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - clear = 1; -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * We don't care about allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover any other range. - * If we end up needing a new extent state we allocate it later.
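
The split-at-the-boundaries behaviour that __clear_extent_bit() relies on can be modelled compactly. This sketch assumes a sorted array with spare capacity in place of the rb-tree and preallocated states, so it only illustrates the range arithmetic: an entry extending past the target range is split, the overlap loses the bits, and a piece whose bits drop to zero is removed.

/* Illustrative model of clear-with-split, not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct entry { uint64_t start, end; unsigned int bits; };

/* Clear 'bits' over [start,end]; assumes sorted, non-overlapping entries
 * and enough spare slots in e[] for the splits. Returns the new count. */
static int clear_bits(struct entry *e, int n, uint64_t start, uint64_t end,
                      unsigned int bits)
{
        for (int i = 0; i < n; i++) {
                if (e[i].end < start || e[i].start > end)
                        continue;
                if (e[i].start < start) {       /* split off untouched head */
                        for (int j = n; j > i; j--)
                                e[j] = e[j - 1];
                        n++;
                        e[i].end = start - 1;
                        e[i + 1].start = start;
                        continue;               /* revisit the tail piece */
                }
                if (e[i].end > end) {           /* split off untouched tail */
                        for (int j = n; j > i; j--)
                                e[j] = e[j - 1];
                        n++;
                        e[i].end = end;
                        e[i + 1].start = end + 1;
                }
                e[i].bits &= ~bits;
                if (e[i].bits == 0) {           /* fully cleared: drop it */
                        for (int j = i; j < n - 1; j++)
                                e[j] = e[j + 1];
                        n--; i--;
                }
        }
        return n;
}

int main(void)
{
        struct entry e[8] = { {0, 999, 0x1} };
        int n = clear_bits(e, 1, 200, 499, 0x1);

        for (int i = 0; i < n; i++)     /* prints [0,199] and [500,999] */
                printf("[%llu,%llu] bits=%u\n", (unsigned long long)e[i].start,
                       (unsigned long long)e[i].end, e[i].bits);
        return 0;
}
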
- */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state) { - cached = *cached_state; - - if (clear) { - *cached_state = NULL; - cached_state = NULL; - } - - if (cached && extent_state_in_tree(cached) && - cached->start <= start && cached->end > start) { - if (clear) - refcount_dec(&cached->refs); - state = cached; - goto hit_next; - } - if (clear) - free_extent_state(cached); - } - /* - * this search will find the extents that end after - * our range starts - */ - node = tree_search(tree, start); - if (!node) - goto out; - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - if (state->start > end) - goto out; - WARN_ON(state->end < start); - last_end = state->end; - - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) { - state = next_state(state); - goto next; - } - - /* - * | ---- desired range ---- | - * | state | or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip - * bits on second half. - * - * If the extent we found extends past our range, we - * just split and search again. It'll get split again - * the next time though. - * - * If the extent we found is inside our range, we clear - * the desired bit on it. - */ - - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - state = clear_state_bit(tree, state, bits, wake, changeset); - goto next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and clear the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - if (wake) - wake_up(&state->wq); - - clear_state_bit(tree, prealloc, bits, wake, changeset); - - prealloc = NULL; - goto out; - } - - state = clear_state_bit(tree, state, bits, wake, changeset); -next: - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start <= end && state && !need_resched()) - goto hit_next; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; - -} - -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - -/* - * waits for one or more bits to clear on a range in the state tree. - * The range [start, end] is inclusive. 
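
wait_on_state()/wait_extent_bit() follow the classic sleep-until-condition pattern: drop the lock, sleep, retake the lock, and re-test. A rough pthread analogue (condition variable instead of a per-state waitqueue; compile with -pthread) looks like this; the names are invented for the demo.

/* Illustrative userspace analogue, not kernel code. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static unsigned int state_bits = 0x1;   /* bit 0x1 starts out set */

static void wait_bits_clear(unsigned int bits)
{
        pthread_mutex_lock(&lock);
        while (state_bits & bits)       /* re-test after every wakeup */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

static void *clearer(void *arg)
{
        (void)arg;
        sleep(1);
        pthread_mutex_lock(&lock);
        state_bits &= ~0x1u;
        pthread_cond_broadcast(&cond);  /* wake_up(&state->wq) analogue */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, clearer, NULL);
        wait_bits_clear(0x1);
        printf("bits cleared\n");
        pthread_join(t, NULL);
        return 0;
}
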
- * The tree lock is taken by this function - */ -static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - struct extent_state *state; - struct rb_node *node; - - btrfs_debug_check_extent_io_range(tree, start, end); - - spin_lock(&tree->lock); -again: - while (1) { - /* - * this search will find all the extents that end after - * our range starts - */ - node = tree_search(tree, start); -process_node: - if (!node) - break; - - state = rb_entry(node, struct extent_state, rb_node); - - if (state->start > end) - goto out; - - if (state->state & bits) { - start = state->start; - refcount_inc(&state->refs); - wait_on_state(tree, state); - free_extent_state(state); - goto again; - } - start = state->end + 1; - - if (start > end) - break; - - if (!cond_resched_lock(&tree->lock)) { - node = rb_next(node); - goto process_node; - } - } -out: - spin_unlock(&tree->lock); -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) -{ - u32 bits_to_set = bits & ~EXTENT_CTLBITS; - int ret; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_set_delalloc_extent(tree->private_data, state, bits); - - if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - tree->dirty_bytes += range; - } - ret = add_extent_changeset(state, bits_to_set, changeset, 1); - BUG_ON(ret < 0); - state->state |= bits_to_set; -} - -static void cache_state_if_flags(struct extent_state *state, - struct extent_state **cached_ptr, - unsigned flags) -{ - if (cached_ptr && !(*cached_ptr)) { - if (!flags || (state->state & flags)) { - *cached_ptr = state; - refcount_inc(&state->refs); - } - } -} - -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) -{ - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); -} - -/* - * set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. - * - * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. - * - * [start, end] is inclusive. This takes the tree lock. - */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - u32 exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask, - struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); - - if (exclusive_bits) - ASSERT(failed_start); - else - ASSERT(failed_start == NULL); -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * We don't care about allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover any other range. - * If we end up needing a new extent state we allocate it later.
- */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - prealloc->start = start; - prealloc->end = end; - insert_state_fast(tree, prealloc, p, parent, bits, changeset); - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - if (state->state & exclusive_bits) { - *failed_start = state->start; - err = -EEXIST; - goto out; - } - - set_state_bits(tree, state, bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - /* - * If this extent already has all the bits we want set, then - * skip it, not necessary to split it or do anything with it. - */ - if ((state->state & bits) == bits) { - start = state->end + 1; - cache_state(state, cached_state); - goto search_again; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - - /* - * Avoid freeing 'prealloc' if it can be merged with - * the later extent.
- */ - prealloc->start = start; - prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, bits, changeset); - cache_state(prealloc, cached_state); - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; - -} - -/** - * convert_extent_bit - convert all bits in a given range from one bit to - * another - * @tree: the io tree to search - * @start: the start offset in bytes - * @end: the end offset in bytes (inclusive) - * @bits: the bits to set in this range - * @clear_bits: the bits to clear in this range - * @cached_state: state that we're going to cache - * - * This will go through and set bits for the given range. If any states exist - * already in this range they are set with the given bit and cleared of the - * clear_bits. This is only meant to be used by things that are mergeable, ie - * converting from say DELALLOC to DIRTY. This is not meant to be used with - * boundary bits like LOCK. - * - * All allocations are done with GFP_NOFS. - */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - bool first_iteration = true; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, - clear_bits); - -again: - if (!prealloc) { - /* - * Best effort, don't worry if extent state allocation fails - * here for the first iteration. We might have a cached state - * that matches exactly the target range, in which case no - * extent state allocations are needed. We'll only know this - * after locking the tree. - */ - prealloc = alloc_extent_state(GFP_NOFS); - if (!prealloc && !first_iteration) - return -ENOMEM; - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - - /* - * this search will find all the extents that end after - * our range starts. 
- */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - prealloc->start = start; - prealloc->end = end; - insert_state_fast(tree, prealloc, p, parent, bits, NULL); - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - set_state_bits(tree, state, bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - /* - * Avoid freeing 'prealloc' if it can be merged with - * the later extent.
- */ - prealloc->start = start; - prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, NULL); - if (err) - extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, bits, NULL); - cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, clear_bits, 0, NULL); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - cond_resched(); - first_iteration = false; - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; -} - -/* wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. - */ - BUG_ON(bits & EXTENT_LOCKED); - - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - changeset); -} - -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, - GFP_NOWAIT, NULL); -} - -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, wake, delete, - cached, GFP_NOFS, NULL); -} - -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * Don't support EXTENT_LOCKED case, same reason as - * set_record_extent_bits(). - */ - BUG_ON(bits & EXTENT_LOCKED); - - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, - changeset); -} - -/* - * either insert or lock the state struct between start and end; use mask to - * tell us if waiting is desired. - */ -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) -{ - int err; - u64 failed_start; - - while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS, NULL); - if (err == -EEXIST) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else - break; - WARN_ON(start > end); - } - return err; -} - -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - int err; - u64 failed_start; - - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS, NULL); - if (err == -EEXIST) { - if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL); - return 0; - } - return 1; } void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) @@ -1554,295 +217,6 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) } } -/* find the first state struct with 'bits' set after 'start', and - * return it.
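
try_lock_extent() above has one subtle step: on -EEXIST it must unlock the prefix it had already locked before hitting the collision. A userspace model of that rollback, using a flat per-unit bitmap instead of extent states; all names here are invented for the demo.

/* Illustrative model of the try-lock rollback, not kernel code. */
#include <stdio.h>

#define NUNITS 16
#define BIT_LOCKED 0x1u

static unsigned int units[NUNITS];

/* Returns 1 on success; 0 if [start,end] collided with an existing lock. */
static int try_lock_range(unsigned int start, unsigned int end)
{
        for (unsigned int i = start; i <= end; i++) {
                if (units[i] & BIT_LOCKED) {
                        /* failed_start > start: clear what we just set */
                        for (unsigned int j = start; j < i; j++)
                                units[j] &= ~BIT_LOCKED;
                        return 0;
                }
                units[i] |= BIT_LOCKED;
        }
        return 1;
}

int main(void)
{
        units[5] = BIT_LOCKED;                  /* pre-existing lock */
        printf("lock [2,8]: %d\n", try_lock_range(2, 8)); /* 0: hits unit 5 */
        printf("unit 4 bit: %u\n", units[4] & BIT_LOCKED); /* 0: rolled back */
        printf("lock [0,4]: %d\n", try_lock_range(0, 4)); /* 1: now free */
        return 0;
}
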
tree->lock must be held. NULL will be returned if - * nothing was found after 'start' - */ -static struct extent_state * -find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) -{ - struct rb_node *node; - struct extent_state *state; - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) - return state; - - node = rb_next(node); - if (!node) - break; - } -out: - return NULL; -} - -/* - * Find the first offset in the io tree with one or more @bits set. - * - * Note: If there are multiple bits set in @bits, any of them will match. - * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. - */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->end == start - 1 && extent_state_in_tree(state)) { - while ((state = next_state(state)) != NULL) { - if (state->state & bits) - goto got_it; - } - free_extent_state(*cached_state); - *cached_state = NULL; - goto out; - } - free_extent_state(*cached_state); - *cached_state = NULL; - } - - state = find_first_extent_bit_state(tree, start, bits); -got_it: - if (state) { - cache_state_if_flags(state, cached_state, 0); - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - } -out: - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find a contiguous area of bits - * - * @tree: io tree to check - * @start: offset to start the search from - * @start_ret: the first offset we found with the bits set - * @end_ret: the final contiguous range of the bits that were set - * @bits: bits to look for - * - * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges - * to set bits appropriately, and then merge them again. During this time it - * will drop the tree->lock, so use this helper if you want to find the actual - * contiguous area for given bits. We will search to the first bit we find, and - * then walk down the tree until we find a non-contiguous area. The area - * returned will be the full contiguous area with the bits set. - */ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, bits); - if (state) { - *start_ret = state->start; - *end_ret = state->end; - while ((state = next_state(state)) != NULL) { - if (state->start > (*end_ret + 1)) - break; - *end_ret = state->end; - } - ret = 0; - } - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find the first range that has @bits not set. This range could start before - * @start. - * - * @tree: the tree to search - * @start: offset at/after which the found extent should start - * @start_ret: records the beginning of the range - * @end_ret: records the end of the range (inclusive) - * @bits: the set of bits which must be unset - * - * Since unallocated range is also considered one which doesn't have the bits - * set, it's possible that @end_ret contains -1, this happens in case the range - * spans (last_range_end, end of device].
In this case it's up to the caller to - * trim @end_ret to the appropriate size. - */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - struct rb_node *node, *prev = NULL, *next; - - spin_lock(&tree->lock); - - /* Find first extent with bits cleared */ - while (1) { - node = tree_search_prev_next(tree, start, &prev, &next); - if (!node && !next && !prev) { - /* - * Tree is completely empty, send full range and let - * caller deal with it - */ - *start_ret = 0; - *end_ret = -1; - goto out; - } else if (!node && !next) { - /* - * We are past the last allocated chunk, set start at - * the end of the last extent. - */ - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } else if (!node) { - node = next; - } - /* - * At this point 'node' either contains 'start' or start is - * before 'node' - */ - state = rb_entry(node, struct extent_state, rb_node); - - if (in_range(start, state->start, state->end - state->start + 1)) { - if (state->state & bits) { - /* - * |--range with bits sets--| - * | - * start - */ - start = state->end + 1; - } else { - /* - * 'start' falls within a range that doesn't - * have the bits set, so take its start as - * the beginning of the desired range - * - * |--range with bits cleared----| - * | - * start - */ - *start_ret = state->start; - break; - } - } else { - /* - * |---prev range---|---hole/unset---|---node range---| - * | - * start - * - * or - * - * |---hole/unset--||--first node--| - * 0 | - * start - */ - if (prev) { - state = rb_entry(prev, struct extent_state, - rb_node); - *start_ret = state->end + 1; - } else { - *start_ret = 0; - } - break; - } - } - - /* - * Find the longest stretch from start until an entry which has the - * bits set - */ - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && !(state->state & bits)) { - *end_ret = state->end; - } else { - *end_ret = state->start - 1; - break; - } - - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); -} - -/* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, - * - * true is returned if we find something, false if nothing was in the tree - */ -bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, - u64 *end, u64 max_bytes, - struct extent_state **cached_state) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - bool found = false; - u64 total_bytes = 0; - - spin_lock(&tree->lock); - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) { - *end = (u64)-1; - goto out; - } - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (found && (state->start != cur_start || - (state->state & EXTENT_BOUNDARY))) { - goto out; - } - if (!(state->state & EXTENT_DELALLOC)) { - if (!found) - *end = state->end; - goto out; - } - if (!found) { - *start = state->start; - *cached_state = state; - refcount_inc(&state->refs); - } - found = true; - *end = state->end; - cur_start = state->end + 1; - node = rb_next(node); - total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) - break; - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return found; -} - /* * Process one page for __process_pages_contig(). 
* @@ -1900,9 +274,8 @@ static int __process_pages_contig(struct address_space *mapping, pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - struct page *pages[16]; + struct folio_batch fbatch; int err = 0; int i; @@ -1911,16 +284,17 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) mapping_set_error(mapping, -EIO); - while (nr_pages > 0) { - int found_pages; + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); - found_pages = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (found_pages == 0) { + if (found_folios == 0) { /* * Only if we're going to lock these pages, we can find * nothing at @index. @@ -1930,23 +304,20 @@ static int __process_pages_contig(struct address_space *mapping, goto out; } - for (i = 0; i < found_pages; i++) { + for (i = 0; i < found_folios; i++) { int process_ret; - + struct folio *folio = fbatch.folios[i]; process_ret = process_one_page(fs_info, mapping, - pages[i], locked_page, page_ops, + &folio->page, locked_page, page_ops, start, end); if (process_ret < 0) { - for (; i < found_pages; i++) - put_page(pages[i]); err = -EAGAIN; + folio_batch_release(&fbatch); goto out; } - put_page(pages[i]); - pages_processed++; + pages_processed += folio_nr_pages(folio); } - nr_pages -= found_pages; - index += found_pages; + folio_batch_release(&fbatch); cond_resched(); } out: @@ -2094,14 +465,14 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); + lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent_cached(tree, delalloc_start, delalloc_end, - &cached_state); + unlock_extent(tree, delalloc_start, delalloc_end, + &cached_state); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); @@ -2118,324 +489,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start, end, page_ops, NULL); } -/* - * count the number of bytes in the tree that have a given bit(s) - * set. This can be fairly slow, except for EXTENT_DIRTY which is - * cached. The total number found is returned. 
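
The new filemap_get_folios_contig() loop in __process_pages_contig() above processes a contiguous index range in bounded batches and stops at the first hole, releasing each batch before refilling. The control flow can be modelled with plain arrays and invented helpers; this is only a sketch of the loop shape, not the kernel API.

/* Illustrative model of batched contiguous-range processing, not kernel code. */
#include <stdio.h>

#define BATCH_MAX 15    /* a folio_batch holds a small fixed number of entries */

/* Pretend lookup: fill 'batch' with up to BATCH_MAX consecutive indices
 * starting at *index that are "present", advancing *index past them. */
static int get_batch_contig(const int *present, int end_index,
                            int *index, int *batch)
{
        int n = 0;

        while (n < BATCH_MAX && *index <= end_index && present[*index])
                batch[n++] = (*index)++;
        return n;
}

int main(void)
{
        int present[32] = {0};
        int batch[BATCH_MAX];
        int index = 0, end_index = 20, processed = 0;

        for (int i = 0; i <= 10; i++)   /* indices 11..20 are "missing" */
                present[i] = 1;

        while (index <= end_index) {
                int found = get_batch_contig(present, end_index, &index, batch);

                if (found == 0)         /* hole: nothing contiguous left */
                        break;
                processed += found;     /* process_one_page() stand-in */
                /* the batch is released here before the next refill */
        }
        printf("processed %d of %d\n", processed, end_index + 1);
        return 0;
}
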
- */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) +static int insert_failrec(struct btrfs_inode *inode, + struct io_failure_record *failrec) { - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 total_bytes = 0; - u64 last = 0; - int found = 0; + struct rb_node *exist; - if (WARN_ON(search_end <= cur_start)) - return 0; + spin_lock(&inode->io_failure_lock); + exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, + &failrec->rb_node); + spin_unlock(&inode->io_failure_lock); - spin_lock(&tree->lock); - if (cur_start == 0 && bits == EXTENT_DIRTY) { - total_bytes = tree->dirty_bytes; - goto out; - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > search_end) - break; - if (contig && found && state->start > last + 1) - break; - if (state->end >= cur_start && (state->state & bits) == bits) { - total_bytes += min(search_end, state->end) + 1 - - max(cur_start, state->start); - if (total_bytes >= max_bytes) - break; - if (!found) { - *start = max(cur_start, state->start); - found = 1; - } - last = state->end; - } else if (contig && found) { - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return total_bytes; + return (exist == NULL) ? 0 : -EEXIST; } -/* - * set the private field for a given byte offset in the tree. If there isn't - * an extent_state there already, this does nothing. - */ -int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec) +static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) { struct rb_node *node; - struct extent_state *state; - int ret = 0; + struct io_failure_record *failrec = ERR_PTR(-ENOENT); - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - state->failrec = failrec; -out: - spin_unlock(&tree->lock); - return ret; -} - -struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) -{ - struct rb_node *node; - struct extent_state *state; - struct io_failure_record *failrec; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - - failrec = state->failrec; -out: - spin_unlock(&tree->lock); + spin_lock(&inode->io_failure_lock); + node = rb_simple_search(&inode->io_failure_tree, start); + if (node) + failrec = rb_entry(node, struct io_failure_record, rb_node); + spin_unlock(&inode->io_failure_lock); return failrec; } -/* - * searches a range in the state tree for a given mask. - * If 'filled' == 1, this returns 1 only if every extent in the tree - * has the bits set. Otherwise, 1 is returned if any bit in the - * range is found set. 
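
The io-failure records shown earlier in this hunk now live in a per-inode structure keyed by bytenr, with duplicate inserts rejected the way rb_simple_insert() reports an existing node. A small userspace model of those semantics, with a linked list standing in for the rbtree and all names invented:

/* Illustrative model of keyed insert-or-EEXIST storage, not kernel code. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct failrec { uint64_t bytenr; int failed_mirror; struct failrec *next; };

static struct failrec *failure_list;    /* stands in for the rbtree root */

static int insert_failrec_model(struct failrec *rec)
{
        for (struct failrec *r = failure_list; r; r = r->next)
                if (r->bytenr == rec->bytenr)
                        return -EEXIST;
        rec->next = failure_list;
        failure_list = rec;
        return 0;
}

static struct failrec *get_failrec_model(uint64_t bytenr)
{
        for (struct failrec *r = failure_list; r; r = r->next)
                if (r->bytenr == bytenr)
                        return r;
        return NULL;    /* the kernel version returns ERR_PTR(-ENOENT) */
}

int main(void)
{
        struct failrec a = { 4096, 1, NULL }, b = { 4096, 2, NULL };

        printf("insert a: %d\n", insert_failrec_model(&a));    /* 0 */
        printf("insert b: %d\n", insert_failrec_model(&b));    /* -EEXIST */
        printf("found:    %d\n", get_failrec_model(4096)->failed_mirror);
        return 0;
}
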
- */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) +static void free_io_failure(struct btrfs_inode *inode, + struct io_failure_record *rec) { - struct extent_state *state = NULL; - struct rb_node *node; - int bitset = 0; - - spin_lock(&tree->lock); - if (cached && extent_state_in_tree(cached) && cached->start <= start && - cached->end > start) - node = &cached->rb_node; - else - node = tree_search(tree, start); - while (node && start <= end) { - state = rb_entry(node, struct extent_state, rb_node); - - if (filled && state->start > start) { - bitset = 0; - break; - } - - if (state->start > end) - break; - - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; - break; - } - - if (state->end == (u64)-1) - break; - - start = state->end + 1; - if (start > end) - break; - node = rb_next(node); - if (!node) { - if (filled) - bitset = 0; - break; - } - } - spin_unlock(&tree->lock); - return bitset; -} - -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec) -{ - int ret; - int err = 0; - - set_state_failrec(failure_tree, rec->start, NULL); - ret = clear_extent_bits(failure_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret) - err = ret; - - ret = clear_extent_bits(io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED); - if (ret && !err) - err = ret; + spin_lock(&inode->io_failure_lock); + rb_erase(&rec->rb_node, &inode->io_failure_tree); + spin_unlock(&inode->io_failure_lock); kfree(rec); - return err; -} - -/* - * this bypasses the standard btrfs submit functions deliberately, as - * the standard behavior is to write all copies in a raid setup. here we only - * want to write the one bad copy. so we do the mapping for ourselves and issue - * submit_bio directly. - * to avoid any synchronization issues, wait for the data after writing, which - * actually prevents the read that triggered the error from finishing. - * currently, there can be no more than two copies of every data bit. thus, - * exactly one rewrite is required. - */ -static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) -{ - struct btrfs_device *dev; - struct bio_vec bvec; - struct bio bio; - u64 map_length = 0; - u64 sector; - struct btrfs_io_context *bioc = NULL; - int ret = 0; - - ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); - BUG_ON(!mirror_num); - - if (btrfs_repair_one_zone(fs_info, logical)) - return 0; - - map_length = length; - - /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are doing the - * read repair operation. - */ - btrfs_bio_counter_inc_blocked(fs_info); - if (btrfs_is_parity_mirror(fs_info, logical, length)) { - /* - * Note that we don't use BTRFS_MAP_WRITE because it's supposed - * to update all raid stripes, but here we just want to correct - * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad - * stripe's dev and sector. 
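
repair_io_failure() deliberately writes just the one bad copy and uses a synchronous write so the repair completes before the triggering read finishes. A loose userspace analogue using pwrite()/pread() on replica files; the file name, layout, and helper are invented for the demo and the file must already exist with at least one sector of data.

/* Illustrative userspace analogue of single-mirror repair, not kernel code. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define SECTOR 4096

static int repair_one_mirror(const char *path, off_t off, const void *good)
{
        char check[SECTOR];
        int fd = open(path, O_RDWR);

        if (fd < 0)
                return -1;
        if (pwrite(fd, good, SECTOR, off) != SECTOR) {  /* fix the bad copy */
                close(fd);
                return -1;
        }
        fsync(fd);                                      /* REQ_SYNC analogue */
        if (pread(fd, check, SECTOR, off) != SECTOR ||
            memcmp(check, good, SECTOR) != 0) {         /* verify the rewrite */
                close(fd);
                return -1;
        }
        close(fd);
        return 0;
}

int main(void)
{
        char good[SECTOR];

        memset(good, 0xab, sizeof(good));
        /* mirror1.img models the copy that failed its checksum; the other
         * replicas are left untouched, unlike a normal raid write. */
        if (repair_one_mirror("mirror1.img", 0, good) == 0)
                printf("read error corrected\n");
        return 0;
}
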
- */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bioc, 0); - if (ret) - goto out_counter_dec; - ASSERT(bioc->mirror_num == 1); - } else { - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bioc, mirror_num); - if (ret) - goto out_counter_dec; - BUG_ON(mirror_num != bioc->mirror_num); - } - - sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - dev = bioc->stripes[bioc->mirror_num - 1].dev; - btrfs_put_bioc(bioc); - - if (!dev || !dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - ret = -EIO; - goto out_counter_dec; - } - - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = sector; - __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); - ret = submit_bio_wait(&bio); - if (ret) { - /* try to remap that extent elsewhere? */ - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - goto out_bio_uninit; - } - - btrfs_info_rl_in_rcu(fs_info, - "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, - rcu_str_deref(dev->name), sector); - ret = 0; - -out_bio_uninit: - bio_uninit(&bio); -out_counter_dec: - btrfs_bio_counter_dec(fs_info); - return ret; -} - -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; - int i, num_pages = num_extent_pages(eb); - int ret = 0; - - if (sb_rdonly(fs_info->sb)) - return -EROFS; - - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, - start - page_offset(p), mirror_num); - if (ret) - break; - start += PAGE_SIZE; - } - - return ret; } static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) @@ -2456,24 +549,18 @@ static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset) +int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset) { - u64 private; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + u64 ino = btrfs_ino(inode); + u64 locked_start, locked_end; struct io_failure_record *failrec; - struct extent_state *state; int mirror; int ret; - private = 0; - ret = count_range_bits(failure_tree, &private, (u64)-1, 1, - EXTENT_DIRTY, 0); - if (!ret) - return 0; - - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(inode, start); if (IS_ERR(failrec)) return 0; @@ -2482,25 +569,21 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, if (sb_rdonly(fs_info->sb)) goto out; - spin_lock(&io_tree->lock); - state = find_first_extent_bit_state(io_tree, - failrec->start, - EXTENT_LOCKED); - spin_unlock(&io_tree->lock); - - if (!state || state->start > failrec->start || - state->end < failrec->start + failrec->len - 1) + ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, + &locked_end, EXTENT_LOCKED, NULL); + if (ret || locked_start > failrec->bytenr || + locked_end < failrec->bytenr + failrec->len - 1) goto out; mirror = failrec->this_mirror; do { mirror = prev_mirror(failrec, mirror); - 
repair_io_failure(fs_info, ino, start, failrec->len, + btrfs_repair_io_failure(fs_info, ino, start, failrec->len, failrec->logical, page, pg_offset, mirror); } while (mirror != failrec->failed_mirror); out: - free_io_failure(failure_tree, io_tree, failrec); + free_io_failure(inode, failrec); return 0; } @@ -2512,30 +595,26 @@ out: */ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) { - struct extent_io_tree *failure_tree = &inode->io_failure_tree; struct io_failure_record *failrec; - struct extent_state *state, *next; + struct rb_node *node, *next; - if (RB_EMPTY_ROOT(&failure_tree->state)) + if (RB_EMPTY_ROOT(&inode->io_failure_tree)) return; - spin_lock(&failure_tree->lock); - state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); - while (state) { - if (state->start > end) + spin_lock(&inode->io_failure_lock); + node = rb_simple_search_first(&inode->io_failure_tree, start); + while (node) { + failrec = rb_entry(node, struct io_failure_record, rb_node); + if (failrec->bytenr > end) break; - ASSERT(state->end <= end); - - next = next_state(state); - - failrec = state->failrec; - free_extent_state(state); + next = rb_next(node); + rb_erase(&failrec->rb_node, &inode->io_failure_tree); kfree(failrec); - state = next; + node = next; } - spin_unlock(&failure_tree->lock); + spin_unlock(&inode->io_failure_lock); } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, @@ -2545,16 +624,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(BTRFS_I(inode), start); if (!IS_ERR(failrec)) { btrfs_debug(fs_info, "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->start, failrec->len); + failrec->logical, failrec->bytenr, failrec->len); /* * when data can be on disk more than twice, add to failrec here * (e.g. 
with a list for failed_mirror) to make @@ -2569,7 +646,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode if (!failrec) return ERR_PTR(-ENOMEM); - failrec->start = start; + RB_CLEAR_NODE(&failrec->rb_node); + failrec->bytenr = start; failrec->len = sectorsize; failrec->failed_mirror = bbio->mirror_num; failrec->this_mirror = bbio->mirror_num; @@ -2594,14 +672,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode } /* Set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret >= 0) { - ret = set_state_failrec(failure_tree, start, failrec); - /* Set the bits in the inode's tree */ - ret = set_extent_bits(tree, start, start + sectorsize - 1, - EXTENT_DAMAGED); - } else if (ret < 0) { + ret = insert_failrec(BTRFS_I(inode), failrec); + if (ret) { kfree(failrec); return ERR_PTR(ret); } @@ -2609,15 +681,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook) + bool submit_buffered) { u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; @@ -2628,7 +698,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); + failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); @@ -2639,24 +709,22 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * * Since we're only doing repair for one sector, we only need to get * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. + * everything for btrfs_repair_io_failure to do the rest for us. 
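The repair path walks the available copies with next_mirror()/prev_mirror(), whose bodies fall outside these hunks. A sketch of the wrap-around arithmetic they imply, assuming the 1-based mirror numbering up to failrec->num_copies used throughout this code:

static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
{
	/* Wrap from the last copy back to mirror 1. */
	if (cur_mirror == failrec->num_copies)
		return cur_mirror + 1 - failrec->num_copies;
	return cur_mirror + 1;
}

static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
{
	/* Wrap from mirror 1 back to the last copy. */
	if (cur_mirror == 1)
		return failrec->num_copies;
	return cur_mirror - 1;
}

Starting from failed_mirror and stopping once the walk returns to it guarantees every other copy is tried exactly once.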
*/ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); if (failrec->this_mirror == failrec->failed_mirror) { btrfs_debug(fs_info, "failed to repair num_copies %d this_mirror %d failed_mirror %d", failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(failure_tree, tree, failrec); + free_io_failure(inode, failrec); return -EIO; } - repair_bio = btrfs_bio_alloc(1); + repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, + failed_bbio->private); repair_bbio = btrfs_bio(repair_bio); repair_bbio->file_offset = start; - repair_bio->bi_opf = REQ_OP_READ; - repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - repair_bio->bi_private = failed_bio->bi_private; if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; @@ -2669,16 +737,21 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, bio_add_page(repair_bio, page, failrec->len, pgoff); repair_bbio->iter = repair_bio->bi_iter; - btrfs_debug(btrfs_sb(inode->i_sb), + btrfs_debug(fs_info, "repair read error: submitting new read to mirror %d", failrec->this_mirror); /* - * At this point we have a bio, so any errors from submit_bio_hook() - * will be handled by the endio on the repair_bio, so we can't return an + * At this point we have a bio, so any errors from bio submission will + * be handled by the endio on the repair_bio, so we can't return an * error here. */ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); + if (submit_buffered) + btrfs_submit_data_read_bio(inode, repair_bio, + failrec->this_mirror, 0); + else + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); + return BLK_STS_OK; } @@ -2714,14 +787,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); const u32 sectorsize = inode->root->fs_info->sectorsize; - struct extent_state *cached = NULL; end_page_read(page, uptodate, offset, sectorsize); - if (uptodate) - set_extent_uptodate(&inode->io_tree, offset, - offset + sectorsize - 1, &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&inode->io_tree, offset, - offset + sectorsize - 1, &cached); + unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); } static void submit_data_read_repair(struct inode *inode, @@ -2767,9 +835,9 @@ static void submit_data_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bbio, + ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, bio_offset + offset, page, pgoff + offset, - btrfs_submit_data_read_bio); + true); if (!ret) { /* * We have submitted the read repair, the page release @@ -2823,8 +891,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio) +static void end_bio_extent_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; @@ -2924,11 +993,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. 
*/ - if (processed->uptodate && tree->track_uptodate) - set_extent_uptodate(tree, processed->start, processed->end, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(tree, processed->start, processed->end, - &cached); + unlock_extent(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ @@ -2988,11 +1053,10 @@ static struct extent_buffer *find_extent_buffer_readpage( * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct bio *bio) +static void end_bio_extent_readpage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; - struct btrfs_bio *bbio = btrfs_bio(bio); - struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* * The offset to the beginning of a bio, since one bio can never be @@ -3019,8 +1083,6 @@ static void end_bio_extent_readpage(struct bio *bio) "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; /* * We always issue full-sector reads, but if some block in a @@ -3061,9 +1123,7 @@ static void end_bio_extent_readpage(struct bio *bio) loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, page, - btrfs_ino(BTRFS_I(inode)), 0); + btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); /* * Zero out the remaining part if this range straddles @@ -3126,7 +1186,7 @@ static void end_bio_extent_readpage(struct bio *bio) bio_put(bio); } -/** +/* * Populate every free slot in a provided array with pages. * * @nr_pages: number of pages to allocate @@ -3163,59 +1223,15 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) } /* - * Initialize the members up to but not including 'bio'. Use after allocating a - * new bio by bio_alloc_bioset as it does not initialize the bytes outside of - * 'bio' because use of __GFP_ZERO is not supported. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); -} - -/* - * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. + * Attempt to add a page to bio. * - * The bio allocation is backed by bioset and does not fail. 
- */ -struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) -{ - struct bio *bio; - - ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio)); - return bio; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) -{ - struct bio *bio; - struct btrfs_bio *bbio; - - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); - - /* this will never fail when it's backed by a bioset */ - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - ASSERT(bio); - - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio); - - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; -} - -/** - * Attempt to add a page to bio - * - * @bio_ctrl: record both the bio, and its bio_flags - * @page: page to add to the bio - * @disk_bytenr: offset of the new bio or to check whether we are adding - * a contiguous page to the previous one - * @size: portion of page that we want to write - * @pg_offset: starting offset in the page - * @compress_type: compression type of the current bio to see if we can merge them + * @bio_ctrl: record both the bio, and its bio_flags + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @size: portion of page that we want to write + * @pg_offset: starting offset in the page + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -3351,7 +1367,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct btrfs_bio_ctrl *bio_ctrl, struct writeback_control *wbc, blk_opf_t opf, - bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, enum btrfs_compression_type compress_type) { @@ -3359,7 +1374,9 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct bio *bio; int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + ASSERT(bio_ctrl->end_io_func); + + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. 
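The hunks above delete the old two-step pattern (allocate, then patch bi_opf/bi_end_io afterwards) in favour of a btrfs_bio_alloc() that takes the op flags, end_io callback and private data up front, as its call sites elsewhere in this diff show. A sketch of what such an allocator plausibly looks like, assuming the btrfs_bioset backing kept from the removed code and a btrfs_bio_end_io_t callback type (both assumptions, not confirmed by this hunk):

struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
			    btrfs_bio_end_io_t end_io, void *private)
{
	struct bio *bio;
	struct btrfs_bio *bbio;

	ASSERT(0 < nr_vecs && nr_vecs <= BIO_MAX_VECS);
	/* Backed by a bioset, so this allocation does not fail. */
	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	/* Zero the btrfs_bio members in front of the embedded struct bio. */
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->end_io = end_io;
	bbio->private = private;
	return bio;
}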
@@ -3370,8 +1387,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; - bio->bi_end_io = end_io_func; - bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; @@ -3410,31 +1425,30 @@ static int alloc_new_bio(struct btrfs_inode *inode, return 0; error: bio_ctrl->bio = NULL; - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); return ret; } /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting - * @page: page to add to the bio * @disk_bytenr: logical bytenr where the write will be + * @page: page to add to the bio * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @bio_ret: must be valid pointer, newly allocated bio will be stored there - * @end_io_func: end_io callback for new bio - * @mirror_num: desired mirror to read/write - * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @compress_type: compress type for current bio + * + * This will either add the page into the existing @bio_ctrl->bio, or allocate a + * new one in @bio_ctrl->bio. + * The mirror number for this IO should already be initialized in + * @bio_ctrl->mirror_num. */ static int submit_extent_page(blk_opf_t opf, struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + u64 disk_bytenr, struct page *page, size_t size, unsigned long pg_offset, - bio_end_io_t end_io_func, enum btrfs_compression_type compress_type, bool force_bio_submit) { @@ -3446,6 +1460,9 @@ static int submit_extent_page(blk_opf_t opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); + + ASSERT(bio_ctrl->end_io_func); + if (force_bio_submit) submit_one_bio(bio_ctrl); @@ -3456,7 +1473,7 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - end_io_func, disk_bytenr, offset, + disk_bytenr, offset, page_offset(page) + cur, compress_type); if (ret < 0) @@ -3613,7 +1630,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 extent_offset; u64 last_byte = i_size_read(inode); u64 block_start; - u64 cur_end; struct extent_map *em; int ret = 0; size_t pg_offset = 0; @@ -3623,7 +1639,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { - unlock_extent(tree, start, end); + unlock_extent(tree, start, end, NULL); btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); goto out; @@ -3637,6 +1653,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, zero_offset, iosize); } } + bio_ctrl->end_io_func = end_bio_extent_readpage; begin_page_read(fs_info, page); while (cur <= end) { unsigned long this_bio_flag = 0; @@ -3645,21 +1662,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - struct extent_state *cached = NULL; - iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); -
unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); ret = PTR_ERR(em); break; @@ -3672,7 +1684,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); - cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; @@ -3729,49 +1740,33 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - struct extent_state *cached = NULL; - memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, - EXTENT_UPTODATE, 1, NULL)) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; - continue; - } - /* we have an inline extent but it didn't get marked up - * to date. Error out - */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + unlock_extent(tree, cur, cur + iosize - 1, NULL); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - bio_ctrl, page, disk_bytenr, iosize, - pg_offset, end_bio_extent_readpage, - this_bio_flag, force_bio_submit); + bio_ctrl, disk_bytenr, page, iosize, + pg_offset, this_bio_flag, + force_bio_submit); if (ret) { /* * We have to unlock the remaining range, or the page * will never be unlocked. */ - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); goto out; } @@ -3952,7 +1947,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, loff_t i_size, int *nr_ret) { @@ -3984,6 +1979,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ wbc->nr_to_write--; + bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -4077,10 +2073,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_page_clear_dirty(fs_info, page, cur, iosize); ret = submit_extent_page(op | write_flags, wbc, - &epd->bio_ctrl, page, - disk_bytenr, iosize, + bio_ctrl, disk_bytenr, + page, iosize, cur - page_offset(page), - end_bio_extent_writepage, 0, false); if (ret) { has_error = true; @@ -4118,7 +2113,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return <0 for error. 
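With end_io_func gone from the argument list, every caller of submit_extent_page() must stash the endio in the control structure first; the new ASSERT(bio_ctrl->end_io_func) enforces this. A hypothetical caller showing the convention (read_one_sector() and its values are illustrative only, not part of this patch):

static int read_one_sector(struct btrfs_inode *inode, struct page *page,
			   u64 disk_bytenr, u32 sectorsize)
{
	struct btrfs_bio_ctrl bio_ctrl = { 0 };
	int ret;

	/* Must be set before the first submit_extent_page() call. */
	bio_ctrl.end_io_func = end_bio_extent_readpage;
	ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
				 disk_bytenr, page, sectorsize, 0,
				 BTRFS_COMPRESS_NONE, false);
	submit_one_bio(&bio_ctrl);
	return ret;
}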
*/ static int __extent_writepage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; @@ -4155,7 +2150,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - if (!epd->extent_locked) { + if (!bio_ctrl->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc); if (ret == 1) return 0; @@ -4163,7 +2158,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, + ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -4207,9 +2202,9 @@ done: */ if (PageError(page)) end_extent_writepage(page, ret, page_start, page_end); - if (epd->extent_locked) { + if (bio_ctrl->extent_locked) { /* - * If epd->extent_locked, it's from extent_write_locked_range(), + * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), * the page can either be locked by lock_page() or * process_one_page(). * Let btrfs_page_unlock_writer() handle both cases. @@ -4248,7 +2243,7 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) * Return <0 if something went wrong, no page is locked. */ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; int i, num_pages; @@ -4256,17 +2251,17 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; btrfs_tree_lock(eb); } if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (!epd->sync_io) + if (!bio_ctrl->sync_io) return 0; if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } while (1) { @@ -4313,7 +2308,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } lock_page(p); @@ -4431,8 +2426,9 @@ static struct extent_buffer *find_extent_buffer_nolock( * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() * after all extent buffers in the page has finished their writeback. */ -static void end_bio_subpage_eb_writepage(struct bio *bio) +static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -4488,8 +2484,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) bio_put(bio); } -static void end_bio_extent_buffer_writepage(struct bio *bio) +static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; struct extent_buffer *eb; int done; @@ -4532,15 +2529,19 @@ static void prepare_eb_write(struct extent_buffer *eb) /* Set btree blocks beyond nritems with 0 to avoid stale content */ nritems = btrfs_header_nritems(eb); if (btrfs_header_level(eb) > 0) { - end = btrfs_node_key_ptr_offset(nritems); + end = btrfs_node_key_ptr_offset(eb, nritems); memzero_extent_buffer(eb, end, eb->len - end); } else { /* * Leaf: * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 */ - start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); + start = btrfs_item_nr_offset(eb, nritems); + end = btrfs_item_nr_offset(eb, 0); + if (nritems == 0) + end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); + else + end += btrfs_item_offset(eb, nritems - 1); memzero_extent_buffer(eb, start, end - start); } } @@ -4551,7 +2552,7 @@ static void prepare_eb_write(struct extent_buffer *eb) */ static int write_one_subpage_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; @@ -4571,10 +2572,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); + bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, false); + bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4596,7 +2598,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { u64 disk_bytenr = eb->start; int i, num_pages; @@ -4605,6 +2607,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, prepare_eb_write(eb); + bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4612,10 +2616,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, p, disk_bytenr, - PAGE_SIZE, 0, - end_bio_extent_buffer_writepage, - 0, false); + bio_ctrl, disk_bytenr, p, + PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4657,7 +2659,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, */ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; @@ -4710,7 +2712,7 @@ static int submit_eb_subpage(struct page *page, if (!eb) continue; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret == 0) { free_extent_buffer(eb); continue; @@ -4719,7 +2721,7 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_subpage_eb(eb, wbc, epd); + ret = write_one_subpage_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) goto cleanup; @@ -4729,7 +2731,7 @@ static int submit_eb_subpage(struct page *page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - submit_write_bio(epd, ret); + submit_write_bio(bio_ctrl, ret); return ret; } @@ -4754,7 +2756,7 @@ cleanup: * Return <0 for fatal error. 
*/ static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -4766,7 +2768,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc, epd); + return submit_eb_subpage(page, wbc, bio_ctrl); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -4809,7 +2811,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, *eb_context = eb; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret <= 0) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) @@ -4824,7 +2826,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - ret = write_one_eb(eb, wbc, epd); + ret = write_one_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) return ret; @@ -4835,10 +2837,9 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_buffer *eb_context = NULL; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; @@ -4881,7 +2882,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - ret = submit_eb_page(page, wbc, &epd, &eb_context); + ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -4942,18 +2943,18 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - submit_write_bio(&epd, ret); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_meta_io_unlock(fs_info); return ret; } -/** +/* * Walk the list of dirty pages of the given address space and write all of them. * - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @epd: holds context for the write, namely the bio + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @bio_ctrl: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, @@ -4965,7 +2966,7 @@ retry: */ static int extent_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = mapping->host; int ret = 0; @@ -5046,7 +3047,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); lock_page(page); } @@ -5057,7 +3058,7 @@ retry: if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); wait_on_page_writeback(page); } @@ -5067,7 +3068,7 @@ retry: continue; } - ret = __extent_writepage(page, wbc, epd); + ret = __extent_writepage(page, wbc, bio_ctrl); if (ret < 0) { done = 1; break; @@ -5097,14 +3098,14 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. 
*/ - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) mapping->writeback_index = done_index; - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } @@ -5123,8 +3124,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) u64 cur = start; unsigned long nr_pages; const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 1, .sync_io = 1, }; @@ -5155,7 +3155,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &wbc_writepages, &epd); + ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl); ASSERT(ret <= 0); if (ret < 0) { found_error = true; @@ -5165,7 +3165,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - submit_write_bio(&epd, found_error ? ret : 0); + submit_write_bio(&bio_ctrl, found_error ? ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -5178,10 +3178,9 @@ int extent_writepages(struct address_space *mapping, { struct inode *inode = mapping->host; int ret = 0; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; /* @@ -5189,8 +3188,8 @@ int extent_writepages(struct address_space *mapping, * protect the write pointer updates. */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); - ret = extent_write_cache_pages(mapping, wbc, &epd); - submit_write_bio(&epd, ret); + ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } @@ -5236,7 +3235,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, &cached_state); + lock_extent(tree, start, end, &cached_state); folio_wait_writeback(folio); /* @@ -5244,7 +3243,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent_cached(tree, start, end, &cached_state); + unlock_extent(tree, start, end, &cached_state); return 0; } @@ -5263,15 +3262,17 @@ static int try_release_extent_state(struct extent_io_tree *tree, if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { ret = 0; } else { + u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | + EXTENT_DELALLOC_NEW | EXTENT_CTLBITS); + /* * At this point we can safely clear everything except the * locked bit, the nodatasum bit and the delalloc new bit. * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), - 0, 0, NULL, mask, NULL); + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, + mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -5370,42 +3371,6 @@ next: } /* - * helper function for fiemap, which doesn't want to see any holes. 
- * This maps until we find something past 'last' - */ -static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, - u64 offset, u64 last) -{ - u64 sectorsize = btrfs_inode_sectorsize(inode); - struct extent_map *em; - u64 len; - - if (offset >= last) - return NULL; - - while (1) { - len = last - offset; - if (len == 0) - break; - len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR(em)) - return em; - - /* if this isn't a hole return it */ - if (em->block_start != EXTENT_MAP_HOLE) - return em; - - /* this is a hole, advance to the next extent */ - offset = extent_map_end(em); - free_extent_map(em); - if (offset >= last) - break; - } - return NULL; -} - -/* * To cache previous fiemap extent * * Will be used for merging fiemap extent @@ -5434,6 +3399,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, { int ret = 0; + /* Set at the end of extent_fiemap(). */ + ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); + if (!cache->cached) goto assign; @@ -5457,16 +3425,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * So truly compressed (physical size smaller than logical size) * extents won't get merged with each other * - * 3) Share same flags except FIEMAP_EXTENT_LAST - * So regular extent won't get merged with prealloc extent + * 3) Share same flags */ if (cache->offset + cache->len == offset && cache->phys + cache->len == phys && - (cache->flags & ~FIEMAP_EXTENT_LAST) == - (flags & ~FIEMAP_EXTENT_LAST)) { + cache->flags == flags) { cache->len += len; - cache->flags |= flags; - goto try_submit_last; + return 0; } /* Not mergeable, need to submit cached one */ @@ -5481,13 +3446,8 @@ assign: cache->phys = phys; cache->len = len; cache->flags = flags; -try_submit_last: - if (cache->flags & FIEMAP_EXTENT_LAST) { - ret = fiemap_fill_next_extent(fieinfo, cache->offset, - cache->phys, cache->len, cache->flags); - cache->cached = false; - } - return ret; + + return 0; } /* @@ -5517,218 +3477,532 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) +static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) { - int ret = 0; - u64 off; - u64 max = start + len; - u32 flags = 0; - u32 found_type; - u64 last; - u64 last_for_get_extent = 0; - u64 disko = 0; - u64 isize = i_size_read(&inode->vfs_inode); - struct btrfs_key found_key; - struct extent_map *em = NULL; - struct extent_state *cached_state = NULL; - struct btrfs_path *path; - struct btrfs_root *root = inode->root; - struct fiemap_cache cache = { 0 }; - struct ulist *roots; - struct ulist *tmp_ulist; - int end = 0; - u64 em_start = 0; - u64 em_len = 0; - u64 em_end = 0; - - if (len == 0) - return -EINVAL; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; - path = btrfs_alloc_path(); - if (!path) + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) + return 0; + + ret = btrfs_next_leaf(inode->root, path); + if (ret != 0) + return ret; + + /* + * Don't bother with cloning if there are no more file extent items for + * our inode. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + + /* See the comment at fiemap_search_slot() about why we clone. 
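emit_fiemap_extent() now merges a new extent into the cached one only when the two are both logically and physically contiguous and carry identical flags; the FIEMAP_EXTENT_LAST special case is gone because that flag is applied exactly once, at the end of extent_fiemap(). The merge test condenses to the following (a restatement of the condition above, with a hypothetical helper name):

static bool fiemap_cache_mergeable(const struct fiemap_cache *cache,
				   u64 offset, u64 phys, u32 flags)
{
	return cache->cached &&
	       cache->offset + cache->len == offset &&	/* logically contiguous */
	       cache->phys + cache->len == phys &&	/* physically contiguous */
	       cache->flags == flags;
}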
*/ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) return -ENOMEM; - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!roots || !tmp_ulist) { - ret = -ENOMEM; - goto out_free_ulist; + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Search for the first file extent item that starts at a given file offset or + * the one that starts immediately before that offset. + * Returns: 0 on success, < 0 on error, 1 if not found. + */ +static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, + u64 file_offset) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = file_offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret != 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; } /* - * We can't initialize that to 'start' as this could miss extents due - * to extent item merging + * We clone the leaf and use it during fiemap. This is because while + * using the leaf we do expensive things like checking if an extent is + * shared, which can take a long time. In order to prevent blocking + * other tasks for too long, we use a clone of the leaf. We have locked + * the file range in the inode's io tree, so we know none of our file + * extent items can change. This way we avoid blocking other tasks that + * want to insert items for other inodes in the same leaf or b+tree + * rebalance operations (triggered for example when someone is trying + * to push items into this leaf when trying to insert an item in a + * neighbour leaf). + * We also need the private clone because holding a read lock on an + * extent buffer of the subvolume's b+tree will make lockdep unhappy + * when we call fiemap_fill_next_extent(), because that may cause a page + * fault when filling the user space buffer with fiemap data. */ - off = 0; - start = round_down(start, btrfs_inode_sectorsize(inode)); - len = round_up(max, btrfs_inode_sectorsize(inode)) - start; + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Process a range which is a hole or a prealloc extent in the inode's subvolume + * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc + * extent. The end offset (@end) is inclusive. 
+ */ +static int fiemap_process_hole(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + struct extent_state **delalloc_cached_state, + struct btrfs_backref_share_check_ctx *backref_ctx, + u64 disk_bytenr, u64 extent_offset, + u64 extent_gen, + u64 start, u64 end) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + u64 cur_offset = start; + u64 last_delalloc_end = 0; + u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; + bool checked_extent_shared = false; + int ret; /* - * lookup the last file extent. We're not using i_size here - * because there might be preallocation past i_size + * There can be no delalloc past i_size, so don't waste time looking for + * it beyond i_size. */ - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, - 0); - if (ret < 0) { - goto out_free_ulist; - } else { - WARN_ON(!ret); - if (ret == 1) - ret = 0; - } + while (cur_offset < end && cur_offset < i_size) { + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; + u64 prealloc_len = 0; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = found_key.type; - - /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(inode) || - found_type != BTRFS_EXTENT_DATA_KEY) { - /* have to trust i_size as the end */ - last = (u64)-1; - last_for_get_extent = isize; - } else { /* - * remember the start of the last extent. There are a - * bunch of different factors that go into the length of the - * extent, so its much less complex to remember where it started + * If this is a prealloc extent we have to report every section + * of it that has no delalloc. */ - last = found_key.offset; - last_for_get_extent = last + 1; + if (disk_bytenr != 0) { + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = delalloc_start - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = delalloc_start - prealloc_start; + } + } + + if (prealloc_len > 0) { + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + + checked_extent_shared = true; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + extent_offset += prealloc_len; + } + + ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, + FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ret) + return ret; + + last_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + extent_offset += cur_offset - delalloc_start; + cond_resched(); } - btrfs_release_path(path); /* - * we might have some extents allocated but more delalloc past those - * extents. so, we trust isize unless the start of the last extent is - * beyond isize + * Either we found no delalloc for the whole prealloc extent or we have + * a prealloc extent that spans i_size or starts at or after i_size. 
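For a prealloc extent with dirty pages over part of it, the loop above therefore emits alternating UNWRITTEN and DELALLOC ranges, advancing extent_offset across both. A worked example with hypothetical numbers (a 1 MiB prealloc extent at file offset 0, delalloc over [256K, 512K)):

/*
 * Emitted fiemap entries, assuming disk_bytenr = D:
 *
 *   [0, 256K)    FIEMAP_EXTENT_UNWRITTEN            phys = D
 *   [256K, 512K) FIEMAP_EXTENT_DELALLOC | UNKNOWN   phys = 0
 *   [512K, 1M)   FIEMAP_EXTENT_UNWRITTEN            phys = D + 512K
 *
 * The first two come from the loop; the tail comes from the trailing
 * "last_delalloc_end < end" block below.
 */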
+ */ + if (disk_bytenr != 0 && last_delalloc_end < end) { + u64 prealloc_start; + u64 prealloc_len; + + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = end + 1 - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = end + 1 - prealloc_start; + } + + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + } + + return 0; +} + +static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + u64 *last_extent_end_ret) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 disk_bytenr; + int ret; + + /* + * Lookup the last file extent. We're not using i_size here because + * there might be preallocation past i_size. + */ + ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); + /* There can't be a file extent item at offset (u64)-1 */ + ASSERT(ret != 0); + if (ret < 0) + return ret; + + /* + * For a non-existing key, btrfs_search_slot() always leaves us at a + * slot > 0, except if the btree is empty, which is impossible because + * at least it has the inode item for this inode and all the items for + * the root inode 256. */ - if (last < isize) { - last = (u64)-1; - last_for_get_extent = isize; + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* No file extent items in the subvolume tree. */ + *last_extent_end_ret = 0; + return 0; } - lock_extent_bits(&inode->io_tree, start, start + len - 1, - &cached_state); + /* + * For an inline extent, the disk_bytenr is where inline data starts at, + * so first check if we have an inline extent item before checking if we + * have an implicit hole (disk_bytenr == 0). + */ + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; + } - em = get_extent_skip_holes(inode, start, last_for_get_extent); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); + /* + * Find the last file extent item that is not a hole (when NO_HOLES is + * not enabled). This should take at most 2 iterations in the worst + * case: we have one hole file extent item at slot 0 of a leaf and + * another hole file extent item as the last item in the previous leaf. + * This is because we merge file extent items that represent holes. + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + while (disk_bytenr == 0) { + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* No file extent items that are not holes. 
*/ + *last_extent_end_ret = 0; + return 0; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + } + + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; +} + +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; + struct extent_state *delalloc_cached_state = NULL; + struct btrfs_path *path; + struct fiemap_cache cache = { 0 }; + struct btrfs_backref_share_check_ctx *backref_ctx; + u64 last_extent_end; + u64 prev_extent_end; + u64 lockstart; + u64 lockend; + bool stopped = false; + int ret; + + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + path = btrfs_alloc_path(); + if (!backref_ctx || !path) { + ret = -ENOMEM; goto out; } - while (!end) { - u64 offset_in_extent = 0; + lockstart = round_down(start, inode->root->fs_info->sectorsize); + lockend = round_up(start + len, inode->root->fs_info->sectorsize); + prev_extent_end = lockstart; - /* break if the extent we found is outside the range */ - if (em->start >= max || extent_map_end(em) < off) - break; + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - /* - * get_extent may return an extent that starts before our - * requested range. We have to make sure the ranges - * we return to fiemap always move forward and don't - * overlap, so adjust the offsets here - */ - em_start = max(em->start, off); + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); + if (ret < 0) + goto out_unlock; + btrfs_release_path(path); + path->reada = READA_FORWARD; + ret = fiemap_search_slot(inode, path, lockstart); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { /* - * record the offset from the start of the extent - * for adjusting the disk offset below. Only do this if the - * extent isn't compressed since our in ram offset may be past - * what we have actually allocated on disk. + * No file extent item found, but we may have delalloc between + * the current offset and i_size. So check for that. */ - if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - offset_in_extent = em_start - em->start; - em_end = extent_map_end(em); - em_len = em_end - em_start; - flags = 0; - if (em->block_start < EXTENT_MAP_LAST_BYTE) - disko = em->block_start + offset_in_extent; - else - disko = 0; + ret = 0; + goto check_eof_delalloc; + } + + while (prev_extent_end < lockend) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 extent_end; + u64 extent_len; + u64 extent_offset = 0; + u64 extent_gen; + u64 disk_bytenr = 0; + u64 flags = 0; + int extent_type; + u8 compression; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + extent_end = btrfs_file_extent_end(path); /* - * bump off for our next call to get_extent + * The first iteration can leave us at an extent item that ends + * before our range's start. Move to the next item. 
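The end-to-end behaviour of the rewritten extent_fiemap() is easiest to check from userspace. A minimal, self-contained probe (ordinary userspace C, not part of this patch) that prints the extents the kernel emits:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* Room for 32 extents; real tools loop until FIEMAP_EXTENT_LAST. */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu phys %llu len %llu flags 0x%x\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       fe->fe_flags);
	}
	free(fm);
	close(fd);
	return 0;
}

Without FIEMAP_FLAG_SYNC, delalloc ranges show up with a zero physical address and the DELALLOC|UNKNOWN flags, exactly as fiemap_process_hole() reports them.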
*/ - off = extent_map_end(em); - if (off >= max) - end = 1; - - if (em->block_start == EXTENT_MAP_LAST_BYTE) { - end = 1; - flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_INLINE) { - flags |= (FIEMAP_EXTENT_DATA_INLINE | - FIEMAP_EXTENT_NOT_ALIGNED); - } else if (em->block_start == EXTENT_MAP_DELALLOC) { - flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - } else if (fieinfo->fi_extents_max) { - u64 bytenr = em->block_start - - (em->start - em->orig_start); + if (extent_end <= lockstart) + goto next_item; - /* - * As btrfs supports shared space, this information - * can be exported to userspace tools via - * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 - * then we're just getting a count and we can skip the - * lookup stuff. - */ - ret = btrfs_check_shared(root, btrfs_ino(inode), - bytenr, roots, tmp_ulist); - if (ret < 0) - goto out_free; - if (ret) - flags |= FIEMAP_EXTENT_SHARED; - ret = 0; + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* We have an implicit hole (NO_HOLES feature enabled). */ + if (prev_extent_end < key.offset) { + const u64 range_end = min(key.offset, lockend) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, + prev_extent_end, range_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + /* We've reached the end of the fiemap range, stop. */ + if (key.offset >= lockend) { + stopped = true; + break; + } } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + + extent_len = extent_end - key.offset; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + compression = btrfs_file_extent_compression(leaf, ei); + extent_type = btrfs_file_extent_type(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (compression == BTRFS_COMPRESS_NONE) + extent_offset = btrfs_file_extent_offset(leaf, ei); + } + + if (compression != BTRFS_COMPRESS_NONE) flags |= FIEMAP_EXTENT_ENCODED; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - free_extent_map(em); - em = NULL; - if ((em_start >= last) || em_len == (u64)-1 || - (last == (u64)-1 && isize <= em_end)) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + flags |= FIEMAP_EXTENT_DATA_INLINE; + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, + extent_len, flags); + } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, + disk_bytenr, extent_offset, + extent_gen, key.offset, + extent_end - 1); + } else if (disk_bytenr == 0) { + /* We have an explicit hole. */ + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, + key.offset, extent_end - 1); + } else { + /* We have a regular extent. */ + if (fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + goto out_unlock; + else if (ret > 0) + flags |= FIEMAP_EXTENT_SHARED; + } + + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, + disk_bytenr + extent_offset, + extent_len, flags); } - /* now scan forward to see if this is really the last extent.
*/ - em = get_extent_skip_holes(inode, off, last_for_get_extent); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; } - if (!em) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + + prev_extent_end = extent_end; +next_item: + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out_unlock; } - ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, - em_len, flags); - if (ret) { - if (ret == 1) - ret = 0; - goto out_free; + + ret = fiemap_next_leaf_item(inode, path); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* No more file extent items for this inode. */ + break; } + cond_resched(); } -out_free: - if (!ret) - ret = emit_last_fiemap_cache(fieinfo, &cache); - free_extent_map(em); -out: - unlock_extent_cached(&inode->io_tree, start, start + len - 1, - &cached_state); -out_free_ulist: +check_eof_delalloc: + /* + * Release (and free) the path before emitting any final entries to + * fiemap_fill_next_extent() to keep lockdep happy. This is because + * once we find no more file extent items exist, we may have a + * non-cloned leaf, and fiemap_fill_next_extent() can trigger page + * faults when copying data to the user space buffer. + */ + btrfs_free_path(path); + path = NULL; + + if (!stopped && prev_extent_end < lockend) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, + 0, 0, 0, prev_extent_end, lockend - 1); + if (ret < 0) + goto out_unlock; + prev_extent_end = lockend; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_cached_state, + &delalloc_start, + &delalloc_end); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { + cache.flags |= FIEMAP_EXTENT_LAST; + } + } + + ret = emit_last_fiemap_cache(fieinfo, &cache); + +out_unlock: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); +out: + free_extent_state(delalloc_cached_state); + btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); - ulist_free(roots); - ulist_free(tmp_ulist); return ret; } @@ -5856,7 +4130,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); __free_extent_buffer(eb); } @@ -5870,11 +4144,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->start = start; eb->len = len; eb->fs_info = fs_info; - eb->bflags = 0; init_rwsem(&eb->lock); - btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, - &fs_info->allocated_ebs); + btrfs_leak_debug_add_eb(eb); INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); @@ -5904,7 +4176,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - memset(new->pages, 0, sizeof(*new->pages) * num_pages); ret = btrfs_alloc_page_array(num_pages, new->pages); if (ret) { btrfs_release_extent_buffer(new); @@ -6342,7 +4613,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } - 
btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -6362,18 +4633,16 @@ static int release_extent_buffer(struct extent_buffer *eb) void free_extent_buffer(struct extent_buffer *eb) { int refs; - int old; if (!eb) return; + refs = atomic_read(&eb->refs); while (1) { - refs = atomic_read(&eb->refs); if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs == 1)) break; - old = atomic_cmpxchg(&eb->refs, refs, refs - 1); - if (old == refs) + if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) return; } @@ -6551,11 +4820,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, - int mirror_num) + int mirror_num, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, }; @@ -6563,13 +4834,16 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); ASSERT(PagePrivate(page)); + ASSERT(check); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { - if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); if (ret < 0) return ret; } @@ -6579,7 +4853,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); return ret; } @@ -6587,13 +4862,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, eb->read_mirror = 0; atomic_set(&eb->io_pages, 1); check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, - page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_extent_readpage, 0, true); + eb->start, page, eb->len, + eb->start - page_offset(page), 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6602,17 +4878,22 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) + if (ret || wait != WAIT_COMPLETE) { + free_extent_state(cached_state); return ret; + } - wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, + EXTENT_LOCKED, &cached_state); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) ret = -EIO; return ret; } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int 
mirror_num) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *check) { int i; struct page *page; @@ -6638,7 +4919,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) return -EIO; if (eb->fs_info->nodesize < PAGE_SIZE) - return read_extent_buffer_subpage(eb, wait, mirror_num); + return read_extent_buffer_subpage(eb, wait, mirror_num, check); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -6684,6 +4965,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) * set io_pages. See check_buffer_tree_ref for a more detailed comment. */ check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; for (i = 0; i < num_pages; i++) { page = eb->pages[i]; @@ -6696,9 +4978,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) ClearPageError(page); err = submit_extent_page(REQ_OP_READ, NULL, - &bio_ctrl, page, page_offset(page), - PAGE_SIZE, 0, end_bio_extent_readpage, - 0, false); + &bio_ctrl, page_offset(page), page, + PAGE_SIZE, 0, 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6715,6 +4996,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) @@ -7073,11 +5355,12 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, *page_offset = offset_in_page(offset); } -/** - * extent_buffer_test_bit - determine whether a bit in a bitmap item is set - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number to test +/* + * Determine whether a bit in a bitmap item is set. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test */ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) @@ -7094,12 +5377,13 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } -/** - * extent_buffer_bitmap_set - set an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to set +/* + * Set an area of a bitmap to 1. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set */ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) @@ -7136,12 +5420,13 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star } -/** - * extent_buffer_bitmap_clear - clear an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to clear +/* + * Clear an area of a bitmap. 
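The free_extent_buffer() hunk above replaces an open-coded atomic_cmpxchg() retry loop with atomic_try_cmpxchg(), which writes the value it observed back into its 'expected' argument on failure, so the loop no longer re-reads the counter on every pass. A minimal userspace sketch of the same pattern, using C11 atomics rather than the kernel's atomic_t API:

	#include <stdatomic.h>
	#include <stdbool.h>

	/*
	 * Drop one reference unless it is the last; mirrors the loop shape in
	 * free_extent_buffer(). atomic_compare_exchange_weak() reloads
	 * 'expected' on failure, so no explicit re-read is needed.
	 */
	static bool put_ref_unless_last(atomic_int *refs)
	{
		int expected = atomic_load(refs);

		while (expected > 1) {
			if (atomic_compare_exchange_weak(refs, &expected,
							 expected - 1))
				return true;	/* dropped one reference */
			/* contention: 'expected' now holds the fresh value */
		}
		return false;	/* last reference: caller takes the release path */
	}

The gain is one atomic load per contention round instead of two, which is why the conversion is worthwhile even though the logic is unchanged.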
+ * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear */ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, @@ -7447,6 +5732,11 @@ int try_release_extent_buffer(struct page *page) void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .has_first_key = 0, + .level = level, + .transid = gen + }; struct extent_buffer *eb; int ret; @@ -7459,7 +5749,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); if (ret < 0) free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4bc72a87b9a9..a2c82448b2e0 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -9,6 +9,7 @@ #include <linux/btrfs_tree.h> #include "compression.h" #include "ulist.h" +#include "misc.h" enum { EXTENT_BUFFER_UPTODATE, @@ -29,13 +30,15 @@ enum { }; /* these are flags for __process_pages_contig */ -#define PAGE_UNLOCK (1 << 0) -/* Page starts writeback, clear dirty bit and set writeback bit */ -#define PAGE_START_WRITEBACK (1 << 1) -#define PAGE_END_WRITEBACK (1 << 2) -#define PAGE_SET_ORDERED (1 << 3) -#define PAGE_SET_ERROR (1 << 4) -#define PAGE_LOCK (1 << 5) +enum { + ENUM_BIT(PAGE_UNLOCK), + /* Page starts writeback, clear dirty bit and set writeback bit */ + ENUM_BIT(PAGE_START_WRITEBACK), + ENUM_BIT(PAGE_END_WRITEBACK), + ENUM_BIT(PAGE_SET_ORDERED), + ENUM_BIT(PAGE_SET_ERROR), + ENUM_BIT(PAGE_LOCK), +}; /* * page->private values. Every page that is controlled by the extent @@ -60,17 +63,13 @@ enum { struct btrfs_bio; struct btrfs_root; struct btrfs_inode; -struct btrfs_io_bio; struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; +struct btrfs_tree_parent_check; -typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type); - -typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, - struct bio *bio, u64 dio_file_offset); +int __init extent_buffer_init_cachep(void); +void __cold extent_buffer_free_cachep(void); #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { @@ -97,6 +96,39 @@ struct extent_buffer { }; /* + * Get the correct offset inside the page of extent buffer. + * + * @eb: target extent buffer + * @start: offset inside the extent buffer + * + * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. + */ +static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, + unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, eb->start will always be aligned + * to PAGE_SIZE, thus adding it won't cause any difference. + * + * For sectorsize < PAGE_SIZE, we must only read the data that belongs + * to the eb, thus we have to take the eb->start into consideration. + */ + return offset_in_page(offset + eb->start); +} + +static inline unsigned long get_eb_page_index(unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * + * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, + * and have ensured that all tree blocks are contained in one page, + * thus we always get index == 0. 
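To make the subpage arithmetic concrete, here is a standalone check of the same math with assumed numbers (64K pages, a 16K tree block that starts in the middle of its page); the constants are illustrative, not taken from the kernel:

	#include <assert.h>

	#define PAGE_SHIFT	16			/* assumed 64K pages */
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	static unsigned long offset_in_page(unsigned long off)
	{
		return off & (PAGE_SIZE - 1);
	}

	int main(void)
	{
		unsigned long eb_start = 0x34000;	/* tree block mid-page */
		unsigned long offset = 0x1000;		/* offset inside the eb */

		/* get_eb_offset_in_page(): eb->start shifts the page offset. */
		assert(offset_in_page(offset + eb_start) == 0x5000);
		/* get_eb_page_index(): on 64K pages the block fits in page 0. */
		assert((offset >> PAGE_SHIFT) == 0);
		return 0;
	}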
+ */ + return offset >> PAGE_SHIFT; +} + +/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { @@ -172,8 +204,8 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, - int mirror_num); +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *parent_check); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); @@ -240,13 +272,12 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); -struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we @@ -257,8 +288,12 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); * bio end_io callback is called to indicate things have failed. */ struct io_failure_record { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; struct page *page; - u64 start; u64 len; u64 logical; int this_mirror; @@ -266,9 +301,12 @@ struct io_failure_record { int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook); + bool submit_buffered); +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); +int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6fee14ce2e6b..be94030e1dfb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,10 +3,12 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include "messages.h" #include "ctree.h" #include "volumes.h" #include "extent_map.h" #include "compression.h" +#include "btrfs_inode.h" static struct kmem_cache *extent_map_cache; @@ -26,12 +28,9 @@ void __cold extent_map_exit(void) kmem_cache_destroy(extent_map_cache); } -/** - * extent_map_tree_init - initialize extent map tree - * @tree: tree to initialize - * - * Initialize the extent tree @tree. Should be called for each new inode - * or other user of the extent_map interface. +/* + * Initialize the extent tree @tree. Should be called for each new inode or + * other user of the extent_map interface. 
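The reference rules spelled out for alloc_extent_map()/free_extent_map() in the hunks that follow boil down to a conventional counted-object lifetime: allocation hands back one reference, the tree holds its own, and every lookup reference must be returned. A small userspace model, with hypothetical em_alloc()/em_put() names:

	#include <stdlib.h>

	struct em { int refs; };

	static struct em *em_alloc(void)
	{
		struct em *em = calloc(1, sizeof(*em));

		if (em)
			em->refs = 1;		/* caller's reference */
		return em;
	}

	static void em_put(struct em *em)
	{
		if (em && --em->refs == 0)
			free(em);
	}

Inserting into the tree would take an extra reference (em->refs++), so removal and the caller's lookup each drop one, matching the "once for the tree, once for us" pairs seen later in this diff.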
*/ void extent_map_tree_init(struct extent_map_tree *tree) { @@ -40,12 +39,9 @@ void extent_map_tree_init(struct extent_map_tree *tree) rwlock_init(&tree->lock); } -/** - * alloc_extent_map - allocate new extent map structure - * - * Allocate a new extent_map structure. The new structure is - * returned with a reference count of one and needs to be - * freed using free_extent_map() +/* + * Allocate a new extent_map structure. The new structure is returned with a + * reference count of one and needs to be freed using free_extent_map() */ struct extent_map *alloc_extent_map(void) { @@ -54,26 +50,20 @@ struct extent_map *alloc_extent_map(void) if (!em) return NULL; RB_CLEAR_NODE(&em->rb_node); - em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; - em->generation = 0; refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; } -/** - * free_extent_map - drop reference count of an extent_map - * @em: extent map being released - * - * Drops the reference out on @em by one and free the structure - * if the reference count hits zero. +/* + * Drop the reference out on @em by one and free the structure if the reference + * count hits zero. */ void free_extent_map(struct extent_map *em) { if (!em) return; - WARN_ON(refcount_read(&em->refs) == 0); if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); @@ -83,7 +73,7 @@ void free_extent_map(struct extent_map *em) } } -/* simple helper to do math around the end of an extent, handling wrap */ +/* Do the math around the end of an extent, handling wrapping. */ static u64 range_end(u64 start, u64 len) { if (start + len < start) @@ -139,12 +129,11 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) } /* - * search through the tree for an extent_map with a given offset. If - * it can't be found, try to find some neighboring extents + * Search through the tree for an extent_map with a given offset. If it can't + * be found, try to find some neighboring extents */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_or_next_ret) { struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; @@ -152,6 +141,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct extent_map *entry; struct extent_map *prev_entry = NULL; + ASSERT(prev_or_next_ret); + while (n) { entry = rb_entry(n, struct extent_map, rb_node); prev = n; @@ -165,28 +156,33 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return n; } - if (prev_ret) { - orig_prev = prev; - while (prev && offset >= extent_map_end(prev_entry)) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct extent_map, rb_node); - } - *prev_ret = prev; - prev = orig_prev; + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + + /* + * Previous extent map found, return as in this case the caller does not + * care about the next one. 
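The reworked __tree_search() in this hunk prefers the next extent map after the offset and only falls back to the closest preceding one. A flat-array model of that policy (sorted half-open [start, end) ranges, hypothetical helper name):

	/*
	 * Return the index of the map containing 'off'; otherwise record the
	 * best neighbour: the first map after 'off', or failing that the
	 * last map before it.
	 */
	static int map_search(const unsigned long *start, const unsigned long *end,
			      int n, unsigned long off, int *prev_or_next)
	{
		*prev_or_next = -1;
		for (int i = 0; i < n; i++) {
			if (off >= start[i] && off < end[i])
				return i;		/* exact containment */
			if (end[i] <= off)
				*prev_or_next = i;	/* last map before off */
			if (start[i] > off) {
				*prev_or_next = i;	/* first map after off */
				break;
			}
		}
		return -1;
	}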
+ */ + if (prev) { + *prev_or_next_ret = prev; + return NULL; } - if (next_ret) { + prev = orig_prev; + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct extent_map, rb_node); - } - *next_ret = prev; } + *prev_or_next_ret = prev; + return NULL; } -/* check to see if two extent_map structs are adjacent and safe to merge */ +/* Check to see if two extent_map structs are adjacent and safe to merge. */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -284,8 +280,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) } } -/** - * unpin_extent_cache - unpin an extent from the cache +/* + * Unpin an extent from the cache. + * * @tree: tree to unpin the extent in * @start: logical offset in the file * @len: length of the extent @@ -336,6 +333,8 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { + lockdep_assert_held_write(&tree->lock); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); if (extent_map_in_tree(em)) try_merge_map(tree, em); @@ -382,11 +381,11 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) __clear_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + stripe_size - 1, bits, - 0, 0, NULL, GFP_NOWAIT, NULL); + NULL, GFP_NOWAIT, NULL); } } -/** +/* * Add new extent map to the extent tree * * @tree: tree to insert new map in @@ -425,16 +424,13 @@ __lookup_extent_mapping(struct extent_map_tree *tree, { struct extent_map *em; struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; + struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next); + rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next); if (!rb_node) { - if (prev) - rb_node = prev; - else if (next) - rb_node = next; + if (prev_or_next) + rb_node = prev_or_next; else return NULL; } @@ -448,8 +444,9 @@ __lookup_extent_mapping(struct extent_map_tree *tree, return em; } -/** - * lookup_extent_mapping - lookup extent_map +/* + * Lookup extent_map that intersects @start + @len range. + * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -465,8 +462,9 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 1); } -/** - * search_extent_mapping - find a nearby extent map +/* + * Find a nearby extent map intersecting @start + @len (not an exact search). + * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -482,13 +480,14 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 0); } -/** - * remove_extent_mapping - removes an extent_map from the extent tree +/* + * Remove an extent_map from the extent tree. + * * @tree: extent tree to remove from * @em: extent map being removed * - * Removes @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use + * Remove @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use. 
*/ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { @@ -520,7 +519,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, setup_extent_mapping(tree, new, modified); } -static struct extent_map *next_extent_map(struct extent_map *em) +static struct extent_map *next_extent_map(const struct extent_map *em) { struct rb_node *next; @@ -582,8 +581,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, return add_extent_mapping(em_tree, em, 0); } -/** - * Add extent mapping into em_tree +/* + * Add extent mapping into em_tree. * * @fs_info: the filesystem * @em_tree: extent tree into which we want to insert the extent mapping @@ -610,6 +609,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, int ret; struct extent_map *em = *em_in; + /* + * Tree-checker should have rejected any inline extent with non-zero + * file offset. Here just do a sanity check. + */ + if (em->block_start == EXTENT_MAP_INLINE) + ASSERT(em->start == 0); + ret = add_extent_mapping(em_tree, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that @@ -658,3 +664,293 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, ASSERT(ret == 0 || ret == -EEXIST); return ret; } + +/* + * Drop all extent maps from a tree in the fastest possible way, rescheduling + * if needed. This avoids searching the tree, from the root down to the first + * extent map, before each deletion. + */ +static void drop_all_extent_maps_fast(struct extent_map_tree *tree) +{ + write_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { + struct extent_map *em; + struct rb_node *node; + + node = rb_first_cached(&tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(tree, em); + free_extent_map(em); + cond_resched_rwlock_write(&tree->lock); + } + write_unlock(&tree->lock); +} + +/* + * Drop all extent maps in a given range. + * + * @inode: The target inode. + * @start: Start offset of the range. + * @end: End offset of the range (inclusive value). + * @skip_pinned: Indicate if pinned extent maps should be ignored or not. + * + * This drops all the extent maps that intersect the given range [@start, @end]. + * Extent maps that partially overlap the range and extend behind or beyond it, + * are split. + * The caller should have locked an appropriate file range in the inode's io + * tree before calling this function. + */ +void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, + bool skip_pinned) +{ + struct extent_map *split; + struct extent_map *split2; + struct extent_map *em; + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 len = end - start + 1; + + WARN_ON(end < start); + if (end == (u64)-1) { + if (start == 0 && !skip_pinned) { + drop_all_extent_maps_fast(em_tree); + return; + } + len = (u64)-1; + } else { + /* Make end offset exclusive for use in the loop below. */ + end++; + } + + /* + * It's ok if we fail to allocate the extent maps, see the comment near + * the bottom of the loop below. We only need two spare extent maps in + * the worst case, where the first extent map that intersects our range + * starts before the range and the last extent map that intersects our + * range ends after our range (and they might be the same extent map), + * because we need to split those two extent maps at the boundaries. 
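The "two spare extent maps in the worst case" claim in the comment above follows from simple interval arithmetic: dropping [start, end) from an overlapping map can leave at most a front piece and a tail piece. A sketch over half-open ranges, with a hypothetical split_around() helper:

	struct piece { unsigned long long start, len; };

	static int split_around(unsigned long long em_start,
				unsigned long long em_end,
				unsigned long long start,
				unsigned long long end_excl,
				struct piece out[2])
	{
		int n = 0;

		if (em_start < start)		/* front remainder survives */
			out[n++] = (struct piece){ em_start, start - em_start };
		if (em_end > end_excl)		/* tail remainder survives */
			out[n++] = (struct piece){ end_excl, em_end - end_excl };
		return n;	/* replacement maps needed: 0, 1 or 2 */
	}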
+ */ + split = alloc_extent_map(); + split2 = alloc_extent_map(); + + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + + while (em) { + /* extent_map_end() returns exclusive value (last byte + 1). */ + const u64 em_end = extent_map_end(em); + struct extent_map *next_em = NULL; + u64 gen; + unsigned long flags; + bool modified; + bool compressed; + + if (em_end < end) { + next_em = next_extent_map(em); + if (next_em) { + if (next_em->start < end) + refcount_inc(&next_em->refs); + else + next_em = NULL; + } + } + + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + start = em_end; + if (end != (u64)-1) + len = start + len - em_end; + goto next; + } + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &flags); + modified = !list_empty(&em->list); + + /* + * The extent map does not cross our target range, so no need to + * split it, we can remove it directly. + */ + if (em->start >= start && em_end <= end) + goto remove_em; + + flags = em->flags; + gen = em->generation; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + if (em->start < start) { + if (!split) { + split = split2; + split2 = NULL; + if (!split) + goto remove_em; + } + split->start = em->start; + split->len = start - em->start; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + + split->generation = gen; + split->flags = flags; + split->compress_type = em->compress_type; + replace_extent_mapping(em_tree, em, split, modified); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em_end > end) { + if (!split) { + split = split2; + split2 = NULL; + if (!split) + goto remove_em; + } + split->start = start + len; + split->len = em_end - (start + len); + split->block_start = em->block_start; + split->flags = flags; + split->compress_type = em->compress_type; + split->generation = gen; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, + em->orig_block_len); + + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->orig_start = em->orig_start; + } else { + const u64 diff = start + len - em->start; + + split->block_len = split->len; + split->block_start += diff; + split->orig_start = em->orig_start; + } + } else { + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->orig_block_len = 0; + } + + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + int ret; + + ret = add_extent_mapping(em_tree, split, + modified); + /* Logic error, shouldn't happen. 
*/ + ASSERT(ret == 0); + if (WARN_ON(ret != 0) && modified) + btrfs_set_inode_full_sync(inode); + } + free_extent_map(split); + split = NULL; + } +remove_em: + if (extent_map_in_tree(em)) { + /* + * If the extent map is still in the tree it means that + * either of the following is true: + * + * 1) It fits entirely in our range (doesn't end beyond + * it or starts before it); + * + * 2) It starts before our range and/or ends after our + * range, and we were not able to allocate the extent + * maps for split operations, @split and @split2. + * + * If we are at case 2) then we just remove the entire + * extent map - this is fine since if anyone needs it to + * access the subranges outside our range, will just + * load it again from the subvolume tree's file extent + * item. However if the extent map was in the list of + * modified extents, then we must mark the inode for a + * full fsync, otherwise a fast fsync will miss this + * extent if it's new and needs to be logged. + */ + if ((em->start < start || em_end > end) && modified) { + ASSERT(!split); + btrfs_set_inode_full_sync(inode); + } + remove_extent_mapping(em_tree, em); + } + + /* + * Once for the tree reference (we replaced or removed the + * extent map from the tree). + */ + free_extent_map(em); +next: + /* Once for us (for our lookup reference). */ + free_extent_map(em); + + em = next_em; + } + + write_unlock(&em_tree->lock); + + free_extent_map(split); + free_extent_map(split2); +} + +/* + * Replace a range in the inode's extent map tree with a new extent map. + * + * @inode: The target inode. + * @new_em: The new extent map to add to the inode's extent map tree. + * @modified: Indicate if the new extent map should be added to the list of + * modified extents (for fast fsync tracking). + * + * Drops all the extent maps in the inode's extent map tree that intersect the + * range of the new extent map and adds the new extent map to the tree. + * The caller should have locked an appropriate file range in the inode's io + * tree before calling this function. + */ +int btrfs_replace_extent_map_range(struct btrfs_inode *inode, + struct extent_map *new_em, + bool modified) +{ + const u64 end = new_em->start + new_em->len - 1; + struct extent_map_tree *tree = &inode->extent_tree; + int ret; + + ASSERT(!extent_map_in_tree(new_em)); + + /* + * The caller has locked an appropriate file range in the inode's io + * tree, but getting -EEXIST when adding the new extent map can still + * happen in case there are extents that partially cover the range, and + * this is due to two tasks operating on different parts of the extent. + * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from + * btrfs_get_extent") for an example and details. 
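The do/while that follows is a plain drop-then-insert retry. A runnable toy with stand-in helpers (drop_overlapping() and insert_map() are hypothetical placeholders, not btrfs APIs) shows the shape:

	#include <errno.h>
	#include <stdio.h>

	static int racing = 1;

	static void drop_overlapping(void) { /* would remove overlapping maps */ }

	static int insert_map(void)
	{
		return racing-- > 0 ? -EEXIST : 0;	/* fail once, then succeed */
	}

	int main(void)
	{
		int ret;

		do {
			drop_overlapping();
			ret = insert_map();	/* -EEXIST if a racing insert won */
		} while (ret == -EEXIST);

		printf("inserted after retry, ret=%d\n", ret);
		return ret;
	}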
+ */ + do { + btrfs_drop_extent_map_range(inode, new_em->start, end, false); + write_lock(&tree->lock); + ret = add_extent_mapping(tree, new_em, modified); + write_unlock(&tree->lock); + } while (ret == -EEXIST); + + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index d2fa32ffe304..ad311864272a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -63,6 +63,8 @@ struct extent_map_tree { rwlock_t lock; }; +struct btrfs_inode; + static inline int extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); @@ -104,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len); +void btrfs_drop_extent_map_range(struct btrfs_inode *inode, + u64 start, u64 end, + bool skip_pinned); +int btrfs_replace_extent_map_range(struct btrfs_inode *inode, + struct extent_map *new_em, + bool modified); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c828f971a346..5de73466b2ca 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -9,13 +9,18 @@ #include <linux/highmem.h> #include <linux/sched/mm.h> #include <crypto/hash.h> +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "compression.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -24,8 +29,8 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -/** - * Set inode's size according to filesystem options +/* + * Set inode's size according to filesystem options. * * @inode: inode we want to update the disk_i_size for * @new_i_size: i_size we want to set to, 0 if we use i_size @@ -64,8 +69,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz spin_unlock(&inode->lock); } -/** - * Mark range within a file as having a new extent inserted +/* + * Mark range within a file as having a new extent inserted. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -92,8 +97,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, EXTENT_DIRTY); } -/** - * Marks an inode range as not having a backing extent +/* + * Mark an inode range as not having a backing extent. 
* * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -118,23 +123,43 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; return clear_extent_bit(&inode->file_extent_tree, start, - start + len - 1, EXTENT_DIRTY, 0, 0, NULL); + start + len - 1, EXTENT_DIRTY, NULL); +} + +static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) +{ + ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize)); + + return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size; } -static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, - u16 csum_size) +static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size) { - u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size)); - return ncsums * fs_info->sectorsize; + return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits; +} + +static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) +{ + u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum), + fs_info->csum_size); + + return csum_size_to_bytes(fs_info, max_csum_size); +} + +/* + * Calculate the total size needed to allocate for an ordered sum structure + * spanning @bytes in the file. + */ +static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +{ + return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } -int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 objectid, u64 pos, - u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset, u64 ram_bytes, - u8 compression, u8 encryption, u16 other_encoding) + u64 objectid, u64 pos, u64 num_bytes) { int ret = 0; struct btrfs_file_extent_item *item; @@ -157,16 +182,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); - btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_disk_bytenr(leaf, item, 0); + btrfs_set_file_extent_disk_num_bytes(leaf, item, 0); + btrfs_set_file_extent_offset(leaf, item, 0); btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes); btrfs_set_file_extent_generation(leaf, item, trans->transid); btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_compression(leaf, item, compression); - btrfs_set_file_extent_encryption(leaf, item, encryption); - btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + btrfs_set_file_extent_compression(leaf, item, 0); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); btrfs_mark_buffer_dirty(leaf); out: @@ -246,7 +271,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, /* * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and - * estore the result to @dst. + * store the result to @dst. * * Return >0 for the number of sectors we found. 
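The bytes_to_csum_size()/csum_size_to_bytes() pair introduced above is just a sector/checksum unit conversion. A standalone check with assumed parameters (4K sectors and 4-byte crc32c, i.e. sectorsize_bits = 12, csum_size = 4):

	#include <assert.h>

	#define SECTORSIZE_BITS	12
	#define CSUM_SIZE	4

	static unsigned long bytes_to_csum_size(unsigned long bytes)
	{
		return (bytes >> SECTORSIZE_BITS) * CSUM_SIZE;	/* one csum per sector */
	}

	static unsigned long csum_size_to_bytes(unsigned long csum_size)
	{
		return (csum_size / CSUM_SIZE) << SECTORSIZE_BITS;
	}

	int main(void)
	{
		/* 1 MiB of data -> 256 sectors -> 1024 bytes of checksums. */
		assert(bytes_to_csum_size(1UL << 20) == 1024);
		assert(csum_size_to_bytes(1024) == 1UL << 20);
		return 0;
	}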
* Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum @@ -352,15 +377,15 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, return ret; } -/** +/* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. + * @inode: inode that the bio is for. + * @bio: bio to look up. + * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return + * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If + * NULL, the checksum buffer is allocated and returned in + * btrfs_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ @@ -502,8 +527,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return ret; } -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit) +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -512,11 +538,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; LIST_HEAD(tmplist); - unsigned long offset; int ret; - size_t size; - u64 csum_end; - const u32 csum_size = fs_info->csum_size; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -525,6 +547,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (!path) return -ENOMEM; + path->nowait = nowait; if (search_commit) { path->skip_locking = 1; path->reada = READA_FORWARD; @@ -541,16 +564,33 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
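A numeric example of the step-back condition just described, with the same assumed 4K-sector/4-byte-csum parameters as above (all values hypothetical):

	#include <assert.h>

	int main(void)
	{
		unsigned long key_offset = 0;		/* previous item's key */
		unsigned long item_size = 64;		/* 64 bytes = 16 csums, covers [0, 64K) */
		unsigned long start = 16 * 1024;	/* search range starts at 16K */

		/* bytes_to_csum_size(start - key_offset) with 4K/4-byte params */
		unsigned long covered = ((start - key_offset) >> 12) * 4;

		assert(covered == 16);
		assert(covered < item_size);	/* item extends past start -> slots[0]-- */
		return 0;
	}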
+ */ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { - offset = (start - key.offset) >> fs_info->sectorsize_bits; - if (offset * csum_size < + if (bytes_to_csum_size(fs_info, start - key.offset) < btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } while (start <= end) { + u64 csum_end; + leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -570,8 +610,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.offset > start) start = key.offset; - size = btrfs_item_size(leaf, path->slots[0]); - csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); if (csum_end <= start) { path->slots[0]++; continue; @@ -581,8 +621,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); while (start < csum_end) { + unsigned long offset; + size_t size; + size = min_t(size_t, csum_end - start, - max_ordered_sum_bytes(fs_info, csum_size)); + max_ordered_sum_bytes(fs_info)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -593,16 +636,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, sums->bytenr = start; sums->len = (int)size; - offset = (start - key.offset) >> fs_info->sectorsize_bits; - offset *= csum_size; - size >>= fs_info->sectorsize_bits; + offset = bytes_to_csum_size(fs_info, start - key.offset); read_extent_buffer(path->nodes[0], sums->sums, ((unsigned long)item) + offset, - csum_size * size); + bytes_to_csum_size(fs_info, size)); - start += fs_info->sectorsize * size; + start += size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; @@ -620,8 +661,129 @@ fail: return ret; } -/** - * Calculate checksums of the data contained inside a bio +/* + * Do the same work as btrfs_lookup_csums_list(), the difference is in how + * we return the result. + * + * This version will set the corresponding bits in @csum_bitmap to represent + * that there is a csum found. + * Each bit represents a sector. Thus caller should ensure @csum_buf passed + * in is large enough to contain all csums. + */ +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_csum_item *item; + const u64 orig_start = start; + int ret; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
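btrfs_lookup_csums_bitmap() in this hunk records each found checksum as one bit per sector, relative to the start of the search range, in the loop below. A userspace model of that marking step, again assuming 4K sectors:

	#include <assert.h>
	#include <limits.h>

	#define BITS_PER_LONG	(sizeof(long) * CHAR_BIT)

	static void bitmap_set_range(unsigned long *map, unsigned long first,
				     unsigned long nbits)
	{
		for (unsigned long i = first; i < first + nbits; i++)
			map[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
	}

	int main(void)
	{
		unsigned long map[1] = { 0 };
		unsigned long orig_start = 0, start = 8192, size = 16384;

		bitmap_set_range(map, (start - orig_start) >> 12, size >> 12);
		assert(map[0] == 0x3cUL);	/* sectors 2..5 marked */
		return 0;
	}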
+ */ + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + if (bytes_to_csum_size(fs_info, start - key.offset) < + btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + u64 csum_end; + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + unsigned long offset; + size_t size; + u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info, + start - orig_start); + + size = min_t(size_t, csum_end - start, end + 1 - start); + + offset = bytes_to_csum_size(fs_info, start - key.offset); + + read_extent_buffer(path->nodes[0], csum_dest, + ((unsigned long)item) + offset, + bytes_to_csum_size(fs_info, size)); + + bitmap_set(csum_bitmap, + (start - orig_start) >> fs_info->sectorsize_bits, + size >> fs_info->sectorsize_bits); + + start += size; + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +/* + * Calculate checksums of the data contained inside a bio. * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed @@ -736,15 +898,16 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } /* - * helper function for csum removal, this expects the - * key to describe the csum pointed to by the path, and it expects - * the csum to overlap the range [bytenr, len] + * Remove one checksum overlapping a range. * - * The csum should not be entirely contained in the range and the - * range should not be entirely contained in the csum. + * This expects the key to describe the csum pointed to by the path, and it + * expects the csum to overlap the range [bytenr, len] * - * This calls btrfs_truncate_item with the correct args based on the - * overlap, and fixes up the key as required. + * The csum should not be entirely contained in the range and the range should + * not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the overlap, + * and fixes up the key as required. */ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, struct btrfs_path *path, @@ -793,8 +956,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, } /* - * deletes the csum items from the csum tree for a given - * range of bytes. + * Delete the csum items from the csum tree for a given range of bytes. 
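btrfs_del_csums() has to cope with four ways a deletion range can overlap a csum item, and truncate_one_csum() above handles the two partial overlaps. A classification sketch over half-open ranges (my reading of the cases, not kernel code):

	enum overlap { ITEM_COVERED, TRUNC_TAIL, TRUNC_HEAD, PUNCH_HOLE };

	static enum overlap classify(unsigned long long item_start,
				     unsigned long long item_end,
				     unsigned long long start,
				     unsigned long long end)
	{
		if (start <= item_start && end >= item_end)
			return ITEM_COVERED;	/* delete the whole item */
		if (start > item_start && end >= item_end)
			return TRUNC_TAIL;	/* keep the head of the item */
		if (start <= item_start)
			return TRUNC_HEAD;	/* keep the tail of the item */
		return PUNCH_HOLE;		/* range inside item: split it */
	}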
*/ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) @@ -1199,7 +1361,6 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, - const bool new_inline, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1251,10 +1412,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, */ em->orig_start = EXTENT_MAP_HOLE; em->block_len = (u64)-1; - if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + em->compress_type = compress_type; + if (compress_type != BTRFS_COMPRESS_NONE) set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h new file mode 100644 index 000000000000..031225668434 --- /dev/null +++ b/fs/btrfs/file-item.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_ITEM_H +#define BTRFS_FILE_ITEM_H + +#include "accessors.h" + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) + +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) +{ + return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* + * Return the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. + */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + u64 offset, bool one_ordered); +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap); +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); + +#endif diff --git a/fs/btrfs/file.c 
b/fs/btrfs/file.c index 5a3f6e0d9688..91b00eb2440e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,329 +30,13 @@ #include "delalloc-space.h" #include "reflink.h" #include "subpage.h" - -static struct kmem_cache *btrfs_inode_defrag_cachep; -/* - * when auto defrag is enabled we - * queue up these defrag structs to remember which - * inodes need defragging passes - */ -struct inode_defrag { - struct rb_node rb_node; - /* objectid */ - u64 ino; - /* - * transid where the defrag was added, we search for - * extents newer than this - */ - u64 transid; - - /* root objectid */ - u64 root; - - /* - * The extent size threshold for autodefrag. - * - * This value is different for compressed/non-compressed extents, - * thus needs to be passed from higher layer. - * (aka, inode_should_defrag()) - */ - u32 extent_thresh; -}; - -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) -{ - if (defrag1->root > defrag2->root) - return 1; - else if (defrag1->root < defrag2->root) - return -1; - else if (defrag1->ino > defrag2->ino) - return 1; - else if (defrag1->ino < defrag2->ino) - return -1; - else - return 0; -} - -/* pop a record for an inode into the defrag tree. The lock - * must be held already - * - * If you're inserting a record for an older transid than an - * existing record, the transid already in the tree is lowered - * - * If an existing record is found the defrag item you - * pass in is freed - */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - int ret; - - p = &fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(defrag, entry); - if (ret < 0) - p = &parent->rb_left; - else if (ret > 0) - p = &parent->rb_right; - else { - /* if we're reinserting an entry for - * an old defrag run, make sure to - * lower the transid of our existing record - */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - entry->extent_thresh = min(defrag->extent_thresh, - entry->extent_thresh); - return -EEXIST; - } - } - set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); - return 0; -} - -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) -{ - if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(fs_info)) - return 0; - - return 1; -} - -/* - * insert a defrag record for this inode if auto defrag is - * enabled - */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode_defrag *defrag; - u64 transid; - int ret; - - if (!__need_auto_defrag(fs_info)) - return 0; - - if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = inode->root->last_trans; - - defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); - if (!defrag) - return -ENOMEM; - - defrag->ino = btrfs_ino(inode); - defrag->transid = transid; - defrag->root = root->root_key.objectid; - defrag->extent_thresh = extent_thresh; - - spin_lock(&fs_info->defrag_inodes_lock); - if 
(!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { - /* - * If we set IN_DEFRAG flag and evict the inode from memory, - * and then re-read this inode, this new inode doesn't have - * IN_DEFRAG flag. At the case, we may find the existed defrag. - */ - ret = __btrfs_add_inode_defrag(inode, defrag); - if (ret) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - spin_unlock(&fs_info->defrag_inodes_lock); - return 0; -} - -/* - * pick the defragable inode that we want, if it doesn't exist, we will get - * the next one. - */ -static struct inode_defrag * -btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) -{ - struct inode_defrag *entry = NULL; - struct inode_defrag tmp; - struct rb_node *p; - struct rb_node *parent = NULL; - int ret; - - tmp.ino = ino; - tmp.root = root; - - spin_lock(&fs_info->defrag_inodes_lock); - p = fs_info->defrag_inodes.rb_node; - while (p) { - parent = p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(&tmp, entry); - if (ret < 0) - p = parent->rb_left; - else if (ret > 0) - p = parent->rb_right; - else - goto out; - } - - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; - } -out: - if (entry) - rb_erase(parent, &fs_info->defrag_inodes); - spin_unlock(&fs_info->defrag_inodes_lock); - return entry; -} - -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - struct rb_node *node; - - spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - - cond_resched_lock(&fs_info->defrag_inodes_lock); - - node = rb_first(&fs_info->defrag_inodes); - } - spin_unlock(&fs_info->defrag_inodes_lock); -} - -#define BTRFS_DEFRAG_BATCH 1024 - -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) -{ - struct btrfs_root *inode_root; - struct inode *inode; - struct btrfs_ioctl_defrag_range_args range; - int ret = 0; - u64 cur = 0; - -again: - if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) - goto cleanup; - if (!__need_auto_defrag(fs_info)) - goto cleanup; - - /* get the inode */ - inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); - if (IS_ERR(inode_root)) { - ret = PTR_ERR(inode_root); - goto cleanup; - } - - inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); - btrfs_put_root(inode_root); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto cleanup; - } - - if (cur >= i_size_read(inode)) { - iput(inode); - goto cleanup; - } - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - memset(&range, 0, sizeof(range)); - range.len = (u64)-1; - range.start = cur; - range.extent_thresh = defrag->extent_thresh; - - sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); - sb_end_write(fs_info->sb); - iput(inode); - - if (ret < 0) - goto cleanup; - - cur = max(cur + fs_info->sectorsize, range.start); - goto again; - -cleanup: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return ret; -} - -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct 
btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - u64 first_ino = 0; - u64 root_objectid = 0; - - atomic_inc(&fs_info->defrag_running); - while (1) { - /* Pause the auto defragger. */ - if (test_bit(BTRFS_FS_STATE_REMOUNTING, - &fs_info->fs_state)) - break; - - if (!__need_auto_defrag(fs_info)) - break; - - /* find an inode to defrag */ - defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, - first_ino); - if (!defrag) { - if (root_objectid || first_ino) { - root_objectid = 0; - first_ino = 0; - continue; - } else { - break; - } - } - - first_ino = defrag->ino + 1; - root_objectid = defrag->root; - - __btrfs_run_defrag_inode(fs_info, defrag); - } - atomic_dec(&fs_info->defrag_running); - - /* - * during unmount, we use the transaction_wait queue to - * wait for the defragger to stop - */ - wake_up(&fs_info->transaction_wait); - return 0; -} +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "file-item.h" +#include "ioctl.h" +#include "file.h" +#include "super.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. @@ -473,7 +157,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, */ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached); + cached); err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); @@ -499,159 +183,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, } /* - * this drops all the extents in the cache that intersect the range - * [start, end]. Existing extents are split as required. - */ -void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, - int skip_pinned) -{ - struct extent_map *em; - struct extent_map *split = NULL; - struct extent_map *split2 = NULL; - struct extent_map_tree *em_tree = &inode->extent_tree; - u64 len = end - start + 1; - u64 gen; - int ret; - int testend = 1; - unsigned long flags; - int compressed = 0; - bool modified; - - WARN_ON(end < start); - if (end == (u64)-1) { - len = (u64)-1; - testend = 0; - } - while (1) { - int no_splits = 0; - - modified = false; - if (!split) - split = alloc_extent_map(); - if (!split2) - split2 = alloc_extent_map(); - if (!split || !split2) - no_splits = 1; - - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - write_unlock(&em_tree->lock); - break; - } - flags = em->flags; - gen = em->generation; - if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - if (testend && em->start + em->len >= start + len) { - free_extent_map(em); - write_unlock(&em_tree->lock); - break; - } - start = em->start + em->len; - if (testend) - len = start + len - (em->start + em->len); - free_extent_map(em); - write_unlock(&em_tree->lock); - continue; - } - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &flags); - modified = !list_empty(&em->list); - if (no_splits) - goto next; - - if (em->start < start) { - split->start = em->start; - split->len = start - em->start; - - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_start = em->orig_start; - split->block_start = em->block_start; - - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->orig_block_len = max(split->block_len, - em->orig_block_len); - split->ram_bytes = em->ram_bytes; - } else { - split->orig_start = 
split->start; - split->block_len = 0; - split->block_start = em->block_start; - split->orig_block_len = 0; - split->ram_bytes = split->len; - } - - split->generation = gen; - split->flags = flags; - split->compress_type = em->compress_type; - replace_extent_mapping(em_tree, em, split, modified); - free_extent_map(split); - split = split2; - split2 = NULL; - } - if (testend && em->start + em->len > start + len) { - u64 diff = start + len - em->start; - - split->start = start + len; - split->len = em->start + em->len - (start + len); - split->flags = flags; - split->compress_type = em->compress_type; - split->generation = gen; - - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_block_len = max(em->block_len, - em->orig_block_len); - - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; - } else { - split->block_len = split->len; - split->block_start = em->block_start - + diff; - split->orig_start = em->orig_start; - } - } else { - split->ram_bytes = split->len; - split->orig_start = split->start; - split->block_len = 0; - split->block_start = em->block_start; - split->orig_block_len = 0; - } - - if (extent_map_in_tree(em)) { - replace_extent_mapping(em_tree, em, split, - modified); - } else { - ret = add_extent_mapping(em_tree, split, - modified); - ASSERT(ret == 0); /* Logic error */ - } - free_extent_map(split); - split = NULL; - } -next: - if (extent_map_in_tree(em)) - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for us */ - free_extent_map(em); - /* once for the tree*/ - free_extent_map(em); - } - if (split) - free_extent_map(split); - if (split2) - free_extent_map(split2); -} - -/* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number * that would be a good hint to the block allocator for this file. @@ -708,7 +239,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, } if (args->drop_cache) - btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0); + btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; @@ -849,7 +380,10 @@ next_slot: args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } } key.offset = args->start; } @@ -936,7 +470,10 @@ delete_extent_item: key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } args->bytes_found += extent_end - key.offset; } @@ -1339,26 +876,54 @@ static int prepare_uptodate_page(struct inode *inode, return 0; } +static unsigned int get_prepare_fgp_flags(bool nowait) +{ + unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + + if (nowait) + fgp_flags |= FGP_NOWAIT; + + return fgp_flags; +} + +static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) +{ + gfp_t gfp; + + gfp = btrfs_alloc_write_mask(inode->i_mapping); + if (nowait) { + gfp &= ~__GFP_DIRECT_RECLAIM; + gfp |= GFP_NOWAIT; + } + + return gfp; +} + /* * this just gets pages into the page cache and locks them down. 
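get_prepare_fgp_flags()/get_prepare_gfp_flags() above show the nowait pattern: one boolean fans out into both the pagecache lookup flags and the allocation mask. A compilable model with illustrative flag values (not the kernel's):

	#include <assert.h>

	#define FGP_LOCK	0x1
	#define FGP_ACCESSED	0x2
	#define FGP_CREAT	0x4
	#define FGP_NOWAIT	0x8

	static unsigned int prepare_fgp_flags(int nowait)
	{
		unsigned int fgp = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;

		if (nowait)
			fgp |= FGP_NOWAIT;	/* fail fast instead of sleeping */
		return fgp;
	}

	int main(void)
	{
		assert(prepare_fgp_flags(0) == 0x7);
		assert(prepare_fgp_flags(1) == 0xf);
		return 0;
	}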
*/ static noinline int prepare_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate) + size_t write_bytes, bool force_uptodate, + bool nowait) { int i; unsigned long index = pos >> PAGE_SHIFT; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + gfp_t mask = get_prepare_gfp_flags(inode, nowait); + unsigned int fgp_flags = get_prepare_fgp_flags(nowait); int err = 0; int faili; for (i = 0; i < num_pages; i++) { again: - pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask | __GFP_WRITE); + pages[i] = pagecache_get_page(inode->i_mapping, index + i, + fgp_flags, mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; - err = -ENOMEM; + if (nowait) + err = -EAGAIN; + else + err = -ENOMEM; goto fail; } @@ -1376,7 +941,7 @@ again: pos + write_bytes, false); if (err) { put_page(pages[i]); - if (err == -EAGAIN) { + if (!nowait && err == -EAGAIN) { err = 0; goto again; } @@ -1411,7 +976,7 @@ static noinline int lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - u64 *lockstart, u64 *lockend, + u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1426,15 +991,28 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(&inode->io_tree, start_pos, last_pos, - cached_state); + if (nowait) { + if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state)) { + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + pages[i] = NULL; + } + + return -EAGAIN; + } + } else { + lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + } + ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { - unlock_extent_cached(&inode->io_tree, start_pos, - last_pos, cached_state); + unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); @@ -1481,10 +1059,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. 
*/ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) + size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; u64 lockstart, lockend; u64 num_bytes; int ret; @@ -1500,17 +1079,24 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, fs_info->sectorsize) - 1; num_bytes = lockend - lockstart + 1; - btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); + if (nowait) { + if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, + &cached_state)) { + btrfs_drew_write_unlock(&root->snapshot_lock); + return -EAGAIN; + } + } else { + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, + &cached_state); + } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL, false); - if (ret <= 0) { - ret = 0; + NULL, NULL, NULL, nowait, false); + if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); - } else { + else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - } - unlock_extent(&inode->io_tree, lockstart, lockend); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } @@ -1607,11 +1193,13 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, bool force_page_uptodate = false; loff_t old_isize = i_size_read(inode); unsigned int ilock_flags = 0; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); + unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); - if (iocb->ki_flags & IOCB_NOWAIT) + if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; - ret = btrfs_inode_lock(inode, ilock_flags); + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret; @@ -1664,18 +1252,29 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, extent_changeset_release(data_reserved); ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, pos, - write_bytes); + write_bytes, nowait); if (ret < 0) { + int can_nocow; + + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { + ret = -EAGAIN; + break; + } + /* * If we don't have to COW at the offset, reserve * metadata only. write_bytes may get smaller than * requested here. 
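The nowait plumbing threaded through these hunks follows one convention: each blocking primitive gains a try-variant, and contention is reported as -EAGAIN so the caller can retry from a context where sleeping is allowed. A minimal userspace sketch of that convention, using a pthread mutex as a stand-in for the extent/ordered-range locks; lock_range() and its caller are illustrative, not kernel API:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;

static int lock_range(bool nowait)
{
	if (nowait) {
		/* Try-lock maps contention to -EAGAIN, mirroring try_lock_extent(). */
		if (pthread_mutex_trylock(&range_lock) != 0)
			return -EAGAIN;
		return 0;
	}
	pthread_mutex_lock(&range_lock);	/* Blocking path. */
	return 0;
}

int main(void)
{
	int ret = lock_range(true);

	if (ret == -EAGAIN)
		printf("would block, retry from a blocking context\n");
	else
		pthread_mutex_unlock(&range_lock);
	return 0;
}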
*/ - if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes) > 0) - only_release_metadata = true; - else + can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, + &write_bytes, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) break; + only_release_metadata = true; } num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); @@ -1685,7 +1284,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes, - reserve_bytes, false); + reserve_bytes, nowait); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -1693,19 +1292,27 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, write_bytes); else btrfs_check_nocow_unlock(BTRFS_I(inode)); + + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; break; } release_bytes = reserve_bytes; again: + ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); + break; + } + /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, - force_page_uptodate); + pos, write_bytes, force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); @@ -1715,10 +1322,11 @@ again: extents_locked = lock_and_cleanup_extent_if_need( BTRFS_I(inode), pages, num_pages, pos, write_bytes, &lockstart, - &lockend, &cached_state); + &lockend, nowait, &cached_state); if (extents_locked < 0) { - if (extents_locked == -EAGAIN) + if (!nowait && extents_locked == -EAGAIN) goto again; + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); ret = extents_locked; @@ -1782,8 +1390,8 @@ again: * possible cached extent state to avoid a memory leak. */ if (extents_locked) - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); else free_extent_state(cached_state); @@ -1801,8 +1409,6 @@ again: cond_resched(); - balance_dirty_pages_ratelimited(inode->i_mapping); - pos += copied; num_written += copied; } @@ -1828,7 +1434,7 @@ again: iocb->ki_pos += num_written; } out: - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return num_written ? 
num_written : ret; } @@ -1858,6 +1464,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; ssize_t err; unsigned int ilock_flags = 0; + struct iomap_dio *dio; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1867,19 +1474,19 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ilock_flags |= BTRFS_ILOCK_SHARED; relock: - err = btrfs_inode_lock(inode, ilock_flags); + err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (err < 0) return err; err = generic_write_checks(iocb, from); if (err <= 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return err; } err = btrfs_write_check(iocb, from, err); if (err < 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; } @@ -1890,13 +1497,13 @@ relock: */ if ((ilock_flags & BTRFS_ILOCK_SHARED) && pos + iov_iter_count(from) > i_size_read(inode)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ilock_flags &= ~BTRFS_ILOCK_SHARED; goto relock; } if (check_direct_IO(fs_info, from, pos)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } @@ -1918,11 +1525,22 @@ relock: * So here we disable page faults in the iov_iter and then retry if we * got -EFAULT, faulting in the pages before the retry. */ -again: from->nofault = true; - err = btrfs_dio_rw(iocb, from, written); + dio = btrfs_dio_write(iocb, from, written); from->nofault = false; + /* + * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync + * iocb, and that needs to lock the inode. So unlock it before calling + * iomap_dio_complete() to avoid a deadlock. + */ + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + + if (IS_ERR_OR_NULL(dio)) + err = PTR_ERR_OR_ZERO(dio); + else + err = iomap_dio_complete(dio); + /* No increment (+=) because iomap returns a cumulative value. */ if (err > 0) written = err; @@ -1948,12 +1566,10 @@ again: } else { fault_in_iov_iter_readable(from, left); prev_left = left; - goto again; + goto relock; } } - btrfs_inode_unlock(inode, ilock_flags); - /* * If 'err' is -ENOTBLK or we have not written all data, then it means * we must fallback to buffered IO. @@ -1965,8 +1581,8 @@ buffered: /* * If we are in a NOWAIT context, then return -EAGAIN to signal the caller * it must retry the operation in a context where blocking is acceptable, - * since we currently don't have NOWAIT semantics support for buffered IO - * and may block there for many reasons (reserving space for example). + * because even if we end up not blocking during the buffered IO attempt + * below, we will block when flushing and waiting for the IO. 
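The comment about unlocking before iomap_dio_complete() describes a classic lock-ordering fix: the completion path may re-acquire the very lock the submitter still holds, so the submitter must drop it first. A small userspace sketch of the same shape, with a mutex standing in for the inode lock and dio_complete() as a hypothetical stand-in for iomap_dio_complete():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for iomap_dio_complete(): a dsync completion re-takes the inode lock. */
static long dio_complete(long submitted)
{
	pthread_mutex_lock(&inode_lock);	/* fsync-like path needs the lock */
	pthread_mutex_unlock(&inode_lock);
	return submitted;			/* cumulative byte count */
}

int main(void)
{
	long written;

	pthread_mutex_lock(&inode_lock);
	/* ... submit the direct IO while holding the lock ... */
	pthread_mutex_unlock(&inode_lock);	/* drop it BEFORE completing */

	written = dio_complete(4096);		/* would self-deadlock if still held */
	printf("written=%ld\n", written);
	return 0;
}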
*/ if (iocb->ki_flags & IOCB_NOWAIT) { err = -EAGAIN; @@ -2006,7 +1622,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, loff_t count; ssize_t ret; - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); count = encoded->len; ret = generic_write_checks_count(iocb, &count); if (ret == 0 && count != encoded->len) { @@ -2025,7 +1641,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_do_encoded_write(iocb, from, encoded); out: - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); return ret; } @@ -2045,7 +1661,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; if (sync) @@ -2086,10 +1702,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; - if (private && private->filldir_buf) + if (private) { kfree(private->filldir_buf); - kfree(private); - filp->private_data = NULL; + free_extent_state(private->llseek_cached_state); + kfree(private); + filp->private_data = NULL; + } /* * Set by setattr when we are about to truncate a file from a non-zero @@ -2196,19 +1814,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); /* - * Always check for the full sync flag while holding the inode's lock, - * to avoid races with other tasks. The flag must be either set all the - * time during logging or always off all the time while logging. - */ - full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); - - /* * Before we acquired the inode's lock and the mmap lock, someone may * have dirtied more pages in the target range. We need to make sure * that writeback for any such pages does not start while we are logging @@ -2228,11 +1838,22 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } /* + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. + * We check the flag here after starting delalloc above, because when + * running delalloc the full sync flag may be set if we need to drop + * extra extent map ranges due to temporary memory allocation failures. + */ + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * @@ -2320,7 +1941,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. 
*/ - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); @@ -2380,6 +2001,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); + ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; @@ -2387,7 +2009,7 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -2448,7 +2070,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct extent_map *hole_em; - struct extent_map_tree *em_tree = &inode->extent_tree; struct btrfs_key key; int ret; @@ -2505,8 +2126,8 @@ static int fill_holes(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), - offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, + end - offset); if (ret) return ret; @@ -2515,7 +2136,7 @@ out: hole_em = alloc_extent_map(); if (!hole_em) { - btrfs_drop_extent_cache(inode, offset, end - 1, 0); + btrfs_drop_extent_map_range(inode, offset, end - 1, false); btrfs_set_inode_full_sync(inode); } else { hole_em->start = offset; @@ -2529,12 +2150,7 @@ out: hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; - do { - btrfs_drop_extent_cache(inode, offset, end - 1, 0); - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, hole_em, 1); - write_unlock(&em_tree->lock); - } while (ret == -EEXIST); + ret = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); if (ret) btrfs_set_inode_full_sync(inode); @@ -2591,8 +2207,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, while (1) { truncate_pagecache_range(inode, lockstart, lockend); - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2607,8 +2223,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, page_lockend)) break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); @@ -2988,7 +2604,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -3008,9 +2624,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) if (ret) goto out_only_mutex; - lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); - lockend = round_down(offset + len, - btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; + lockstart = round_up(offset, fs_info->sectorsize); + lockend = round_down(offset + len, fs_info->sectorsize) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* 
@@ -3037,7 +2652,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } } @@ -3108,8 +2723,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* @@ -3136,7 +2751,7 @@ out_only_mutex: ret = ret2; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3212,7 +2827,7 @@ enum { static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = inode->root->fs_info->sectorsize; struct extent_map *em; int ret; @@ -3242,7 +2857,7 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + const u64 sectorsize = fs_info->sectorsize; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -3382,16 +2997,16 @@ reserve_space: ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, i_blocksize(inode), offset + len, &alloc_hint); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; @@ -3428,7 +3043,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 data_space_reserved = 0; u64 qgroup_reserved = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); + int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; /* Do not allow fallocate in ZONED mode */ @@ -3447,7 +3062,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3497,13 +3112,13 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } locked_end = alloc_end - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); @@ -3592,31 +3207,303 @@ static long 
btrfs_fallocate(struct file *file, int mode, */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); out: - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); return ret; } -static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, - int whence) +/* + * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range + * that has unflushed and/or flushing delalloc. There might be other adjacent + * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps + * looping while it gets adjacent subranges, merging them together. + */ +static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, + bool *search_io_tree, + u64 *delalloc_start_ret, u64 *delalloc_end_ret) +{ + u64 len = end + 1 - start; + u64 delalloc_len = 0; + struct btrfs_ordered_extent *oe; + u64 oe_start; + u64 oe_end; + + /* + * Search the io tree first for EXTENT_DELALLOC. If we find any, it + * means we have delalloc (dirty pages) for which writeback has not + * started yet. + */ + if (*search_io_tree) { + spin_lock(&inode->lock); + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1, + cached_state); + } else { + spin_unlock(&inode->lock); + } + } + + if (delalloc_len > 0) { + /* + * If delalloc was found then *delalloc_start_ret has a sector size + * aligned value (rounded down). + */ + *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + + if (*delalloc_start_ret == start) { + /* Delalloc for the whole range, nothing more to do. */ + if (*delalloc_end_ret == end) + return true; + /* Else trim our search range for ordered extents. */ + start = *delalloc_end_ret + 1; + len = end + 1 - start; + } + } else { + /* No delalloc, future calls don't need to search again. */ + *search_io_tree = false; + } + + /* + * Now also check if there's any ordered extent in the range. + * We do this because: + * + * 1) When delalloc is flushed, the file range is locked, we clear the + * EXTENT_DELALLOC bit from the io tree and create an extent map and + * an ordered extent for the write. So we might just have been called + * after delalloc is flushed and before the ordered extent completes + * and inserts the new file extent item in the subvolume's btree; + * + * 2) We may have an ordered extent created by flushing delalloc for a + * subrange that starts before the subrange we found marked with + * EXTENT_DELALLOC in the io tree. + * + * We could also use the extent map tree to find such delalloc that is + * being flushed, but using the ordered extents tree is more efficient + * because it's usually much smaller as ordered extents are removed from + * the tree once they complete. With the extent maps, we may have them + * in the extent map tree for a very long time, and they were either + * created by previous writes or loaded by read operations. + */ + oe = btrfs_lookup_first_ordered_range(inode, start, len); + if (!oe) + return (delalloc_len > 0); + + /* The ordered extent may span beyond our search range.
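The range-combining rules applied just below (merge the delalloc and ordered-extent ranges when they touch or overlap, otherwise report only the leftmost one) can be sketched in isolation. A minimal userspace sketch, assuming both a delalloc range and an ordered-extent range were found; struct range and pick_delalloc_range() are illustrative, not kernel API:

#include <stdio.h>

typedef unsigned long long u64;

struct range { u64 start, end; };	/* end is inclusive */

/*
 * Combine an unflushed-delalloc range with an ordered-extent range the way
 * find_delalloc_subrange() does: merge them when they touch or overlap,
 * otherwise report only the leftmost one.
 */
static struct range pick_delalloc_range(struct range dela, struct range oe)
{
	struct range ret = dela;

	if (oe.start < dela.start) {
		/* Ordered extent is leftmost; keep it alone unless it reaches delalloc. */
		if (oe.end < dela.start)
			ret.end = oe.end;
		ret.start = oe.start;
	} else if (dela.end + 1 == oe.start) {
		/* Adjacent on the right: return one combined range. */
		ret.end = oe.end;
	}
	return ret;
}

int main(void)
{
	struct range d = { 4096, 8191 }, o = { 8192, 12287 };
	struct range r = pick_delalloc_range(d, o);

	printf("[%llu, %llu]\n", r.start, r.end);	/* [4096, 12287] */
	return 0;
}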
*/ + oe_start = max(oe->file_offset, start); + oe_end = min(oe->file_offset + oe->num_bytes - 1, end); + + btrfs_put_ordered_extent(oe); + + /* Don't have unflushed delalloc, return the ordered extent range. */ + if (delalloc_len == 0) { + *delalloc_start_ret = oe_start; + *delalloc_end_ret = oe_end; + return true; + } + + /* + * We have both unflushed delalloc (io_tree) and an ordered extent. + * If the ranges are adjacent, return a combined range, otherwise + * return the leftmost range. + */ + if (oe_start < *delalloc_start_ret) { + if (oe_end < *delalloc_start_ret) + *delalloc_end_ret = oe_end; + *delalloc_start_ret = oe_start; + } else if (*delalloc_end_ret + 1 == oe_start) { + *delalloc_end_ret = oe_end; + } + + return true; +} + +/* + * Check if there's delalloc in a given range. + * + * @inode: The inode. + * @start: The start offset of the range. It does not need to be + * sector size aligned. + * @end: The end offset (inclusive value) of the search range. + * It does not need to be sector size aligned. + * @cached_state: Extent state record used for speeding up delalloc + * searches in the inode's io_tree. Can be NULL. + * @delalloc_start_ret: Output argument, set to the start offset of the + * subrange found with delalloc (may not be sector size + * aligned). + * @delalloc_end_ret: Output argument, set to the end offset (inclusive value) + * of the subrange found with delalloc. + * + * Returns true if a subrange with delalloc is found within the given range, and + * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and + * end offsets of the subrange. + */ +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, + u64 *delalloc_start_ret, u64 *delalloc_end_ret) +{ + u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); + u64 prev_delalloc_end = 0; + bool search_io_tree = true; + bool ret = false; + + while (cur_offset < end) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = find_delalloc_subrange(inode, cur_offset, end, + cached_state, &search_io_tree, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; + + if (prev_delalloc_end == 0) { + /* First subrange found. */ + *delalloc_start_ret = max(delalloc_start, start); + *delalloc_end_ret = delalloc_end; + ret = true; + } else if (delalloc_start == prev_delalloc_end + 1) { + /* Subrange adjacent to the previous one, merge them. */ + *delalloc_end_ret = delalloc_end; + } else { + /* Subrange not adjacent to the previous one, exit. */ + break; + } + + prev_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + cond_resched(); + } + + return ret; +} + +/* + * Check if there's a hole or delalloc range in a range representing a hole (or + * prealloc extent) found in the inode's subvolume btree. + * + * @inode: The inode. + * @whence: Seek mode (SEEK_DATA or SEEK_HOLE). + * @start: Start offset of the hole region. It does not need to be sector + * size aligned. + * @end: End offset (inclusive value) of the hole region. It does not + * need to be sector size aligned. + * @start_ret: Return parameter, used to set the start of the subrange in the + * hole that matches the search criteria (seek mode), if such + * subrange is found (return value of the function is true). + * The value returned here may not be sector size aligned. + * + * Returns true if a subrange matching the given seek mode is found, and if one + * is found, it updates @start_ret with the start of the subrange.
+ */ +static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, + struct extent_state **cached_state, + u64 start, u64 end, u64 *start_ret) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, + &delalloc_start, &delalloc_end); + if (delalloc && whence == SEEK_DATA) { + *start_ret = delalloc_start; + return true; + } + + if (delalloc && whence == SEEK_HOLE) { + /* + * We found delalloc but it starts after our start offset. So we + * have a hole between our start offset and the delalloc start. + */ + if (start < delalloc_start) { + *start_ret = start; + return true; + } + /* + * Delalloc range starts at our start offset. + * If the delalloc range's length is smaller than our range, + * then it means we have a hole that starts where the delalloc + * subrange ends. + */ + if (delalloc_end < end) { + *start_ret = delalloc_end + 1; + return true; + } + + /* There's delalloc for the whole range. */ + return false; + } + + if (!delalloc && whence == SEEK_HOLE) { + *start_ret = start; + return true; + } + + /* + * No delalloc in the range and we are seeking for data. The caller has + * to iterate to the next extent item in the subvolume btree. + */ + return false; +} + +static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) +{ + struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); + struct btrfs_file_private *private = file->private_data; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - loff_t i_size = inode->vfs_inode.i_size; + struct extent_state **delalloc_cached_state; + const loff_t i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct btrfs_path *path; + struct btrfs_key key; + u64 last_extent_end; u64 lockstart; u64 lockend; u64 start; - u64 len; - int ret = 0; + int ret; + bool found = false; if (i_size == 0 || offset >= i_size) return -ENXIO; /* + * Quick path. If the inode has no prealloc extents and its number of + * bytes used matches its i_size, then it cannot have holes. + */ + if (whence == SEEK_HOLE && + !(inode->flags & BTRFS_INODE_PREALLOC) && + inode_get_bytes(&inode->vfs_inode) == i_size) + return i_size; + + if (!private) { + private = kzalloc(sizeof(*private), GFP_KERNEL); + /* + * No worries if memory allocation failed. + * The private structure is used only for speeding up multiple + * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, + * so everything will still be correct. + */ + file->private_data = private; + } + + if (private) + delalloc_cached_state = &private->llseek_cached_state; + else + delalloc_cached_state = NULL; + + /* + * offset can be negative, in this case we start finding DATA/HOLE from + * the very start of the file.
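The four cases that find_desired_extent_in_hole() distinguishes (delalloc present or not, SEEK_DATA vs. SEEK_HOLE) reduce to a small decision table. A self-contained userspace sketch of that table; desired_in_hole(), its enum, and its parameters are illustrative, not the kernel function:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long u64;

enum seek_mode { SEEK_MODE_DATA, SEEK_MODE_HOLE };

/*
 * Within a subvolume-tree hole [start, end], delalloc counts as data and
 * everything else as hole. Returns true and sets *start_ret when the sought
 * kind of range begins inside [start, end].
 */
static bool desired_in_hole(enum seek_mode whence, bool delalloc,
			    u64 start, u64 end,
			    u64 dstart, u64 dend, u64 *start_ret)
{
	if (delalloc && whence == SEEK_MODE_DATA) {
		*start_ret = dstart;		/* data = the delalloc itself */
		return true;
	}
	if (delalloc && whence == SEEK_MODE_HOLE) {
		if (start < dstart) {		/* hole before the delalloc */
			*start_ret = start;
			return true;
		}
		if (dend < end) {		/* hole after the delalloc */
			*start_ret = dend + 1;
			return true;
		}
		return false;			/* delalloc covers the range */
	}
	if (!delalloc && whence == SEEK_MODE_HOLE) {
		*start_ret = start;		/* pure hole */
		return true;
	}
	return false;				/* no data here, keep searching */
}

int main(void)
{
	u64 pos;

	if (desired_in_hole(SEEK_MODE_HOLE, true, 0, 16383, 0, 8191, &pos))
		printf("hole starts at %llu\n", pos);	/* 8192 */
	return 0;
}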
*/ @@ -3627,45 +3514,167 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, if (lockend <= lockstart) lockend = lockstart + fs_info->sectorsize; lockend--; - len = lockend - lockstart + 1; - lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + last_extent_end = lockstart; + + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } while (start < i_size) { - em = btrfs_get_extent_fiemap(inode, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - em = NULL; - break; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *extent; + u64 extent_end; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + leaf = path->nodes[0]; } - if (whence == SEEK_HOLE && - (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) - break; - else if (whence == SEEK_DATA && - (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; - start = em->start + em->len; - free_extent_map(em); - em = NULL; + extent_end = btrfs_file_extent_end(path); + + /* + * In the first iteration we may have a slot that points to an + * extent that ends before our start offset, so skip it. + */ + if (extent_end <= start) { + path->slots[0]++; + continue; + } + + /* We have an implicit hole, NO_HOLES feature is likely set. */ + if (last_extent_end < key.offset) { + u64 search_start = last_extent_end; + u64 found_start; + + /* + * First iteration, @start matches @offset and it's + * within the hole. + */ + if (start == offset) + search_start = offset; + + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, + search_start, + key.offset - 1, + &found_start); + if (found) { + start = found_start; + break; + } + /* + * Didn't find data or a hole (due to delalloc) in the + * implicit hole range, so need to analyze the extent. + */ + } + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 || + btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_PREALLOC) { + /* + * Explicit hole or prealloc extent, search for delalloc. + * A prealloc extent is treated like a hole. + */ + u64 search_start = key.offset; + u64 found_start; + + /* + * First iteration, @start matches @offset and it's + * within the hole. + */ + if (start == offset) + search_start = offset; + + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, + search_start, + extent_end - 1, + &found_start); + if (found) { + start = found_start; + break; + } + /* + * Didn't find data or a hole (due to delalloc) in the + * hole or prealloc range, so need to analyze the next + * extent item. + */ + } else { + /* + * Found a regular or inline extent. + * If we are seeking for data, adjust the start offset + * and stop, we're done.
+ */ + if (whence == SEEK_DATA) { + start = max_t(u64, key.offset, offset); + found = true; + break; + } + /* + * Else, we are seeking for a hole, check the next file + * extent item. + */ + } + + start = extent_end; + last_extent_end = extent_end; + path->slots[0]++; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } cond_resched(); } - free_extent_map(em); - unlock_extent_cached(&inode->io_tree, lockstart, lockend, - &cached_state); - if (ret) { - offset = ret; - } else { - if (whence == SEEK_DATA && start >= i_size) - offset = -ENXIO; - else - offset = min_t(loff_t, start, i_size); + + /* We have an implicit hole from the last extent found up to i_size. */ + if (!found && start < i_size) { + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, start, + i_size - 1, &start); + if (!found) + start = i_size; } - return offset; +out: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_free_path(path); + + if (ret < 0) + return ret; + + if (whence == SEEK_DATA && start >= i_size) + return -ENXIO; + + return min_t(loff_t, start, i_size); } static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) @@ -3677,9 +3686,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - offset = find_desired_extent(BTRFS_I(inode), offset, whence); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + offset = find_desired_extent(file, offset, whence); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } @@ -3693,7 +3702,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; ret = fsverity_file_open(inode, filp); if (ret) @@ -3734,7 +3743,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); again: /* * This is similar to what we do for direct IO writes, see the comment @@ -3753,7 +3762,7 @@ again: */ pagefault_disable(); to->nofault = true; - ret = btrfs_dio_rw(iocb, to, read); + ret = btrfs_dio_read(iocb, to, read); to->nofault = false; pagefault_enable(); @@ -3783,7 +3792,7 @@ again: goto again; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); return ret < 0 ? 
ret : read; } @@ -3810,6 +3819,7 @@ const struct file_operations btrfs_file_operations = { .mmap = btrfs_file_mmap, .open = btrfs_file_open, .release = btrfs_release_file, + .get_unmapped_area = thp_get_unmapped_area, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, @@ -3819,23 +3829,6 @@ const struct file_operations btrfs_file_operations = { .remap_file_range = btrfs_remap_file_range, }; -void __cold btrfs_auto_defrag_exit(void) -{ - kmem_cache_destroy(btrfs_inode_defrag_cachep); -} - -int __init btrfs_auto_defrag_init(void) -{ - btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_inode_defrag_cachep) - return -ENOMEM; - - return 0; -} - int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) { int ret; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h new file mode 100644 index 000000000000..82b34fbb295f --- /dev/null +++ b/fs/btrfs/file.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_H +#define BTRFS_FILE_H + +extern const struct file_operations btrfs_file_operations; + +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); +int btrfs_release_file(struct inode *inode, struct file *file); +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, + struct extent_state **cached, bool noreserve); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait); +void btrfs_check_nocow_unlock(struct btrfs_inode *inode); +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, + u64 *delalloc_start_ret, u64 *delalloc_end_ret); + +#endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 996da650ecdc..0d250d052487 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,8 +11,10 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> -#include "misc.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "misc.h" #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" @@ -24,11 +26,18 @@ #include "discard.h" #include "subpage.h" #include "inode-item.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "super.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K #define FORCE_EXTENT_THRESHOLD SZ_1M +static struct kmem_cache *btrfs_free_space_cachep; +static struct kmem_cache *btrfs_free_space_bitmap_cachep; + struct btrfs_trim_range { u64 start; u64 bytes; @@ -48,6 +57,24 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 
bytes, bool update_stats); +static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info, true); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + + cond_resched_lock(&ctl->tree_lock); + } +} + static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) @@ -126,10 +153,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!block_group->iref) { + if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) block_group->inode = igrab(inode); - block_group->iref = 1; - } spin_unlock(&block_group->lock); return inode; @@ -235,14 +260,13 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); goto out; } clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); - if (block_group->iref) { - block_group->iref = 0; + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { block_group->inode = NULL; spin_unlock(&block_group->lock); iput(inode); @@ -250,7 +274,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for the lookup ref */ - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; @@ -333,8 +357,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); - lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* * We skip the throttling logic for free space cache inodes, so we don't @@ -345,7 +369,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state); + unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; @@ -693,6 +717,12 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bitmaps = max_t(u64, max_bitmaps, 1); + if (ctl->total_bitmaps > max_bitmaps) + btrfs_err(block_group->fs_info, +"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", + block_group->start, block_group->length, + ctl->total_bitmaps, ctl->unit, max_bitmaps, + bytes_per_bg); ASSERT(ctl->total_bitmaps <= max_bitmaps); /* @@ -875,7 +905,10 @@ out: return ret; free_cache: io_ctl_drop_pages(&io_ctl); + + spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache(ctl); + spin_unlock(&ctl->tree_lock); goto out; } @@ -914,6 +947,8 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, return ret; } +static struct lock_class_key btrfs_free_space_inode_key; + int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -983,6 
+1018,14 @@ int load_free_space_cache(struct btrfs_block_group *block_group) } spin_unlock(&block_group->lock); + /* + * Reinitialize the class of struct inode's mapping->invalidate_lock for + * free space inodes to prevent false positives related to locks for normal + * inodes. + */ + lockdep_set_class(&(&inode->i_data)->invalidate_lock, + &btrfs_free_space_inode_key); + ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, path, block_group->start); btrfs_free_path(path); @@ -1001,7 +1044,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group) if (ret == 0) ret = 1; } else { + /* + * We need to call the _locked variant so we don't try to update + * the discard counters. + */ + spin_lock(&tmp_ctl.tree_lock); __btrfs_remove_free_space_cache(&tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->start); @@ -1123,7 +1172,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; @@ -1135,8 +1184,8 @@ update_cache_item(struct btrfs_trans_handle *trans, if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, EXTENT_DELALLOC, 0, - 0, NULL); + inode->i_size - 1, EXTENT_DELALLOC, + NULL); btrfs_release_path(path); goto fail; } @@ -1232,7 +1281,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, NULL); return ret; } @@ -1252,8 +1301,8 @@ cleanup_write_cache_enospc(struct inode *inode, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1323,8 +1372,8 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, path, block_group->start); } -/** - * Write out cached info to an inode +/* + * Write out cached info to an inode. 
* * @root: root the inode belongs to * @inode: freespace inode we are writing out @@ -1378,8 +1427,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out_unlock; - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1434,8 +1483,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); /* * at this point the pages are under IO and we're happy, @@ -2677,8 +2726,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= - div_factor_fine(block_group->zone_capacity, - bg_reclaim_threshold)) { + mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } @@ -2860,7 +2908,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, if (btrfs_is_zoned(fs_info)) { btrfs_info(fs_info, "free space %llu active %d", block_group->zone_capacity - block_group->alloc_offset, - block_group->zone_is_active); + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)); return; } @@ -2964,34 +3013,6 @@ static void __btrfs_return_cluster_to_free_space( btrfs_put_block_group(block_group); } -static void __btrfs_remove_free_space_cache_locked( - struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *node; - - while ((node = rb_last(&ctl->free_space_offset)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, offset_index); - if (!info->bitmap) { - unlink_free_space(ctl, info, true); - kmem_cache_free(btrfs_free_space_cachep, info); - } else { - free_bitmap(ctl, info); - } - - cond_resched_lock(&ctl->tree_lock); - } -} - -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) -{ - spin_lock(&ctl->tree_lock); - __btrfs_remove_free_space_cache_locked(ctl); - if (ctl->block_group) - btrfs_discard_update_discardable(ctl->block_group); - spin_unlock(&ctl->tree_lock); -} - void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -3009,16 +3030,13 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) cond_resched_lock(&ctl->tree_lock); } - __btrfs_remove_free_space_cache_locked(ctl); + __btrfs_remove_free_space_cache(ctl); btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); } -/** - * btrfs_is_free_space_trimmed - see if everything is trimmed - * @block_group: block_group of interest - * +/* * Walk @block_group's free space rb_tree to determine if everything is trimmed. 
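The div_factor_fine() to mult_perc() rename in the zoned reclaim hunk above keeps the same arithmetic, a percentage of a 64-bit quantity (num * percent / 100), under a clearer name. A tiny userspace sketch of how the reclaim threshold comparison is computed, assuming that semantic for mult_perc(); the sample values are illustrative:

#include <stdio.h>

typedef unsigned long long u64;

/* Same arithmetic as the old div_factor_fine(), under its clearer new name. */
static u64 mult_perc(u64 num, unsigned int percent)
{
	return num * percent / 100;
}

int main(void)
{
	u64 zone_capacity = 256ULL << 20;	/* 256 MiB zone */
	unsigned int bg_reclaim_threshold = 75;	/* percent */

	/* Mark the block group for reclaim once unusable space reaches 75%. */
	printf("threshold = %llu bytes\n",
	       mult_perc(zone_capacity, bg_reclaim_threshold));
	return 0;
}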
*/ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) @@ -3992,7 +4010,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -4022,7 +4040,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -4044,7 +4062,7 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -4119,6 +4137,31 @@ out: return ret; } +int __init btrfs_free_space_init(void) +{ + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", + sizeof(struct btrfs_free_space), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + return -ENOMEM; + + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_bitmap_cachep) { + kmem_cache_destroy(btrfs_free_space_cachep); + return -ENOMEM; + } + + return 0; +} + +void __cold btrfs_free_space_exit(void) +{ + kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); +} + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 15591b299895..a855e0483e03 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -43,6 +43,17 @@ static inline bool btrfs_free_space_trimming_bitmap( return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); } +/* + * Deltas are an effective way to populate global statistics. Give macro names + * to make it clear what we're doing. An example is discard_extents in + * btrfs_free_space_ctl. 
+ */ +enum { + BTRFS_STAT_CURR, + BTRFS_STAT_PREV, + BTRFS_STAT_NR_ENTRIES, +}; + struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; @@ -79,6 +90,8 @@ struct btrfs_io_ctl { int bitmaps; }; +int __init btrfs_free_space_init(void); +void __cold btrfs_free_space_exit(void); struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path); int create_free_space_inode(struct btrfs_trans_handle *trans, @@ -113,7 +126,6 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 1bf89aa67216..c667e878ef1a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -5,12 +5,17 @@ #include <linux/kernel.h> #include <linux/sched/mm.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" #include "free-space-tree.h" #include "transaction.h" #include "block-group.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, @@ -803,7 +808,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, u32 flags; int ret; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; @@ -996,7 +1001,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, u32 flags; int ret; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; @@ -1299,7 +1304,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, { int ret; - block_group->needs_free_space = 0; + clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); ret = add_new_free_space_info(trans, block_group, path); if (ret) @@ -1321,7 +1326,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans, return 0; mutex_lock(&block_group->free_space_lock); - if (!block_group->needs_free_space) + if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) goto out; path = btrfs_alloc_path(); @@ -1354,7 +1359,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { /* We never added this block group to the free space tree. 
*/ return 0; } @@ -1453,8 +1458,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - caching_ctl->progress = key.objectid; - offset = key.objectid; while (offset < key.objectid + key.offset) { bit = free_space_test_bit(block_group, path, offset); @@ -1490,8 +1493,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, goto out; } - caching_ctl->progress = (u64)-1; - ret = 0; out: return ret; @@ -1531,8 +1532,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - caching_ctl->progress = key.objectid; - total_found += add_new_free_space(block_group, key.objectid, key.objectid + key.offset); if (total_found > CACHING_CTL_WAKE_UP) { @@ -1552,8 +1551,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, goto out; } - caching_ctl->progress = (u64)-1; - ret = 0; out: return ret; diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c new file mode 100644 index 000000000000..5553e1f8afe8 --- /dev/null +++ b/fs/btrfs/fs.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "messages.h" +#include "ctree.h" +#include "fs.h" +#include "accessors.h" + +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = 
btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h new file mode 100644 index 000000000000..a749367e5ae2 --- /dev/null +++ b/fs/btrfs/fs.h @@ -0,0 +1,976 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FS_H +#define BTRFS_FS_H + +#include <linux/fs.h> +#include <linux/btrfs_tree.h> +#include <linux/sizes.h> +#include "extent-io-tree.h" +#include "extent_map.h" +#include "async-thread.h" +#include "block-rsv.h" + +#define BTRFS_MAX_EXTENT_SIZE SZ_128M + +#define BTRFS_OLDEST_GENERATION 0ULL + +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M + +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); + +/* + * The reserved space at the beginning of each device. It covers the primary + * super block and leaves space for potential use by other tools like + * bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) +/* + * Runtime (in-memory) states of filesystem + */ +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Filesystem in RO mode */ + BTRFS_FS_STATE_RO, + /* Track if a transaction abort has been reported on this filesystem */ + BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT +}; + +enum { + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, + + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + */ + BTRFS_FS_RELOC_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, + + /* Indicate that we need to cleanup space cache v1 */ + BTRFS_FS_CLEANUP_SPACE_CACHE_V1, + + /* Indicate that we can't trust the free space tree for caching yet */ + BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + + /* Indicate that we want the transaction kthread to commit right now. 
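Stepping back to the new fs.c above: all four helpers share one shape, an unlocked fast-path read followed by a re-check under super_lock, so the superblock update and the log line happen exactly once. A stripped-down sketch of that double-checked pattern, with invented names:

#include <linux/spinlock.h>
#include <linux/printk.h>
#include <linux/types.h>

static DEFINE_SPINLOCK(example_super_lock);
static u64 example_flags;

static void example_set_flag_once(u64 flag, const char *name)
{
	if (example_flags & flag)		/* lock-free fast path */
		return;
	spin_lock(&example_super_lock);
	if (!(example_flags & flag)) {		/* re-check under the lock */
		example_flags |= flag;
		pr_info("setting feature flag for %s (0x%llx)\n", name, flag);
	}
	spin_unlock(&example_super_lock);
}

Per the comment later in fs.h, readers may skip the lock because these feature flags change rarely and the lock only needs to serialize writers.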
*/ + BTRFS_FS_COMMIT_TRANS, + + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + + /* Indicate that we want to commit the transaction. */ + BTRFS_FS_NEED_TRANS_COMMIT, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif +}; + +/* + * Flags for mount options. + * + * Note: don't forget to add new options to btrfs_show_options() + */ +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), +}; + +/* + * Compat flags that we support. 
If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) + +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) +#endif + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (2048) + +struct btrfs_dev_replace { + /* See #define above */ + u64 replace_state; + /* Seconds since 1-Jan-1970 */ + time64_t time_started; + /* Seconds since 1-Jan-1970 */ + time64_t time_stopped; + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + /* See #define above */ + u64 cont_reading_from_srcdev_mode; + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + struct mutex lock_finishing_cancel_unmount; + struct rw_semaphore rwsem; + + struct btrfs_scrub_progress scrub_progress; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; +}; + +/* + * Free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations. In ssd_spread mode they are also used for data allocations. 
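For context on the *_SUPP masks defined above, here is a hedged sketch of how such masks are typically enforced at mount time; the function and error codes are illustrative, not the actual open_ctree() logic. Unknown incompat bits make the filesystem unreadable, so the mount must fail outright, while unknown compat_ro bits keep it readable and only rule out a writable mount:

#include <linux/types.h>
#include <linux/errno.h>

static int example_check_features(u64 incompat, u64 incompat_supp,
				  u64 compat_ro, u64 compat_ro_supp,
				  bool sb_rdonly)
{
	if (incompat & ~incompat_supp)
		return -EINVAL;		/* cannot safely read the fs */
	if (!sb_rdonly && (compat_ro & ~compat_ro_supp))
		return -EACCES;		/* a read-only mount may still work */
	return 0;
}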
+ */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* Largest extent in this cluster */ + u64 max_size; + + /* First extent starting offset */ + u64 window_start; + + /* We did a full search and couldn't create a cluster */ + bool fragmented; + + struct btrfs_block_group *block_group; + /* + * When a cluster is allocated from a block group, we put the cluster + * onto a list in the block group so that it can be freed before the + * block group is freed. + */ + struct list_head block_group_list; +}; + +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. Index 0 is for completely free block groups where we need to + * ensure the entire block group is trimmed without being lossy. Indices + * afterwards represent monotonically decreasing discard filter sizes to + * prioritize what should be discarded next. + */ +#define BTRFS_NR_DISCARD_LISTS 3 +#define BTRFS_DISCARD_INDEX_UNUSED 0 +#define BTRFS_DISCARD_INDEX_START 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; + u64 prev_discard; + u64 prev_discard_time; + atomic_t discardable_extents; + atomic64_t discardable_bytes; + u64 max_discard_size; + u64 delay_ms; + u32 iops_limit; + u32 kbps_limit; + u64 discard_extent_bytes; + u64 discard_bitmap_bytes; + atomic64_t discard_bytes_saved; +}; + +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + +/* Store data about transaction commits, exported via sysfs. */ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + +struct btrfs_fs_info { + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + unsigned long flags; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; + struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; + + /* The log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* Block group cache stuff */ + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; + + /* Keep track of unallocated space */ + atomic64_t free_chunk_space; + + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; + + /* logical->physical extent mapping */ + struct extent_map_tree mapping_tree; + + /* + * Block reservation for extent, checksum, root tree and delayed dir + * index item. 
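Backing up briefly to enum btrfs_exclusive_operation above: it feeds the btrfs_exclop_start() family declared later in this header, which gates balance, resize, and device add/remove/replace against each other. A minimal sketch of the gating idea, using the single slot that the fs_info comment below says is protected by super_lock; the types here are simplified stand-ins:

#include <linux/spinlock.h>
#include <linux/types.h>

enum example_exclop { EXAMPLE_EXCLOP_NONE, EXAMPLE_EXCLOP_BALANCE };

struct example_fs {
	spinlock_t super_lock;
	enum example_exclop exclusive_operation;
};

static bool example_exclop_start(struct example_fs *fs,
				 enum example_exclop type)
{
	bool ret = false;

	spin_lock(&fs->super_lock);
	if (fs->exclusive_operation == EXAMPLE_EXCLOP_NONE) {
		fs->exclusive_operation = type;	/* we own the slot now */
		ret = true;
	}
	spin_unlock(&fs->super_lock);
	return ret;
}

static void example_exclop_finish(struct example_fs *fs)
{
	spin_lock(&fs->super_lock);
	fs->exclusive_operation = EXAMPLE_EXCLOP_NONE;
	spin_unlock(&fs->super_lock);
}

Callers that lose the race report that another exclusive operation is already in progress rather than blocking.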
+ */ + struct btrfs_block_rsv global_block_rsv; + /* Block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* Block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; + /* Block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + u64 generation; + u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; + u64 avg_delayed_ref_runtime; + + /* + * This is updated to the current trans every time a full commit is + * required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt; + + unsigned long compress_type:4; + unsigned int compress_level; + u32 commit_interval; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ + u64 max_inline; + + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. + */ + spinlock_t super_lock; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; + struct super_block *sb; + struct inode *btree_inode; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + + /* + * This is taken to make sure we don't set block groups ro after the + * free space cache has been allocated on them. + */ + struct mutex ro_block_group_mutex; + + /* + * This is used during read/modify/write to make sure no two ios are + * trying to mod the same stripe at the same time. + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + + /* + * This protects the ordered operations list only while we are + * processing all of the entries on it. This way we make sure the + * commit code doesn't find the list temporarily empty because another + * function happens to be doing non-waiting preflush before jumping + * into the main commit. + */ + struct mutex ordered_operations_mutex; + + struct rw_semaphore commit_root_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + + spinlock_t trans_lock; + /* + * The reloc mutex goes with the trans lock, it is taken during commit + * to protect us from the relocation code. 
+ */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; + + atomic64_t tree_mod_seq; + + /* This protects tree_mod_log and tree_mod_seq_list */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; + + atomic_t async_delalloc_pages; + + /* This is used to protect the following list -- ordered_roots. */ + spinlock_t ordered_root_lock; + + /* + * All fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * + * These can span multiple transactions and basically include every + * dirty data page that isn't from nodatacow. + */ + struct list_head ordered_roots; + + struct mutex delalloc_root_mutex; + spinlock_t delalloc_root_lock; + /* All fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; + + /* + * There is a pool of worker threads for checksumming during writes and + * a pool for checksumming after reads. This is because readers can + * run with FS locks held, and the writers may be waiting for those + * locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other two. + */ + struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *rmw_workers; + struct workqueue_struct *compressed_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *caching_workers; + + /* + * Fixup workers take dirty pages that didn't properly go through the + * cow mechanism and make them safe to write. It happens for the + * sys_munmap function call path. + */ + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + u32 thread_pool_size; + + struct kobject *space_info_kobj; + struct kobject *qgroups_kobj; + struct kobject *discard_kobj; + + /* Used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + struct percpu_counter ordered_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * The space_info list is effectively read only after initial setup. + * It is populated at mount time and cleaned up after all block groups + * are removed. RCU is used to protect it. + */ + struct list_head space_info; + + struct btrfs_space_info *data_sinfo; + + struct reloc_control *reloc_ctl; + + /* data_alloc_cluster is only used in ssd_spread mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* All metadata allocations go through this cluster. */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* Auto defrag inodes go here. 
*/ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; + /* + * These three are in extended format (availability of single chunks is + * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted + * by corresponding BTRFS_BLOCK_GROUP_* bits) + */ + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + + /* Balance state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; + + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + + u32 data_chunk_allocations; + u32 metadata_ratio; + + void *bdev_holder; + + /* Private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + /* + * The worker pointers are NULL iff the refcount is 0, ie. scrub is not + * running. + */ + refcount_t scrub_workers_refcnt; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; + + struct btrfs_discard_ctl discard_ctl; + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* Is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* Holds configuration and tracking. Protected by qgroup_lock. */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* + * Used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* + * Protect user change for quota operations. If a transaction is needed, + * it must be started before locking this lock. + */ + struct mutex qgroup_ioctl_lock; + + /* List of dirty qgroups to be written at next commit. */ + struct list_head dirty_qgroups; + + /* Used by qgroup for an efficient tree traversal. */ + u64 qgroup_seq; + + /* Qgroup rescan items. */ + /* Protects the progress item */ + struct mutex qgroup_rescan_lock; + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workqueue *qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + /* Protected by qgroup_rescan_lock */ + bool qgroup_rescan_running; + u8 qgroup_drop_subtree_thres; + + /* Filesystem state */ + unsigned long fs_state; + + struct btrfs_delayed_root *delayed_root; + + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + /* Entries are eb->start / sectorsize */ + struct radix_tree_root buffer_radix; + + /* Next backup root to be overwritten */ + int backup_root_index; + + /* Device replace state */ + struct btrfs_dev_replace dev_replace; + + struct semaphore uuid_tree_rescan_sem; + + /* Used to reclaim the metadata space in the background. 
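The reclaim members that follow are ordinary deferred-work hooks: a cheap check in the hot path queues the work, and the heavy lifting runs later in process context. A reminder of the pattern with purely illustrative names (the real handlers appear to live in space-info.c and block-group.c):

#include <linux/workqueue.h>

static void example_reclaim_fn(struct work_struct *work)
{
	/* flush delalloc, run delayed refs, reclaim block groups, ... */
}
static DECLARE_WORK(example_reclaim_work, example_reclaim_fn);

static void example_maybe_kick_reclaim(bool over_threshold)
{
	if (over_threshold)
		schedule_work(&example_reclaim_work);
}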
*/ + struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; + + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; + + /* Cached block sizes */ + u32 nodesize; + u32 sectorsize; + /* ilog2 of sectorsize, use to avoid 64bit division */ + u32 sectorsize_bits; + u32 csum_size; + u32 csums_per_leaf; + u32 stripesize; + + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + + struct crypto_shash *csum_shash; + + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; + + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + u64 zone_size; + + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; + + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + u64 nr_global_roots; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; + + /* + * Last generation where we dropped a non-relocation root. + * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() + * to change it and to read it, respectively. + */ + u64 last_root_drop_gen; + + /* + * Annotations for transaction events (structures are empty when + * compiled without lockdep). + */ + struct lockdep_map btrfs_trans_num_writers_map; + struct lockdep_map btrfs_trans_num_extwriters_map; + struct lockdep_map btrfs_state_change_map[4]; + struct lockdep_map btrfs_trans_pending_ordered_map; + struct lockdep_map btrfs_ordered_extent_map; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif + +#ifdef CONFIG_BTRFS_DEBUG + struct kobject *debug_kobj; + struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; +#endif +}; + +static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, + u64 gen) +{ + WRITE_ONCE(fs_info->last_root_drop_gen, gen); +} + +static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_root_drop_gen); +} + +/* + * Take the number of bytes to be checksummed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} + +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. 
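Worked numbers for btrfs_csum_bytes_to_leaves() above and for the insert-size helper that follows; all figures are illustrative, since csums_per_leaf depends on the nodesize and the checksum type:

#include <linux/kernel.h>	/* DIV_ROUND_UP_ULL */
#include <linux/sizes.h>
#include <linux/types.h>

static u64 example_csum_leaves(void)
{
	const u32 sectorsize_bits = 12;			/* 4 KiB sectors */
	const u64 num_csums = SZ_1G >> sectorsize_bits;	/* 262144 csums */
	const u64 csums_per_leaf = 4000;	/* rough figure: crc32c, 16K leaf */

	/* DIV_ROUND_UP_ULL(262144, 4000) == 66 leaves for 1 GiB of data */
	return DIV_ROUND_UP_ULL(num_csums, csums_per_leaf);
}

/*
 * btrfs_calc_insert_metadata_size() below computes nodesize *
 * BTRFS_MAX_LEVEL * 2 * num_items: with a 16 KiB nodesize and
 * BTRFS_MAX_LEVEL == 8 that is 256 KiB reserved per item, worst case,
 * because an insert may COW and split one node at every level.
 */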
+ */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +} + +/* + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. + */ +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ + sizeof(struct btrfs_item)) + +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + +/* Compatibility and incompatibility defines */ +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); + +#define __btrfs_fs_incompat(fs_info, flags) \ + (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags))) + +#define __btrfs_fs_compat_ro(fs_info, flags) \ + (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags))) + +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ + BTRFS_MOUNT_##opt) + +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ +} while (0) + +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) 
\ +do { \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ +} while (0) + +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* Do it this way so we only ever do one test_bit in the normal case. */ + if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { + if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) + return 2; + return 1; + } + return 0; +} + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, + * since setting and checking for SB_RDONLY in the superblock's flags is not + * atomic. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || + btrfs_fs_closing(fs_info); +} + +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define EXPORT_FOR_TESTS + +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} + +void btrfs_test_destroy_inode(struct inode *inode); + +#else + +#define EXPORT_FOR_TESTS static + +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return 0; +} +#endif + +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 0eeb5ea87894..b65c45b5d681 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -4,14 +4,20 @@ */ #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "inode-item.h" #include "disk-io.h" #include "transaction.h" #include "print-tree.h" +#include "space-info.h" +#include "accessors.h" +#include "extent-tree.h" +#include "file-item.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len) + int slot, + const struct fscrypt_str *name) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -27,9 +33,10 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, len = btrfs_inode_ref_name_len(leaf, ref); name_ptr = (unsigned long)(ref + 1); cur_offset += len + sizeof(*ref); - if (len != name_len) + if (len != name->len) continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + if (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0) return ref; } return NULL; @@ -37,7 +44,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len) + const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -60,9 +67,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( name_ptr = (unsigned long)(&extref->name); ref_name_len = btrfs_inode_extref_name_len(leaf, extref); - if (ref_name_len == name_len && + if (ref_name_len == name->len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && - (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 
0)) + (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0)) return extref; cur_offset += ref_name_len + sizeof(*extref); @@ -75,7 +83,7 @@ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int name_len, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { @@ -84,7 +92,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) @@ -92,13 +100,13 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, if (ret > 0) return NULL; return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { @@ -107,14 +115,14 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_inode_extref *extref; struct extent_buffer *leaf; int ret; - int del_len = name_len + sizeof(*extref); + int del_len = name->len + sizeof(*extref); unsigned long ptr; unsigned long item_start; u32 item_size; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -132,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * readonly. */ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); if (!extref) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; @@ -168,8 +176,7 @@ out: } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; @@ -182,7 +189,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, u32 sub_item_len; int ret; int search_ext_refs = 0; - int del_len = name_len + sizeof(*ref); + int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -201,8 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } - ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name, - name_len); + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); if (!ref) { ret = -ENOENT; search_ext_refs = 1; @@ -219,7 +225,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } ptr = (unsigned long)ref; - sub_item_len = name_len + sizeof(*ref); + sub_item_len = name->len + sizeof(*ref); item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); @@ -233,7 +239,7 @@ out: * name in our ref array. Find and remove the extended * inode ref then. 
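The API change running through inode-item.c above replaces every (const char *name, int name_len) pair with a single const struct fscrypt_str *, which carries the buffer and length together and, under fscrypt, can also hold a no-key name. A before/after sketch of the comparison idiom, with plain memcmp() standing in for memcmp_extent_buffer():

#include <linux/fscrypt.h>
#include <linux/string.h>
#include <linux/types.h>

/* before: two loosely coupled parameters */
static bool example_match_old(const char *cand, int cand_len,
			      const char *name, int name_len)
{
	return cand_len == name_len && !memcmp(cand, name, name_len);
}

/* after: one struct, so a mismatched length cannot be passed */
static bool example_match_new(const struct fscrypt_str *cand,
			      const struct fscrypt_str *name)
{
	return cand->len == name->len &&
	       !memcmp(cand->name, name->name, name->len);
}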
*/ - return btrfs_del_inode_extref(trans, root, name, name_len, + return btrfs_del_inode_extref(trans, root, name, inode_objectid, ref_objectid, index); } @@ -247,12 +253,13 @@ out: */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index) + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, + u64 index) { struct btrfs_inode_extref *extref; int ret; - int ins_len = name_len + sizeof(*extref); + int ins_len = name->len + sizeof(*extref); unsigned long ptr; struct btrfs_path *path; struct btrfs_key key; @@ -260,7 +267,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -272,7 +279,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, name_len)) + name)) goto out; btrfs_extend_item(path, ins_len); @@ -286,12 +293,12 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; extref = (struct btrfs_inode_extref *)ptr; - btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len); btrfs_set_inode_extref_index(path->nodes[0], extref, index); btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); ptr = (unsigned long)&extref->name; - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -301,8 +308,7 @@ out: /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -311,7 +317,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; unsigned long ptr; int ret; - int ins_len = name_len + sizeof(*ref); + int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -327,7 +333,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EEXIST) { u32 old_size; ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len); + name); if (ref) goto out; @@ -336,7 +342,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); ret = 0; @@ -344,7 +350,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len)) + name)) ret = -EEXIST; else ret = -EMLINK; @@ -353,11 +359,11 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, } 
else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); } - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -370,7 +376,6 @@ out: if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) ret = btrfs_insert_inode_extref(trans, root, name, - name_len, inode_objectid, ref_objectid, index); } diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index a8fc16d0147f..b80aeb715701 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -64,33 +64,31 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); + struct btrfs_root *root, const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int name_len, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len); + int slot, + const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len); + const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1372210869b1..8bcad9940154 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,7 +43,7 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" -#include "volumes.h" +#include "bio.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" @@ -55,6 +55,21 @@ #include "zoned.h" #include "subpage.h" #include "inode-item.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "file-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "file.h" +#include "acl.h" +#include "relocation.h" +#include "verity.h" +#include "super.h" +#include "orphan.h" struct btrfs_iget_args { u64 ino; @@ -69,7 +84,7 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct inode *inode; + struct btrfs_inode *inode; /* * Since DIO can use anonymous page, we cannot use page_offset() to @@ -107,13 +122,9 @@ static const struct 
address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_path_cachep; -struct kmem_cache *btrfs_free_space_cachep; -struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); -static int btrfs_truncate(struct inode *inode, bool skip_writeback); +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -125,6 +136,32 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); +static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) +{ + struct btrfs_root *root = inode->root; + const u32 csum_size = root->fs_info->csum_size; + + /* Output without objectid, which is more meaningful */ + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + btrfs_warn_rl(root->fs_info, +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } else { + btrfs_warn_rl(root->fs_info, +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } +} + /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * @@ -135,27 +172,27 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock_shared(inode)) + if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock_shared(inode); + inode_lock_shared(&inode->vfs_inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock(inode)) + if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock(inode); + inode_lock(&inode->vfs_inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) - down_write(&BTRFS_I(inode)->i_mmap_lock); + down_write(&inode->i_mmap_lock); return 0; } @@ -165,14 +202,14 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. 
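Usage sketch for the btrfs_inode_lock()/btrfs_inode_unlock() pair above; the calling function is invented, and the flag semantics are as documented in the comment. The same ilock_flags value must go to both calls so they agree on shared versus exclusive and on the extra i_mmap_lock:

static int example_shared_op(struct btrfs_inode *inode)
{
	const unsigned int ilock_flags = BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY;
	int ret;

	ret = btrfs_inode_lock(inode, ilock_flags);
	if (ret)	/* -EAGAIN: the trylock failed, caller may retry */
		return ret;

	/* ... read-mostly work under the shared i_rwsem ... */

	btrfs_inode_unlock(inode, ilock_flags);
	return 0;
}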
*/ -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) - up_write(&BTRFS_I(inode)->i_mmap_lock); + up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) - inode_unlock_shared(inode); + inode_unlock_shared(&inode->vfs_inode); else - inode_unlock(inode); + inode_unlock(&inode->vfs_inode); } /* @@ -249,7 +286,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } -static int btrfs_dirty_inode(struct inode *inode); +static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) @@ -483,7 +520,7 @@ struct async_extent { }; struct async_chunk { - struct inode *inode; + struct btrfs_inode *inode; struct page *locked_page; u64 start; u64 end; @@ -611,8 +648,8 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, */ static noinline int compress_file_range(struct async_chunk *async_chunk) { - struct inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = async_chunk->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -629,8 +666,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) int compressed_extents = 0; int redirty = 0; - inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, - SZ_16K); + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* * We need to save i_size before now because it could change in between @@ -642,7 +678,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) * does that for us. */ barrier(); - i_size = i_size_read(inode); + i_size = i_size_read(&inode->vfs_inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: @@ -671,7 +707,7 @@ again: * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; /* @@ -695,7 +731,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (inode_need_compress(inode, start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -704,10 +740,10 @@ again: goto cont; } - if (BTRFS_I(inode)->defrag_compress) - compress_type = BTRFS_I(inode)->defrag_compress; - else if (BTRFS_I(inode)->prop_compress) - compress_type = BTRFS_I(inode)->prop_compress; + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; /* * we need to call clear_page_dirty_for_io on each @@ -722,14 +758,14 @@ again: * has moved, the end is the original one. 
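A note on the churn through the rest of this file: functions now take struct btrfs_inode * directly instead of struct inode *, so call sites swap BTRFS_I(inode) for &inode->vfs_inode and vice versa. Both directions are cheap because the VFS inode is embedded in the btrfs one; a generic sketch of the layout with simplified types:

#include <linux/container_of.h>
#include <linux/fs.h>
#include <linux/types.h>

struct example_btrfs_inode {
	/* btrfs-private state sits alongside the embedded VFS inode */
	u64 delalloc_bytes;
	struct inode vfs_inode;
};

/* the equivalent of BTRFS_I(): pointer arithmetic, no lookup */
static inline struct example_btrfs_inode *EXAMPLE_I(struct inode *inode)
{
	return container_of(inode, struct example_btrfs_inode, vfs_inode);
}

Threading the btrfs type through avoids repeated BTRFS_I() conversions and makes explicit which layer each helper belongs to.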
*/ if (!redirty) { - extent_range_clear_dirty_for_io(inode, start, end); + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); redirty = 1; } /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), - inode->i_mapping, start, + inode->vfs_inode.i_mapping, start, pages, &nr_pages, &total_in, @@ -758,12 +794,12 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, 0, BTRFS_COMPRESS_NONE, NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, false); @@ -786,7 +822,7 @@ cont: * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + extent_clear_unlock_delalloc(inode, start, end, NULL, clear_flags, PAGE_UNLOCK | @@ -861,8 +897,8 @@ cont: /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(BTRFS_I(inode)->prop_compress)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + !(inode->prop_compress)) { + inode->flags |= BTRFS_INODE_NOCOMPRESS; } } cleanup_and_bail_uncompressed: @@ -880,7 +916,7 @@ cleanup_and_bail_uncompressed: } if (redirty) - extent_range_redirty_for_io(inode, start, end); + extent_range_redirty_for_io(&inode->vfs_inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); compressed_extents++; @@ -977,7 +1013,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, if (!(start >= locked_page_end || end <= locked_page_start)) locked_page = async_chunk->locked_page; } - lock_extent(io_tree, start, end); + lock_extent(io_tree, start, end, NULL); /* We have fall back to uncompressed write */ if (!async_extent->pages) @@ -1024,7 +1060,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, 1 << BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_map_range(inode, start, end, false); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1076,7 +1112,7 @@ out_free: */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; @@ -1254,7 +1290,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); - btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); /* * Relocation relies on the relocated extents to have exactly the same @@ -1319,8 +1354,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * skip current ordered extent. 
*/ if (ret) - btrfs_drop_extent_cache(inode, start, - start + ram_size - 1, 0); + btrfs_drop_extent_map_range(inode, start, + start + ram_size - 1, + false); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1360,7 +1396,7 @@ out: return ret; out_drop_extent_cache: - btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); + btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1524,7 +1560,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); - unlock_extent(&inode->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end, NULL); if (inode->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { @@ -1565,7 +1601,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, */ ihold(&inode->vfs_inode); async_chunk[i].async_cow = ctx; - async_chunk[i].inode = &inode->vfs_inode; + async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; @@ -1666,15 +1702,15 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes) + u64 bytenr, u64 num_bytes, bool nowait) { struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); struct btrfs_ordered_sum *sums; int ret; LIST_HEAD(list); - ret = btrfs_lookup_csums_range(csum_root, bytenr, - bytenr + num_bytes - 1, &list, 0); + ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, + &list, 0, nowait); if (ret == 0 && list_empty(&list)) return 0; @@ -1732,7 +1768,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0); + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1747,7 +1783,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, if (count > 0) clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - 0, 0, NULL); + NULL); } return cow_file_range(inode, locked_page, start, end, page_started, @@ -1800,6 +1836,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, u8 extent_type; int can_nocow = 0; int ret = 0; + bool nowait = path->nowait; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); @@ -1876,7 +1913,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. 
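On the check above: a NOCOW overwrite is only legal when no checksums cover the target range, because an in-place write cannot atomically update its csums. A hedged sketch of the decision as the surrounding code appears to apply it, with the return convention inferred from this hunk:

/* csum_lookup: <0 error, 0 no csums found, >0 csums exist */
static int example_may_nocow(int csum_lookup)
{
	if (csum_lookup < 0)
		return csum_lookup;	/* propagate, e.g. -EAGAIN in nowait mode */
	return csum_lookup == 0;	/* any csums present force COW */
}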
*/ - ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes); + ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, + nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -2099,8 +2137,8 @@ out_check: 1 << BTRFS_ORDERED_PREALLOC, BTRFS_COMPRESS_NONE); if (ret) { - btrfs_drop_extent_cache(inode, cur_offset, - nocow_end, 0); + btrfs_drop_extent_map_range(inode, cur_offset, + nocow_end, false); goto error; } } else { @@ -2237,10 +2275,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page return ret; } -void btrfs_split_delalloc_extent(struct inode *inode, +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; /* not delalloc, ignore it */ @@ -2264,9 +2302,9 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; } - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); } /* @@ -2274,10 +2312,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. */ -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 new_size, old_size; u32 num_extents; @@ -2292,9 +2330,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); return; } @@ -2323,22 +2361,20 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, if (count_max_extents(fs_info, new_size) >= num_extents) return; - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); + if (list_empty(&inode->delalloc_inodes)) { + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); @@ -2351,7 +2387,6 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, spin_unlock(&root->delalloc_lock); } - void 
__btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { @@ -2384,10 +2419,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. */ -void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, +void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); @@ -2397,14 +2432,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); + bool do_list = !btrfs_is_free_space_inode(inode); - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, num_extents); + spin_unlock(&inode->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -2412,22 +2447,21 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->delalloc_bytes += len; + spin_lock(&inode->lock); + inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) - BTRFS_I(inode)->defrag_bytes += len; + inode->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) btrfs_add_delalloc_inodes(root, inode); - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); } if (!(state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - - state->start; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->new_delalloc_bytes += state->end + 1 - state->start; + spin_unlock(&inode->lock); } } @@ -2435,11 +2469,10 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * Once a range is no longer delalloc this function ensures that proper * accounting happens. 
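The delalloc hooks above charge or release one outstanding extent per count_max_extents() worth of bytes. The arithmetic is a round-up division by fs_info->max_extent_size; a standalone sketch, with 128M assumed as the usual non-zoned default purely for illustration:

#include <stdio.h>

/* One outstanding extent per max_extent_size chunk, rounding up. */
static unsigned int count_max_extents(unsigned long long len,
				      unsigned long long max_extent_size)
{
	return (len + max_extent_size - 1) / max_extent_size;
}

int main(void)
{
	const unsigned long long max_extent_size = 128ULL << 20; /* 128M */

	/* A 300M delalloc range reserves metadata for 3 extents. */
	printf("%u\n", count_max_extents(300ULL << 20, max_extent_size));
	return 0;
}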
*/ -void btrfs_clear_delalloc_extent(struct inode *vfs_inode, +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_inode *inode = BTRFS_I(vfs_inode); - struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); @@ -2510,10 +2543,9 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + return btrfs_csum_one_bio(inode, bio, (u64)-1, false); } /* @@ -2548,7 +2580,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, ASSERT(pre + post < len); - lock_extent(&inode->io_tree, start, start + len - 1); + lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (!em) { @@ -2622,7 +2654,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, out_unlock: write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); out: free_extent_map(split_pre); free_extent_map(split_mid); @@ -2691,17 +2723,18 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(bi, bio, + ret = extract_ordered_extent(inode, bio, page_offset(bio_first_bvec_all(bio)->bv_page)); - if (ret) - goto out; + if (ret) { + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; + } } /* @@ -2712,31 +2745,26 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro * Csum items for reloc roots have already been cloned at this point, * so they are handled as part of the no-checksum case. 
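A pattern recurring through these hunks replaces open-coded "bio->bi_status = ret; bio_endio(bio);" pairs with a single btrfs_bio_end_io() call. A toy model of that consolidation with mocked types; this sketches the pattern only, not the kernel API:

#include <stdio.h>

struct bio {
	int status;			/* stand-in for bi_status */
	void (*end_io)(struct bio *);
};

/* One helper records the error and completes the bio. */
static void bio_end_io_with_status(struct bio *bio, int status)
{
	bio->status = status;
	bio->end_io(bio);
}

static void done(struct bio *bio)
{
	printf("completed, status=%d\n", bio->status);
}

int main(void)
{
	struct bio bio = { .end_io = done };

	bio_end_io_with_status(&bio, -5);	/* -EIO-style failure */
	return 0;
}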
*/ - if (!(bi->flags & BTRFS_INODE_NODATASUM) && + if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(bi->root)) { - if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btrfs_submit_bio_start)) + !btrfs_is_data_reloc_root(inode->root)) { + if (!atomic_read(&inode->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; - ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); - if (ret) - goto out; + ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); + if (ret) { + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; + } } btrfs_submit_bio(fs_info, bio, mirror_num); - return; -out: - if (ret) { - bio->bi_status = ret; - bio_endio(bio); - } } -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (compress_type != BTRFS_COMPRESS_NONE) { @@ -2744,7 +2772,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * btrfs_submit_compressed_read will handle completing the bio * if there were any errors, so just return here. */ - btrfs_submit_compressed_read(inode, bio, mirror_num); + btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); return; } @@ -2755,10 +2783,9 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * Lookup bio sums does extra checks around whether we need to csum or * not, which is why we ignore skip_sum here. */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); return; } @@ -2818,8 +2845,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, - EXTENT_DELALLOC_NEW, 0, NULL, cached_state, - GFP_NOFS, NULL); + EXTENT_DELALLOC_NEW, cached_state, + GFP_NOFS); next: search_start = extent_map_end(em); free_extent_map(em); @@ -2859,7 +2886,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2878,7 +2905,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; - inode = BTRFS_I(fixup->inode); + inode = fixup->inode; page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; @@ -2931,7 +2958,7 @@ again: if (ret) goto out_page; - lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? 
We're done */ if (PageOrdered(page)) @@ -2939,8 +2966,8 @@ again: ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); unlock_page(page); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); @@ -2966,8 +2993,7 @@ out_reserved: if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* @@ -2989,7 +3015,7 @@ out_page: * that could need flushing space. Recursing back to fixup worker would * deadlock. */ - btrfs_add_delayed_iput(&inode->vfs_inode); + btrfs_add_delayed_iput(inode); } /* @@ -3038,7 +3064,7 @@ int btrfs_writepage_cow_fixup(struct page *page) get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - fixup->inode = inode; + fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; @@ -3225,6 +3251,8 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); + if (!freespace_inode) + btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; @@ -3269,7 +3297,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } clear_bits |= EXTENT_LOCKED; - lock_extent_bits(io_tree, start, end, &cached_state); + lock_extent(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -3325,7 +3353,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) clear_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - 0, 0, &cached_state); + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); @@ -3336,7 +3364,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, &cached_state); if (trans) @@ -3361,8 +3388,8 @@ out: unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(inode, unwritten_start, end, 0); + /* Drop extent maps for the part of the extent we didn't write. 
*/ + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); /* * If the ordered extent had an IOERR or something else went @@ -3439,6 +3466,13 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, return 0; } +static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset) +{ + u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; + + return csums + offset_in_sectors * fs_info->csum_size; +} + /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode @@ -3452,10 +3486,10 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, * When csum mismatch is detected, we will also report the error and fill the * corrupted range with zero. (Thus it needs the extra parameters) */ -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u32 len = fs_info->sectorsize; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; @@ -3469,8 +3503,7 @@ int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), - bbio->file_offset + bio_offset, + btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, @@ -3495,10 +3528,10 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; @@ -3511,7 +3544,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, if (bbio->csum == NULL) return 0; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) return 0; if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) @@ -3556,18 +3589,17 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. 
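The comment ending just above describes the delayed iput machinery, and the following hunk converts btrfs_add_delayed_iput() to take a btrfs_inode. A simplified, single-threaded model of its drop-or-defer decision (not the kernel implementation):

#include <stdbool.h>
#include <stdio.h>

struct obj {
	int refcount;
	bool queued;
};

static void add_delayed_put(struct obj *o)
{
	if (o->refcount > 1) {	/* atomic_add_unless(&count, -1, 1) */
		o->refcount--;
		return;
	}
	o->queued = true;	/* last ref: hand off to the cleaner */
}

int main(void)
{
	struct obj o = { .refcount = 1 };

	add_delayed_put(&o);
	printf("refcount=%d queued=%d\n", o.refcount, (int)o.queued);
	return 0;
}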
*/ -void btrfs_add_delayed_iput(struct inode *inode) +void btrfs_add_delayed_iput(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (atomic_add_unless(&inode->i_count, -1, 1)) + if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); - ASSERT(list_empty(&binode->delayed_iput)); - list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + ASSERT(list_empty(&inode->delayed_iput)); + list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); @@ -3610,7 +3642,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delayed_iput_lock); } -/** +/* * Wait for flushing all delayed iputs * * @fs_info: the filesystem @@ -4255,7 +4287,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; @@ -4273,8 +4305,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto out; } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto err; @@ -4302,12 +4333,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } } - ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, - dir_ino, &index); + ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { btrfs_info(fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", - name_len, name, ino, dir_ino); + name->len, name->name, ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } @@ -4328,10 +4358,8 @@ skip_backref: * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); + btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, dir, index); } /* @@ -4349,7 +4377,7 @@ err: if (ret) goto out; - btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); @@ -4362,10 +4390,11 @@ out: int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const struct fscrypt_str *name) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); + + ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4381,9 +4410,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur. 
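Returning to the btrfs_csum_ptr() helper added a little earlier: it is pure pointer arithmetic from byte offset to checksum slot. A standalone version, assuming 4K sectors and 4-byte crc32c checksums purely for the example:

#include <stdint.h>
#include <stdio.h>

static uint8_t *csum_ptr(uint8_t *csums, uint64_t offset,
			 uint32_t sectorsize_bits, uint32_t csum_size)
{
	/* byte offset -> sector index -> csum slot */
	return csums + (offset >> sectorsize_bits) * csum_size;
}

int main(void)
{
	uint8_t csums[64] = { 0 };

	/* offset 8192 with 4K sectors is sector 2 -> byte 8 of the array */
	printf("slot offset: %td\n", csum_ptr(csums, 8192, 12, 4) - csums);
	return 0;
}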
*/ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) +static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; /* * 1 for the possible orphan item @@ -4401,47 +4430,62 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; + struct fscrypt_name fname; - trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + + /* This needs to handle no-key deletions later on */ + + trans = __unlink_start_trans(BTRFS_I(dir)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fscrypt_free; + } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &fname.disk_name); if (ret) - goto out; + goto end_trans; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) - goto out; + goto end_trans; } -out: +end_trans: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); +fscrypt_free: + fscrypt_free_filename(&fname); return ret; } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct inode *dir, struct dentry *dentry) + struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - const char *name = dentry->d_name.name; - int name_len = dentry->d_name.len; u64 index; int ret; u64 objectid; - u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + u64 dir_ino = btrfs_ino(dir); + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); + if (ret) + return ret; + + /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = inode->root->root_key.objectid; @@ -4449,15 +4493,18 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, objectid = inode->location.objectid; } else { WARN_ON(1); + fscrypt_free_filename(&fname); return -EINVAL; } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + &fname.disk_name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -4483,8 +4530,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. 
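The unlink conversions above all take the same shape: set up an fscrypt_name first, then route every exit through a label that frees it. A sketch of that acquire-use-release flow; fscrypt_setup_filename() and fscrypt_free_filename() are the real kernel entry points, while the stubs below only model the control flow:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fname { char *name; };

static int setup_filename(struct fname *f, const char *src)
{
	f->name = strdup(src);
	return f->name ? 0 : -12;	/* -ENOMEM */
}

static void free_filename(struct fname *f)
{
	free(f->name);
}

static int unlink_like(const char *src)
{
	struct fname f;
	int ret = setup_filename(&f, src);

	if (ret)
		return ret;		/* nothing to free yet */
	printf("unlinking %s\n", f.name);
	/* ... transaction work; all failures jump to the cleanup ... */
	free_filename(&f);		/* the fscrypt_free label in the diff */
	return 0;
}

int main(void)
{
	return unlink_like("victim") ? 1 : 0;
}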
*/ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, - name, name_len); + di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4501,28 +4547,29 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, name, name_len); + &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } - ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); - inode_inc_iversion(dir); - dir->i_mtime = current_time(dir); - dir->i_ctime = dir->i_mtime; - ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); + inode_inc_iversion(&dir->vfs_inode); + dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); + dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; + ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return ret; } @@ -4536,6 +4583,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; int ret; @@ -4546,7 +4594,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) /* Make sure this root isn't set as the default subvol */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, - dir_id, "default", 7, 0); + dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { @@ -4645,10 +4693,10 @@ again: spin_unlock(&root->inode_lock); } -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4705,7 +4753,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { @@ -4785,6 +4833,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; + struct fscrypt_name fname; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4794,15 +4843,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(dir, dentry); + return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } - trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (err) + return err; + + /* This needs to handle no-key 
deletions later on */ + + trans = __unlink_start_trans(BTRFS_I(dir)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_notrans; + } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, dir, dentry); + err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } @@ -4813,9 +4870,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &fname.disk_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -4834,7 +4890,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); +out_notrans: btrfs_btree_balance_dirty(fs_info); + fscrypt_free_filename(&fname); return err; } @@ -4878,9 +4936,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, - blocksize); + blocksize, false); if (ret < 0) { - if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { @@ -4922,12 +4980,11 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, block_start, block_end, &cached_state); + lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent_cached(io_tree, block_start, block_end, - &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); btrfs_start_ordered_extent(ordered, 1); @@ -4937,13 +4994,12 @@ again: clear_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent_cached(io_tree, block_start, block_end, - &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } @@ -4960,11 +5016,11 @@ again: btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); - unlock_extent_cached(io_tree, block_start, block_end, &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); + EXTENT_NORESERVE, NULL, GFP_NOFS); out_unlock: if (ret) { @@ -5021,8 +5077,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, return ret; } - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), - offset, 0, 0, len, 0, len, 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len); if (ret) { btrfs_abort_transaction(trans, ret); } else { @@ -5046,7 +5101,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - struct extent_map_tree 
*em_tree = &inode->extent_tree; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); u64 block_end = ALIGN(size, fs_info->sectorsize); u64 last_byte; @@ -5094,10 +5148,11 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (err) break; - btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { + btrfs_drop_extent_map_range(inode, cur_offset, + cur_offset + hole_size - 1, + false); btrfs_set_inode_full_sync(inode); goto next; } @@ -5112,16 +5167,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = fs_info->generation; - while (1) { - write_lock(&em_tree->lock); - err = add_extent_mapping(em_tree, hole_em, 1); - write_unlock(&em_tree->lock); - if (err != -EEXIST) - break; - btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + - hole_size - 1, 0); - } + err = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { err = btrfs_inode_set_file_extent_range(inode, @@ -5137,7 +5183,7 @@ next: break; } free_extent_map(em); - unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); + unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return err; } @@ -5215,7 +5261,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode_dio_wait(inode); - ret = btrfs_truncate(inode, newsize == oldsize); + ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { int err; @@ -5258,10 +5304,10 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (attr->ia_valid) { setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(inode); + err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); } return err; @@ -5271,7 +5317,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr * While truncating the inode pages during eviction, we get the VFS * calling btrfs_invalidate_folio() against each folio of the inode. This * is slow because the calls to btrfs_invalidate_folio() result in a - * huge amount of calls to lock_extent_bits() and clear_extent_bit(), + * huge amount of calls to lock_extent() and clear_extent_bit(), * which keep merging and splitting extent_state structures over and over, * wasting lots of time. 
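The hole-punching hunk above drops the old retry loop (add_extent_mapping(), drop the range on -EEXIST, try again) in favor of btrfs_replace_extent_map_range(), which evicts and inserts in one step. A single-slot toy model of the two shapes (types and names are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define EEXIST 17

struct tree { bool occupied; };

static int add_mapping(struct tree *t)
{
	if (t->occupied)
		return -EEXIST;
	t->occupied = true;
	return 0;
}

/* New helper: evict whatever overlaps, then insert exactly once. */
static int replace_mapping(struct tree *t)
{
	t->occupied = false;	/* drop_extent_map_range(...) */
	return add_mapping(t);
}

int main(void)
{
	struct tree t = { .occupied = true };

	printf("add: %d, replace: %d\n", add_mapping(&t), replace_mapping(&t));
	return 0;
}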
* @@ -5283,29 +5329,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr static void evict_inode_truncate_pages(struct inode *inode) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; struct rb_node *node; ASSERT(inode->i_state & I_FREEING); truncate_inode_pages_final(&inode->i_data); - write_lock(&map_tree->lock); - while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { - struct extent_map *em; - - node = rb_first_cached(&map_tree->map); - em = rb_entry(node, struct extent_map, rb_node); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - remove_extent_mapping(map_tree, em); - free_extent_map(em); - if (need_resched()) { - write_unlock(&map_tree->lock); - cond_resched(); - write_lock(&map_tree->lock); - } - } - write_unlock(&map_tree->lock); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* * Keep looping until we have no more ranges in the io tree. @@ -5338,7 +5367,7 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, start, end, &cached_state); + lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5353,8 +5382,7 @@ static void evict_inode_truncate_pages(struct inode *inode) end - start + 1); clear_extent_bit(io_tree, start, end, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, &cached_state); cond_resched(); @@ -5534,22 +5562,27 @@ no_delete: * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ -static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, +static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; struct btrfs_dir_item *di; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret = 0; + struct fscrypt_name fname; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - name, namelen, 0); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); + if (ret) + goto out; + + /* This needs to handle no-key deletions later on */ + + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), + &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -5561,12 +5594,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name, btrfs_ino(BTRFS_I(dir)), + __func__, fname.disk_name.name, btrfs_ino(dir), location->objectid, location->type, location->offset); } if (!ret) - *type = btrfs_dir_type(path->nodes[0], di); + *type = btrfs_dir_ftype(path->nodes[0], di); out: + fscrypt_free_filename(&fname); btrfs_free_path(path); return ret; } @@ -5577,7 +5611,7 @@ out: * is kind of like crossing a mount point. 
*/ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, - struct inode *dir, + struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root) @@ -5589,6 +5623,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key key; int ret; int err = 0; + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); + if (ret) + return ret; path = btrfs_alloc_path(); if (!path) { @@ -5597,7 +5636,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.objectid = dir->root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5610,13 +5649,12 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || - btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || + btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; - ret = memcmp_extent_buffer(leaf, dentry->d_name.name, - (unsigned long)(ref + 1), - dentry->d_name.len); + ret = memcmp_extent_buffer(leaf, fname.disk_name.name, + (unsigned long)(ref + 1), fname.disk_name.len); if (ret) goto out; @@ -5635,19 +5673,20 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, err = 0; out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return err; } -static void inode_tree_add(struct inode *inode) +static void inode_tree_add(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; - struct rb_node *new = &BTRFS_I(inode)->rb_node; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct rb_node *new = &inode->rb_node; + u64 ino = btrfs_ino(inode); - if (inode_unhashed(inode)) + if (inode_unhashed(&inode->vfs_inode)) return; parent = NULL; spin_lock(&root->inode_lock); @@ -5707,6 +5746,11 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); BUG_ON(args->root && !BTRFS_I(inode)->root); + + if (args->root && args->root == args->root->fs_info->tree_root && + args->ino != BTRFS_BTREE_INODE_OBJECTID) + set_bit(BTRFS_INODE_FREE_SPACE_INODE, + &BTRFS_I(inode)->runtime_flags); return 0; } @@ -5754,7 +5798,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, ret = btrfs_read_locked_inode(inode, path); if (!ret) { - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); unlock_new_inode(inode); } else { iget_failed(inode); @@ -5834,7 +5878,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); + ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); @@ -5855,7 +5899,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - ret = fixup_tree_root_location(fs_info, dir, dentry, + ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, &location, &sub_root); if (ret < 0) { if (ret != -ENOENT) @@ -6003,6 +6047,7 @@ again: btrfs_for_each_slot(root, &key, &found_key, 
path, ret) { struct dir_entry *entry; struct extent_buffer *leaf = path->nodes[0]; + u8 ftype; if (found_key.objectid != key.objectid) break; @@ -6026,13 +6071,13 @@ again: goto again; } + ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); entry = addr; - put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); - read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), - name_len); - put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), - &entry->type); + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + put_unaligned(name_len, &entry->name_len); + put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); put_unaligned(found_key.offset, &entry->offset); @@ -6090,21 +6135,21 @@ err: * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -static int btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret; - if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) + if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) return 0; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); @@ -6112,10 +6157,10 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); } btrfs_end_transaction(trans); - if (BTRFS_I(inode)->delayed_node) + if (inode->delayed_node) btrfs_balance_delayed_items(fs_info); return ret; @@ -6142,7 +6187,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; - return dirty ? btrfs_dirty_inode(inode) : 0; + return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } /* @@ -6238,9 +6283,18 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, struct inode *inode = args->inode; int ret; + if (!args->orphan) { + ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, + &args->fname); + if (ret) + return ret; + } + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); - if (ret) + if (ret) { + fscrypt_free_filename(&args->fname); return ret; + } /* 1 to add inode item */ *trans_num_items = 1; @@ -6280,6 +6334,7 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) { posix_acl_release(args->acl); posix_acl_release(args->default_acl); + fscrypt_free_filename(&args->fname); } /* @@ -6287,27 +6342,27 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) * * Currently only the compression flags and the cow flags are inherited. 
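The next hunk shows btrfs_inherit_iflags() working purely on btrfs_inode flags. Its inheritance rules in isolation, with made-up flag values (the on-disk bits differ):

#include <stdio.h>

#define FL_NODATACOW	0x01
#define FL_NODATASUM	0x02
#define FL_COMPRESS	0x04
#define FL_NOCOMPRESS	0x08

static unsigned int inherit_iflags(unsigned int dir_flags, int is_reg)
{
	unsigned int flags = 0;

	if (dir_flags & FL_NOCOMPRESS)
		flags |= FL_NOCOMPRESS;
	else if (dir_flags & FL_COMPRESS)
		flags |= FL_COMPRESS;
	if (dir_flags & FL_NODATACOW) {
		flags |= FL_NODATACOW;
		if (is_reg)
			flags |= FL_NODATASUM;	/* no COW -> no csums */
	}
	return flags;
}

int main(void)
{
	printf("0x%x\n", inherit_iflags(FL_NODATACOW, 1));	/* 0x3 */
	return 0;
}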
*/ -static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) { unsigned int flags; - flags = BTRFS_I(dir)->flags; + flags = dir->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags |= BTRFS_INODE_NOCOMPRESS; } else if (flags & BTRFS_INODE_COMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; } if (flags & BTRFS_INODE_NODATACOW) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + inode->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->vfs_inode.i_mode)) + inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(inode); + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6315,8 +6370,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, { struct inode *dir = args->dir; struct inode *inode = args->inode; - const char *name = args->orphan ? NULL : args->dentry->d_name.name; - int name_len = args->orphan ? 0 : args->dentry->d_name.len; + const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; @@ -6367,7 +6421,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * change it now without compatibility issues. */ if (!args->subvol) - btrfs_inherit_iflags(inode, dir); + btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) @@ -6417,7 +6471,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, sizes[1] = 2 + sizeof(*ref); } else { key[1].offset = btrfs_ino(BTRFS_I(dir)); - sizes[1] = name_len + sizeof(*ref); + sizes[1] = name->len + sizeof(*ref); } } @@ -6456,10 +6510,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_ref_index(path->nodes[0], ref, 0); write_extent_buffer(path->nodes[0], "..", ptr, 2); } else { - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, + name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->dir_index); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, + name->len); } } @@ -6509,7 +6565,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); @@ -6520,7 +6576,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 0, BTRFS_I(inode)->dir_index); + 0, BTRFS_I(inode)->dir_index); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -6549,7 +6605,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index) + const struct fscrypt_str *name, int add_backref, u64 index) { int ret = 0; struct 
btrfs_key key; @@ -6568,17 +6624,17 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - index, name, name_len); + index, name); } else if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, - parent_ino, index); + ret = btrfs_insert_inode_ref(trans, root, name, + ino, parent_ino, index); } /* Nothing to clean up yet */ if (ret) return ret; - ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; @@ -6588,7 +6644,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + - name_len * 2); + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); /* * If we are replaying a log tree, we do not want to update the mtime @@ -6613,15 +6669,15 @@ fail_dir_item: int err; err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - &local_index, name, name_len); + &local_index, name); if (err) btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; - err = btrfs_del_inode_ref(trans, root, name, name_len, - ino, parent_ino, &local_index); + err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, + &local_index); if (err) btrfs_abort_transaction(trans, err); } @@ -6704,6 +6760,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct fscrypt_name fname; u64 index; int err; int drop_inode = 0; @@ -6715,6 +6772,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto fail; + err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6741,7 +6802,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, dentry->d_name.len, 1, index); + &fname.disk_name, 1, index); if (err) { drop_inode = 1; @@ -6765,6 +6826,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } fail: + fscrypt_free_filename(&fname); if (trans) btrfs_end_transaction(trans); if (drop_inode) { @@ -6791,7 +6853,6 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, - size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { int ret; @@ -6802,7 +6863,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, unsigned long ptr; int compress_type; - WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); @@ -6814,8 +6874,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, - extent_offset, inline_size, 
max_size); + ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6825,25 +6884,52 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) - memzero_page(page, pg_offset + max_size, - PAGE_SIZE - max_size - pg_offset); + if (max_size < PAGE_SIZE) + memzero_page(page, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } -/** - * btrfs_get_extent - Lookup the first extent overlapping a range in a file. +static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, + struct page *page) +{ + struct btrfs_file_extent_item *fi; + void *kaddr; + size_t copy_size; + + if (!page || PageUptodate(page)) + return 0; + + ASSERT(page_offset(page) == 0); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) + return uncompress_inline(path, page, fi); + + copy_size = min_t(u64, PAGE_SIZE, + btrfs_file_extent_ram_bytes(path->nodes[0], fi)); + kaddr = kmap_local_page(page); + read_extent_buffer(path->nodes[0], kaddr, + btrfs_file_extent_inline_start(fi), copy_size); + kunmap_local(kaddr); + if (copy_size < PAGE_SIZE) + memzero_page(page, copy_size, PAGE_SIZE - copy_size); + return 0; +} + +/* + * Lookup the first extent overlapping a range in a file. + * * @inode: file to search in * @page: page to read extent data into if the extent is inline * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * - * This returns the first &struct extent_map which overlaps with the given - * range, reading it from the B-tree and caching it if necessary. Note that - * there may be more extents which overlap the given range after the returned - * extent_map. + * Return the first &struct extent_map which overlaps the given range, reading + * it from the B-tree and caching it if necessary. Note that there may be more + * extents which overlap the given range after the returned extent_map. * * If @page is not NULL and the extent is inline, this also reads the extent * data directly into the page and marks the extent up to date in the io_tree. @@ -6867,7 +6953,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_io_tree *io_tree = &inode->io_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -6985,53 +7070,33 @@ next: goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, !page, em); + btrfs_extent_item_to_extent_map(inode, path, item, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - unsigned long ptr; - char *map; - size_t size; - size_t extent_offset; - size_t copy_size; - - if (!page) - goto out; + /* + * Inline extent can only exist at file offset 0. This is + * ensured by tree-checker and inline extent creation path. + * Thus all members representing file offsets should be zero. 
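read_inline_extent(), added above, copies min(PAGE_SIZE, ram_bytes) of inline data into the page and zeroes the tail. The same shape in user space, with PAGE_SIZE and the item buffer as stand-ins:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static void read_inline(char *page, const char *item, size_t ram_bytes)
{
	size_t copy_size = ram_bytes < PAGE_SIZE ? ram_bytes : PAGE_SIZE;

	memcpy(page, item, copy_size);
	if (copy_size < PAGE_SIZE)	/* memzero_page(...) */
		memset(page + copy_size, 0, PAGE_SIZE - copy_size);
}

int main(void)
{
	char page[PAGE_SIZE];

	read_inline(page, "hello", 5);
	printf("%.5s, tail byte = %d\n", page, page[5]);
	return 0;
}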
+ */ + ASSERT(pg_offset == 0); + ASSERT(extent_start == 0); + ASSERT(em->start == 0); - size = btrfs_file_extent_ram_bytes(leaf, item); - extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_SIZE - pg_offset, - size - extent_offset); - em->start = extent_start + extent_offset; - em->len = ALIGN(copy_size, fs_info->sectorsize); - em->orig_block_len = em->len; - em->orig_start = em->start; - ptr = btrfs_file_extent_inline_start(item) + extent_offset; + /* + * btrfs_extent_item_to_extent_map() should have properly + * initialized em members already. + * + * Other members are not utilized for inline extents. + */ + ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->len == fs_info->sectorsize); - if (!PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) != - BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, page, pg_offset, - extent_offset, item); - if (ret) - goto out; - } else { - map = kmap_local_page(page); - read_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - if (pg_offset + copy_size < PAGE_SIZE) { - memset(map + pg_offset + copy_size, 0, - PAGE_SIZE - pg_offset - - copy_size); - } - kunmap_local(map); - } - flush_dcache_page(page); - } - set_extent_uptodate(io_tree, em->start, - extent_map_end(em) - 1, NULL, GFP_NOFS); + ret = read_inline_extent(inode, path, page); + if (ret < 0) + goto out; goto insert; } not_found: @@ -7065,133 +7130,6 @@ out: return em; } -struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - u64 start, u64 len) -{ - struct extent_map *em; - struct extent_map *hole_em = NULL; - u64 delalloc_start = start; - u64 end; - u64 delalloc_len; - u64 delalloc_end; - int err = 0; - - em = btrfs_get_extent(inode, NULL, 0, start, len); - if (IS_ERR(em)) - return em; - /* - * If our em maps to: - * - a hole or - * - a pre-alloc extent, - * there might actually be delalloc bytes behind it.
- */ - if (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return em; - else - hole_em = em; - - /* check to see if we've wrapped (len == -1 or similar) */ - end = start + len; - if (end < start) - end = (u64)-1; - else - end -= 1; - - em = NULL; - - /* ok, we didn't find anything, lets look for delalloc */ - delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, - end, len, EXTENT_DELALLOC, 1); - delalloc_end = delalloc_start + delalloc_len; - if (delalloc_end < delalloc_start) - delalloc_end = (u64)-1; - - /* - * We didn't find anything useful, return the original results from - * get_extent() - */ - if (delalloc_start > end || delalloc_end <= start) { - em = hole_em; - hole_em = NULL; - goto out; - } - - /* - * Adjust the delalloc_start to make sure it doesn't go backwards from - * the start they passed in - */ - delalloc_start = max(start, delalloc_start); - delalloc_len = delalloc_end - delalloc_start; - - if (delalloc_len > 0) { - u64 hole_start; - u64 hole_len; - const u64 hole_end = extent_map_end(hole_em); - - em = alloc_extent_map(); - if (!em) { - err = -ENOMEM; - goto out; - } - - ASSERT(hole_em); - /* - * When btrfs_get_extent can't find anything it returns one - * huge hole - * - * Make sure what it found really fits our range, and adjust to - * make sure it is based on the start from the caller - */ - if (hole_end <= start || hole_em->start > end) { - free_extent_map(hole_em); - hole_em = NULL; - } else { - hole_start = max(hole_em->start, start); - hole_len = hole_end - hole_start; - } - - if (hole_em && delalloc_start > hole_start) { - /* - * Our hole starts before our delalloc, so we have to - * return just the parts of the hole that go until the - * delalloc starts - */ - em->len = min(hole_len, delalloc_start - hole_start); - em->start = hole_start; - em->orig_start = hole_start; - /* - * Don't adjust block start at all, it is fixed at - * EXTENT_MAP_HOLE - */ - em->block_start = hole_em->block_start; - em->block_len = hole_len; - if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } else { - /* - * Hole is out of passed range or it starts after - * delalloc range - */ - em->start = delalloc_start; - em->len = delalloc_len; - em->orig_start = delalloc_start; - em->block_start = EXTENT_MAP_DELALLOC; - em->block_len = delalloc_len; - } - } else { - return hole_em; - } -out: - - free_extent_map(hole_em); - if (err) { - free_extent_map(em); - return ERR_PTR(err); - } - return em; -} - static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const u64 start, const u64 len, @@ -7221,7 +7159,8 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (ret) { if (em) { free_extent_map(em); - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + btrfs_drop_extent_map_range(inode, start, + start + len - 1, false); } em = ERR_PTR(ret); } @@ -7292,7 +7231,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool strict) + u64 *ram_bytes, bool nowait, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7308,6 +7247,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->nowait = nowait; ret = 
btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), offset, 0); @@ -7401,10 +7341,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend)) + if (!try_lock_extent(io_tree, lockstart, lockend, + cached_state)) return -EAGAIN; } else { - lock_extent_bits(io_tree, lockstart, lockend, cached_state); + lock_extent(io_tree, lockstart, lockend, cached_state); } /* * We're concerned with the entire range that we're going to be @@ -7426,7 +7367,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent_cached(io_tree, lockstart, lockend, cached_state); + unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { if (nowait) { @@ -7488,7 +7429,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type) { - struct extent_map_tree *em_tree; struct extent_map *em; int ret; @@ -7497,7 +7437,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); - em_tree = &inode->extent_tree; em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7518,18 +7457,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, em->compress_type = compress_type; } - do { - btrfs_drop_extent_cache(inode, em->start, - em->start + em->len - 1, 0); - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - /* - * The caller has taken lock_extent(), who could race with us - * to add em? - */ - } while (ret == -EEXIST); - + ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { free_extent_map(em); return ERR_PTR(ret); @@ -7577,7 +7505,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false) == 1) { + &orig_block_len, &ram_bytes, false, false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -7762,7 +7690,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write && !(flags & IOMAP_NOWAIT)) { ret = btrfs_check_data_free_space(BTRFS_I(inode), &dio_data->data_reserved, - start, data_alloc_len); + start, data_alloc_len, false); if (!ret) dio_data->data_space_reserved = true; else if (ret && !(BTRFS_I(inode)->flags & @@ -7884,8 +7812,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } if (unlock_extents) - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); else free_extent_state(cached_state); @@ -7914,8 +7842,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, return 0; unlock_err: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -7938,7 +7866,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + 
unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, + NULL); return 0; } @@ -7950,7 +7879,7 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1); + pos + length - 1, NULL); ret = -ENOTBLK; } @@ -7969,40 +7898,35 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + btrfs_mark_ordered_io_finished(dip->inode, NULL, dip->file_offset, dip->bytes, !dip->bio.bi_status); } else { - unlock_extent(&BTRFS_I(dip->inode)->io_tree, + unlock_extent(&dip->inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1); + dip->file_offset + dip->bytes - 1, NULL); } kfree(dip->csums); bio_endio(&dip->bio); } -static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type) +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_dio_private *dip = bio->bi_private; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_dio_private *dip = btrfs_bio(bio)->private; BUG_ON(bio_op(bio) == REQ_OP_WRITE); refcount_inc(&dip->refs); - btrfs_submit_bio(fs_info, bio, mirror_num); + btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, struct btrfs_bio *bbio, const bool uptodate) { - struct inode *inode = dip->inode; + struct inode *inode = &dip->inode->vfs_inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); blk_status_t err = BLK_STS_OK; struct bvec_iter iter; @@ -8013,17 +7937,15 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, u64 start = bbio->file_offset + offset; if (uptodate && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, - bv.bv_offset))) { - clean_io_failure(fs_info, failure_tree, io_tree, start, - bv.bv_page, btrfs_ino(BTRFS_I(inode)), - bv.bv_offset); + (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, + bv.bv_page, bv.bv_offset))) { + btrfs_clean_io_failure(BTRFS_I(inode), start, + bv.bv_page, bv.bv_offset); } else { int ret; - ret = btrfs_repair_one_sector(inode, bbio, offset, - bv.bv_page, bv.bv_offset, - submit_dio_repair_bio); + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, + bv.bv_page, bv.bv_offset, false); if (ret) err = errno_to_blk_status(ret); } @@ -8032,23 +7954,23 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, - struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, + struct bio *bio, + u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); + return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); } -static void btrfs_end_dio_bio(struct bio *bio) +static void btrfs_end_dio_bio(struct btrfs_bio *bbio) { - struct btrfs_dio_private *dip = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_dio_private *dip = bbio->private; + struct bio *bio = &bbio->bio; blk_status_t err = 
bio->bi_status; if (err) - btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + btrfs_warn(dip->inode->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), + btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -8058,41 +7980,40 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) dip->bio.bi_status = err; - btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); + btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); } -static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, +static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, u64 file_offset, int async_submit) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_dio_private *dip = btrfs_bio(bio)->private; blk_status_t ret; /* Save the original iter for read repair */ if (btrfs_op(bio) == BTRFS_MAP_READ) btrfs_bio(bio)->iter = bio->bi_iter; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) goto map; if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && + if (async_submit && !atomic_read(&inode->sync_writers) && btrfs_wq_submit_bio(inode, bio, 0, file_offset, - btrfs_submit_bio_start_direct_io)) + WQ_SUBMIT_DATA_DIO)) return; /* * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); + ret = btrfs_csum_one_bio(inode, bio, file_offset, false); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); return; } } else { @@ -8126,7 +8047,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip->inode = inode; + dip->inode = BTRFS_I(inode); dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; refcount_set(&dip->refs, 1); @@ -8142,7 +8063,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, */ status = BLK_STS_RESOURCE; dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); - if (!dip) + if (!dip->csums) goto out_err; status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); @@ -8175,9 +8096,8 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, * This will never fail as it's passing GPF_NOFS and * the allocation is backed by btrfs_bioset. 
*/ - bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; + bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, + btrfs_end_dio_bio, dip); btrfs_bio(bio)->file_offset = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { @@ -8213,7 +8133,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); + btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; @@ -8241,13 +8161,21 @@ static const struct iomap_dio_ops btrfs_dio_ops = { .bio_set = &btrfs_dio_bioset, }; -ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { struct btrfs_dio_data data; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC, - &data, done_before); + IOMAP_DIO_PARTIAL, &data, done_before); +} + +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) +{ + struct btrfs_dio_data data; + + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -8259,6 +8187,25 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; + /* + * fiemap_prep() called filemap_write_and_wait() for the whole possible + * file range (0 to LLONG_MAX), but that is not enough if we have + * compression enabled. The first filemap_fdatawrite_range() only kicks + * in the compression of data (in an async thread) and will return + * before the compression is done and writeback is started. A second + * filemap_fdatawrite_range() is needed to wait for the compression to + * complete and writeback to start. We also need to wait for ordered + * extents to complete, because our fiemap implementation uses mainly + * file extent items to list the extents, searching for extent maps + * only for file ranges with holes or prealloc extents to figure out + * if we have delalloc in those ranges. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); + if (ret) + return ret; + } + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } @@ -8391,14 +8338,14 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, } if (!inode_evicting) - lock_extent_bits(tree, page_start, page_end, &cached_state); + lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { struct btrfs_ordered_extent *ordered; - bool delete_states; u64 range_end; u32 range_len; + u32 extra_flags = 0; ordered = btrfs_lookup_first_ordered_range(inode, cur, page_end + 1 - cur); @@ -8408,7 +8355,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * No ordered extent covering this range, we are safe * to delete all extent states in the range. */ - delete_states = true; + extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } if (ordered->file_offset > cur) { @@ -8419,7 +8366,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * the ordered extent in the next iteration. 
*/ range_end = ordered->file_offset - 1; - delete_states = true; + extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } @@ -8434,7 +8381,6 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. */ - delete_states = false; goto next; } btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); @@ -8451,7 +8397,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state); + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree.lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8459,6 +8405,12 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, cur - ordered->file_offset); spin_unlock_irq(&inode->ordered_tree.lock); + /* + * If the ordered extent has finished, we're safe to delete all + * the extent states of the range, otherwise + * btrfs_finish_ordered_io() will get executed by endio for + * other pages, so we can't delete extent states. + */ if (btrfs_dec_test_ordered_pending(inode, &ordered, cur, range_end + 1 - cur)) { btrfs_finish_ordered_io(ordered); @@ -8466,14 +8418,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * The ordered extent has finished, now we're again * safe to delete all extent states of the range. */ - delete_states = true; - } else { - /* - * btrfs_finish_ordered_io() will get executed by endio - * of other pages, thus we can't delete extent states - * anymore - */ - delete_states = false; + extra_flags = EXTENT_CLEAR_ALL_BITS; } next: if (ordered) @@ -8497,8 +8442,8 @@ next: if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, - delete_states, &cached_state); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | + extra_flags, &cached_state); } cur = range_end + 1; } @@ -8589,11 +8534,11 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, &cached_state); + lock_extent(io_tree, page_start, page_end, &cached_state); ret2 = set_page_extent_mapped(page); if (ret2 < 0) { ret = vmf_error(ret2); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -8604,8 +8549,7 @@ again: ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered, 1); @@ -8633,13 +8577,12 @@ again: */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, &cached_state); + EXTENT_DEFRAG, &cached_state); ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); if (ret2) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } @@ -8659,7 +8602,7 @@ again: btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); 
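/*
 * A minimal sketch (illustration, not part of this diff) of the locking
 * convention after the rename above: lock_extent_bits() becomes
 * lock_extent() and unlock_extent_cached() becomes unlock_extent(), both
 * taking the cached_state pointer. Threading one cached_state through the
 * lock/clear/unlock calls on a range, as this page_mkwrite path does,
 * lets the io_tree reuse the cached extent_state instead of repeating the
 * tree lookup:
 *
 *	struct extent_state *cached_state = NULL;
 *
 *	lock_extent(io_tree, page_start, page_end, &cached_state);
 *	...operate on the locked range...
 *	unlock_extent(io_tree, page_start, page_end, &cached_state);
 */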
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); @@ -8680,16 +8623,16 @@ out_noreserve: return ret; } -static int btrfs_truncate(struct inode *inode, bool skip_writeback) +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) { struct btrfs_truncate_control control = { - .inode = BTRFS_I(inode), - .ino = btrfs_ino(BTRFS_I(inode)), + .inode = inode, + .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *rsv; int ret; struct btrfs_trans_handle *trans; @@ -8697,7 +8640,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + ret = btrfs_wait_ordered_range(&inode->vfs_inode, + inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) return ret; @@ -8756,34 +8700,32 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; - const u64 new_size = inode->i_size; + const u64 new_size = inode->vfs_inode.i_size; const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ - btrfs_drop_extent_cache(BTRFS_I(inode), - ALIGN(new_size, fs_info->sectorsize), - (u64)-1, 0); + btrfs_drop_extent_map_range(inode, + ALIGN(new_size, fs_info->sectorsize), + (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); - inode_sub_bytes(inode, control.sub_bytes); - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, - (u64)-1, &cached_state); + unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -8814,7 +8756,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -8822,14 +8764,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret2 = btrfs_update_inode(trans, root, inode); if (ret2 && !ret) ret = ret2; @@ -8855,7 +8797,7 @@ out: * extents beyond i_size to drop. 
*/ if (control.extents_found > 0) - btrfs_set_inode_full_sync(BTRFS_I(inode)); + btrfs_set_inode_full_sync(inode); return ret; } @@ -8908,6 +8850,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); + spin_lock_init(&ei->io_failure_lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8923,13 +8866,11 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); - extent_io_tree_init(fs_info, &ei->io_failure_tree, - IO_TREE_INODE_IO_FAILURE, inode); + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT, inode); - ei->io_tree.track_uptodate = true; - ei->io_failure_tree.track_uptodate = true; + IO_TREE_INODE_FILE_EXTENT); + ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8944,7 +8885,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode) { - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif @@ -8959,6 +8900,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) struct btrfs_ordered_extent *ordered; struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_root *root = inode->root; + bool freespace_inode; WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); WARN_ON(vfs_inode->i_data.nrpages); @@ -8980,6 +8922,12 @@ void btrfs_destroy_inode(struct inode *vfs_inode) if (!root) return; + /* + * If this is a free space inode do not take the ordered extents lockdep + * map. 
+ */ + freespace_inode = btrfs_is_free_space_inode(inode); + while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) @@ -8988,6 +8936,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); + + if (!freespace_inode) + btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); + btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -8995,7 +8947,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) } btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); } @@ -9030,10 +8982,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_trans_handle_cachep); - kmem_cache_destroy(btrfs_path_cachep); - kmem_cache_destroy(btrfs_free_space_cachep); - kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) @@ -9045,30 +8993,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_trans_handle_cachep) - goto fail; - - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_path_cachep) - goto fail; - - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_cachep) - goto fail; - - btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", - PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_bitmap_cachep) - goto fail; - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_dio_private, bio), BIOSET_NEED_BVECS)) @@ -9144,6 +9068,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + struct fscrypt_name old_fname, new_fname; + struct fscrypt_str *old_name, *new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9155,6 +9081,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = &old_fname.disk_name; + new_name = &new_fname.disk_name; + /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9222,10 +9161,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, - old_ino, + ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) @@ -9238,10 +9174,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, - old_dentry->d_name.name, - old_dentry->d_name.len, - new_ino, + ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { @@ -9272,13 +9205,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, - &old_rename_ctx); + old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9289,13 +9220,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, - &new_rename_ctx); + new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9305,16 +9234,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, old_idx); + new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, 0, new_idx); + old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9357,6 +9284,8 @@ out_notrans: old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + fscrypt_free_filename(&new_fname); + fscrypt_free_filename(&old_fname); return ret; } @@ -9396,6 +9325,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + struct fscrypt_name old_fname, new_fname; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9412,22 +9342,28 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, - new_dentry->d_name.name, - new_dentry->d_name.len); - + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ if (WARN_ON(!new_inode)) { - return ret; + goto 
out_fscrypt_names; } } else { /* maybe -EOVERFLOW */ - return ret; + goto out_fscrypt_names; } } ret = 0; @@ -9510,11 +9446,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, - old_ino, - btrfs_ino(BTRFS_I(new_dir)), index); + ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, + old_ino, btrfs_ino(BTRFS_I(new_dir)), + index); if (ret) goto out_fail; } @@ -9533,13 +9467,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), - BTRFS_I(d_inode(old_dentry)), - old_dentry->d_name.name, - old_dentry->d_name.len, - &rename_ctx); + BTRFS_I(d_inode(old_dentry)), + &old_fname.disk_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9553,13 +9485,12 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - new_dentry->d_name.name, - new_dentry->d_name.len); + &new_fname.disk_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9571,8 +9502,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, index); + &new_fname.disk_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9607,6 +9537,9 @@ out_notrans: out_whiteout_inode: if (flags & RENAME_WHITEOUT) iput(whiteout_args.inode); +out_fscrypt_names: + fscrypt_free_filename(&old_fname); + fscrypt_free_filename(&new_fname); return ret; } @@ -9726,7 +9659,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, &work->work); } else { ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -10008,7 +9941,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -10064,11 +9996,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } - btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, - cur_offset + ins.offset -1, 0); - em = alloc_extent_map(); if (!em) { + btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, + cur_offset + ins.offset - 1, false); btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next; } @@ -10083,16 +10014,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); 
- write_unlock(&em_tree->lock); - if (ret != -EEXIST) - break; - btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, - cur_offset + ins.offset - 1, - 0); - } + ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); free_extent_map(em); next: num_bytes -= ins.offset; @@ -10168,7 +10090,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns, } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -10176,7 +10098,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct inode *inode; struct btrfs_new_inode_args new_inode_args = { .dir = dir, - .dentry = dentry, + .dentry = file->f_path.dentry, .orphan = true, }; unsigned int trans_num_items; @@ -10213,7 +10135,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, set_nlink(inode, 1); if (!ret) { - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); mark_inode_dirty(inode); } @@ -10225,7 +10147,7 @@ out_new_inode_args: out_inode: if (ret) iput(inode); - return ret; + return finish_open_simple(file, ret); } void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) @@ -10346,8 +10268,8 @@ static ssize_t btrfs_encoded_read_inline( } read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); - unlock_extent_cached(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + unlock_extent(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; ret = copy_to_iter(tmp, count, iter); @@ -10371,7 +10293,7 @@ struct btrfs_encoded_read_private { static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_encoded_read_private *priv = bio->bi_private; + struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; @@ -10389,7 +10311,7 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) { const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); - struct btrfs_encoded_read_private *priv = bbio->bio.bi_private; + struct btrfs_encoded_read_private *priv = bbio->private; struct btrfs_inode *inode = priv->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u32 sectorsize = fs_info->sectorsize; @@ -10407,7 +10329,7 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + if (btrfs_check_data_csum(inode, bbio, bio_offset, bvec->bv_page, pgoff)) return BLK_STS_IOERR; bio_offset += sectorsize; @@ -10417,10 +10339,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) return BLK_STS_OK; } -static void btrfs_encoded_read_endio(struct bio *bio) +static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) { - struct btrfs_encoded_read_private *priv = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_encoded_read_private *priv = bbio->private; blk_status_t status; status = btrfs_encoded_read_verify_csum(bbio); @@ -10438,7 +10359,7 @@ static void btrfs_encoded_read_endio(struct bio *bio) if 
(!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); btrfs_bio_free_csum(bbio); - bio_put(bio); + bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, @@ -10485,12 +10406,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, remaining, PAGE_SIZE); if (!bio) { - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, + btrfs_encoded_read_endio, + &priv); bio->bi_iter.bi_sector = (disk_bytenr + cur) >> SECTOR_SHIFT; - bio->bi_end_io = btrfs_encoded_read_endio; - bio->bi_private = &priv; - bio->bi_opf = REQ_OP_READ; } if (!bytes || @@ -10551,8 +10471,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, if (ret) goto out; - unlock_extent_cached(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + unlock_extent(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; if (compressed) { @@ -10601,10 +10521,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, file_accessed(iocb->ki_filp); - btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return 0; } start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); @@ -10621,13 +10541,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, lockend - start + 1); if (ret) goto out_unlock_inode; - lock_extent_bits(io_tree, start, lockend, &cached_state); + lock_extent(io_tree, start, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); - unlock_extent_cached(io_tree, start, lockend, &cached_state); + unlock_extent(io_tree, start, lockend, &cached_state); cond_resched(); } @@ -10701,8 +10621,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, em = NULL; if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent_cached(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) @@ -10722,10 +10642,10 @@ out_em: free_extent_map(em); out_unlock_extent: if (!unlocked) - unlock_extent_cached(io_tree, start, lockend, &cached_state); + unlock_extent(io_tree, start, lockend, &cached_state); out_unlock_inode: if (!unlocked) - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -10860,14 +10780,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, end >> PAGE_SHIFT); if (ret) goto out_pages; - lock_extent_bits(io_tree, start, end, &cached_state); + lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); - unlock_extent_cached(io_tree, start, end, &cached_state); + unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } @@ -10921,7 +10841,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, (1 << BTRFS_ORDERED_COMPRESSED), compression); 
if (ret) { - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_map_range(inode, start, end, false); goto out_free_reserved; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -10929,7 +10849,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); - unlock_extent_cached(io_tree, start, end, &cached_state); + unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); @@ -10960,7 +10880,7 @@ out_free_data_space: if (!extent_reserved) btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: - unlock_extent_cached(io_tree, start, end, &cached_state); + unlock_extent(io_tree, start, end, &cached_state); out_pages: for (i = 0; i < nr_pages; i++) { if (pages[i]) @@ -11201,7 +11121,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent_bits(io_tree, 0, isize - 1, &cached_state); + lock_extent(io_tree, 0, isize - 1, &cached_state); start = 0; while (start < isize) { u64 logical_block_start, physical_block_start; @@ -11242,7 +11162,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, free_extent_map(em); em = NULL; - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); if (ret < 0) { goto out; } else if (ret) { @@ -11338,7 +11258,7 @@ out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); - unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); @@ -11391,7 +11311,7 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, spin_unlock(&inode->lock); } -/** +/* * Verify that there are no ordered extents for a given file range. * * @inode: The target inode. 
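The hunks above repeatedly apply one extent map API change: btrfs_drop_extent_cache() becomes btrfs_drop_extent_map_range(), and the open-coded "drop, then retry add_extent_mapping() until -EEXIST clears" loop becomes a single btrfs_replace_extent_map_range() call. A minimal before/after sketch, using the names from the create_io_em() and __btrfs_prealloc_file_range() hunks (surrounding setup elided):

	/* Before: drop the cached range, then retry the insert on -EEXIST. */
	do {
		btrfs_drop_extent_cache(inode, em->start,
					em->start + em->len - 1, 0);
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, em, 1);
		write_unlock(&em_tree->lock);
	} while (ret == -EEXIST);

	/* After: one helper performs the drop and insert; these call sites
	 * pass true for the trailing boolean. */
	ret = btrfs_replace_extent_map_range(inode, em, true);

	/* The plain cache drop is renamed the same way: */
	btrfs_drop_extent_map_range(inode, start, start + len - 1, false);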
@@ -11440,7 +11360,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .mknod = btrfs_mknod, .listxattr = btrfs_listxattr, .permission = btrfs_permission, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, @@ -11493,7 +11413,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .fileattr_get = btrfs_fileattr_get, @@ -11504,7 +11424,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fe0cc816b4eb..7e348bd2ccde 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,6 +50,17 @@ #include "delalloc-space.h" #include "block-group.h" #include "subpage.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "file.h" +#include "scrub.h" +#include "super.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -949,6 +960,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; + struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); @@ -969,8 +981,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, * check for them now when we can safely fail */ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, - dir->i_ino, name, - namelen); + dir->i_ino, &name_str); if (error) goto out_dput; @@ -991,7 +1002,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); return error; } @@ -1036,908 +1047,6 @@ out: } /* - * Defrag specific helper to get an extent map. - * - * Differences between this and btrfs_get_extent() are: - * - * - No extent_map will be added to inode->extent_tree - * To reduce memory usage in the long run. - * - * - Extra optimization to skip file extents older than @newer_than - * By using btrfs_search_forward() we can skip entire file ranges that - * have extents created in past transactions, because btrfs_search_forward() - * will not visit leaves and nodes with a generation smaller than given - * minimal generation threshold (@newer_than). - * - * Return valid em if we find a file extent matching the requirement. - * Return NULL if we can not find a file extent matching the requirement. - * - * Return ERR_PTR() for error. 
- */ -static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, - u64 start, u64 newer_than) -{ - struct btrfs_root *root = inode->root; - struct btrfs_file_extent_item *fi; - struct btrfs_path path = { 0 }; - struct extent_map *em; - struct btrfs_key key; - u64 ino = btrfs_ino(inode); - int ret; - - em = alloc_extent_map(); - if (!em) { - ret = -ENOMEM; - goto err; - } - - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - - if (newer_than) { - ret = btrfs_search_forward(root, &key, &path, newer_than); - if (ret < 0) - goto err; - /* Can't find anything newer */ - if (ret > 0) - goto not_found; - } else { - ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); - if (ret < 0) - goto err; - } - if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { - /* - * If btrfs_search_slot() makes path to point beyond nritems, - * we should not have an empty leaf, as this inode must at - * least have its INODE_ITEM. - */ - ASSERT(btrfs_header_nritems(path.nodes[0])); - path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; - } - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - /* Perfect match, no need to go one slot back */ - if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && - key.offset == start) - goto iterate; - - /* We didn't find a perfect match, needs to go one slot back */ - if (path.slots[0] > 0) { - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) - path.slots[0]--; - } - -iterate: - /* Iterate through the path to find a file extent covering @start */ - while (true) { - u64 extent_end; - - if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) - goto next; - - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - - /* - * We may go one slot back to INODE_REF/XATTR item, then - * need to go forward until we reach an EXTENT_DATA. - * But we should still has the correct ino as key.objectid. - */ - if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) - goto next; - - /* It's beyond our target range, definitely not extent found */ - if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) - goto not_found; - - /* - * | |<- File extent ->| - * \- start - * - * This means there is a hole between start and key.offset. - */ - if (key.offset > start) { - em->start = start; - em->orig_start = start; - em->block_start = EXTENT_MAP_HOLE; - em->len = key.offset - start; - break; - } - - fi = btrfs_item_ptr(path.nodes[0], path.slots[0], - struct btrfs_file_extent_item); - extent_end = btrfs_file_extent_end(&path); - - /* - * |<- file extent ->| | - * \- start - * - * We haven't reached start, search next slot. 
- */ - if (extent_end <= start) - goto next; - - /* Now this extent covers @start, convert it to em */ - btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); - break; -next: - ret = btrfs_next_item(root, &path); - if (ret < 0) - goto err; - if (ret > 0) - goto not_found; - } - btrfs_release_path(&path); - return em; - -not_found: - btrfs_release_path(&path); - free_extent_map(em); - return NULL; - -err: - btrfs_release_path(&path); - free_extent_map(em); - return ERR_PTR(ret); -} - -static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, - u64 newer_than, bool locked) -{ - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em; - const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; - - /* - * hopefully we have this extent in the tree already, try without - * the full extent lock - */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, sectorsize); - read_unlock(&em_tree->lock); - - /* - * We can get a merged extent, in that case, we need to re-search - * tree to get the original em for defrag. - * - * If @newer_than is 0 or em::generation < newer_than, we can trust - * this em, as either we don't care about the generation, or the - * merged extent map will be rejected anyway. - */ - if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && - newer_than && em->generation >= newer_than) { - free_extent_map(em); - em = NULL; - } - - if (!em) { - struct extent_state *cached = NULL; - u64 end = start + sectorsize - 1; - - /* get the big lock and read metadata off disk */ - if (!locked) - lock_extent_bits(io_tree, start, end, &cached); - em = defrag_get_extent(BTRFS_I(inode), start, newer_than); - if (!locked) - unlock_extent_cached(io_tree, start, end, &cached); - - if (IS_ERR(em)) - return NULL; - } - - return em; -} - -static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, - const struct extent_map *em) -{ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - return BTRFS_MAX_COMPRESSED; - return fs_info->max_extent_size; -} - -static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - u32 extent_thresh, u64 newer_than, bool locked) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map *next; - bool ret = false; - - /* this is the last extent */ - if (em->start + em->len >= i_size_read(inode)) - return false; - - /* - * Here we need to pass @newer_then when checking the next extent, or - * we will hit a case we mark current extent for defrag, but the next - * one will not be a target. - * This will just cause extra IO without really reducing the fragments. - */ - next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); - /* No more em or hole */ - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) - goto out; - if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) - goto out; - /* - * If the next extent is at its max capacity, defragging current extent - * makes no sense, as the total number of extents won't change. - */ - if (next->len >= get_extent_max_capacity(fs_info, em)) - goto out; - /* Skip older extent */ - if (next->generation < newer_than) - goto out; - /* Also check extent size */ - if (next->len >= extent_thresh) - goto out; - - ret = true; -out: - free_extent_map(next); - return ret; -} - -/* - * Prepare one page to be defragged. - * - * This will ensure: - * - * - Returned page is locked and has been set up properly. 
- * - No ordered extent exists in the page. - * - The page is uptodate. - * - * NOTE: Caller should also wait for page writeback after the cluster is - * prepared, here we don't do writeback wait for each page. - */ -static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, - pgoff_t index) -{ - struct address_space *mapping = inode->vfs_inode.i_mapping; - gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 page_start = (u64)index << PAGE_SHIFT; - u64 page_end = page_start + PAGE_SIZE - 1; - struct extent_state *cached_state = NULL; - struct page *page; - int ret; - -again: - page = find_or_create_page(mapping, index, mask); - if (!page) - return ERR_PTR(-ENOMEM); - - /* - * Since we can defragment files opened read-only, we can encounter - * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We - * can't do I/O using huge pages yet, so return an error for now. - * Filesystem transparent huge pages are typically only used for - * executables that explicitly enable them, so this isn't very - * restrictive. - */ - if (PageCompound(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-ETXTBSY); - } - - ret = set_page_extent_mapped(page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ERR_PTR(ret); - } - - /* Wait for any existing ordered extent in the range */ - while (1) { - struct btrfs_ordered_extent *ordered; - - lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); - if (!ordered) - break; - - unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* - * We unlocked the page above, so we need check if it was - * released or not. - */ - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); - goto again; - } - } - - /* - * Now the page range has no ordered extent any more. Read the page to - * make it uptodate. - */ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); - goto again; - } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EIO); - } - } - return page; -} - -struct defrag_target_range { - struct list_head list; - u64 start; - u64 len; -}; - -/* - * Collect all valid target extents. - * - * @start: file offset to lookup - * @len: length to lookup - * @extent_thresh: file extent size threshold, any extent size >= this value - * will be ignored - * @newer_than: only defrag extents newer than this value - * @do_compress: whether the defrag is doing compression - * if true, @extent_thresh will be ignored and all regular - * file extents meeting @newer_than will be targets. 
- * @locked: if the range has already held extent lock - * @target_list: list of targets file extents - */ -static int defrag_collect_targets(struct btrfs_inode *inode, - u64 start, u64 len, u32 extent_thresh, - u64 newer_than, bool do_compress, - bool locked, struct list_head *target_list, - u64 *last_scanned_ret) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - bool last_is_target = false; - u64 cur = start; - int ret = 0; - - while (cur < start + len) { - struct extent_map *em; - struct defrag_target_range *new; - bool next_mergeable = true; - u64 range_len; - - last_is_target = false; - em = defrag_lookup_extent(&inode->vfs_inode, cur, - newer_than, locked); - if (!em) - break; - - /* - * If the file extent is an inlined one, we may still want to - * defrag it (fallthrough) if it will cause a regular extent. - * This is for users who want to convert inline extents to - * regular ones through max_inline= mount option. - */ - if (em->block_start == EXTENT_MAP_INLINE && - em->len <= inode->root->fs_info->max_inline) - goto next; - - /* Skip hole/delalloc/preallocated extents */ - if (em->block_start == EXTENT_MAP_HOLE || - em->block_start == EXTENT_MAP_DELALLOC || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - goto next; - - /* Skip older extent */ - if (em->generation < newer_than) - goto next; - - /* This em is under writeback, no need to defrag */ - if (em->generation == (u64)-1) - goto next; - - /* - * Our start offset might be in the middle of an existing extent - * map, so take that into account. - */ - range_len = em->len - (cur - em->start); - /* - * If this range of the extent map is already flagged for delalloc, - * skip it, because: - * - * 1) We could deadlock later, when trying to reserve space for - * delalloc, because in case we can't immediately reserve space - * the flusher can start delalloc and wait for the respective - * ordered extents to complete. The deadlock would happen - * because we do the space reservation while holding the range - * locked, and starting writeback, or finishing an ordered - * extent, requires locking the range; - * - * 2) If there's delalloc there, it means there's dirty pages for - * which writeback has not started yet (we clean the delalloc - * flag when starting writeback and after creating an ordered - * extent). If we mark pages in an adjacent range for defrag, - * then we will have a larger contiguous range for delalloc, - * very likely resulting in a larger extent after writeback is - * triggered (except in a case of free space fragmentation). - */ - if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC, 0, NULL)) - goto next; - - /* - * For do_compress case, we want to compress all valid file - * extents, thus no @extent_thresh or mergeable check. - */ - if (do_compress) - goto add; - - /* Skip too large extent */ - if (range_len >= extent_thresh) - goto next; - - /* - * Skip extents already at its max capacity, this is mostly for - * compressed extents, which max cap is only 128K. - */ - if (em->len >= get_extent_max_capacity(fs_info, em)) - goto next; - - /* - * Normally there are no more extents after an inline one, thus - * @next_mergeable will normally be false and not defragged. - * So if an inline extent passed all above checks, just add it - * for defrag, and be converted to regular extents. 
- */ - if (em->block_start == EXTENT_MAP_INLINE) - goto add; - - next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - extent_thresh, newer_than, locked); - if (!next_mergeable) { - struct defrag_target_range *last; - - /* Empty target list, no way to merge with last entry */ - if (list_empty(target_list)) - goto next; - last = list_entry(target_list->prev, - struct defrag_target_range, list); - /* Not mergeable with last entry */ - if (last->start + last->len != cur) - goto next; - - /* Mergeable, fall through to add it to @target_list. */ - } - -add: - last_is_target = true; - range_len = min(extent_map_end(em), start + len) - cur; - /* - * This one is a good target, check if it can be merged into - * last range of the target list. - */ - if (!list_empty(target_list)) { - struct defrag_target_range *last; - - last = list_entry(target_list->prev, - struct defrag_target_range, list); - ASSERT(last->start + last->len <= cur); - if (last->start + last->len == cur) { - /* Mergeable, enlarge the last entry */ - last->len += range_len; - goto next; - } - /* Fall through to allocate a new entry */ - } - - /* Allocate new defrag_target_range */ - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) { - free_extent_map(em); - ret = -ENOMEM; - break; - } - new->start = cur; - new->len = range_len; - list_add_tail(&new->list, target_list); - -next: - cur = extent_map_end(em); - free_extent_map(em); - } - if (ret < 0) { - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - - list_for_each_entry_safe(entry, tmp, target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } - } - if (!ret && last_scanned_ret) { - /* - * If the last extent is not a target, the caller can skip to - * the end of that extent. - * Otherwise, we can only go the end of the specified range. - */ - if (!last_is_target) - *last_scanned_ret = max(cur, *last_scanned_ret); - else - *last_scanned_ret = max(start + len, *last_scanned_ret); - } - return ret; -} - -#define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - -/* - * Defrag one contiguous target range. - * - * @inode: target inode - * @target: target range to defrag - * @pages: locked pages covering the defrag range - * @nr_pages: number of locked pages - * - * Caller should ensure: - * - * - Pages are prepared - * Pages should be locked, no ordered extent in the pages range, - * no writeback. 
- * - * - Extent bits are locked - */ -static int defrag_one_locked_target(struct btrfs_inode *inode, - struct defrag_target_range *target, - struct page **pages, int nr_pages, - struct extent_state **cached_state) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_changeset *data_reserved = NULL; - const u64 start = target->start; - const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); - int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); - - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); - if (ret < 0) - return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, cached_state); - set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); - } - btrfs_delalloc_release_extents(inode, len); - extent_changeset_free(data_reserved); - - return ret; -} - -static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress, - u64 *last_scanned_ret) -{ - struct extent_state *cached_state = NULL; - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - LIST_HEAD(target_list); - struct page **pages; - const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; - int ret = 0; - int i; - - ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); - ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) - return -ENOMEM; - - /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; - } - } - for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); - - /* Lock the pages range */ - lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); - /* - * Now we have a consistent view about the extent map, re-check - * which range really needs to be defragged. - * - * And this time we have extent locked already, pass @locked = true - * so that we won't relock the extent range and cause deadlock. 
- */ - ret = defrag_collect_targets(inode, start, len, extent_thresh, - newer_than, do_compress, true, - &target_list, last_scanned_ret); - if (ret < 0) - goto unlock_extent; - - list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, pages, nr_pages, - &cached_state); - if (ret < 0) - break; - } - - list_for_each_entry_safe(entry, tmp, &target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } -unlock_extent: - unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); -free_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } - } - kfree(pages); - return ret; -} - -static int defrag_one_cluster(struct btrfs_inode *inode, - struct file_ra_state *ra, - u64 start, u32 len, u32 extent_thresh, - u64 newer_than, bool do_compress, - unsigned long *sectors_defragged, - unsigned long max_sectors, - u64 *last_scanned_ret) -{ - const u32 sectorsize = inode->root->fs_info->sectorsize; - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - LIST_HEAD(target_list); - int ret; - - ret = defrag_collect_targets(inode, start, len, extent_thresh, - newer_than, do_compress, false, - &target_list, NULL); - if (ret < 0) - goto out; - - list_for_each_entry(entry, &target_list, list) { - u32 range_len = entry->len; - - /* Reached or beyond the limit */ - if (max_sectors && *sectors_defragged >= max_sectors) { - ret = 1; - break; - } - - if (max_sectors) - range_len = min_t(u32, range_len, - (max_sectors - *sectors_defragged) * sectorsize); - - /* - * If defrag_one_range() has updated last_scanned_ret, - * our range may already be invalid (e.g. hole punched). - * Skip if our range is before last_scanned_ret, as there is - * no need to defrag the range anymore. - */ - if (entry->start + range_len <= *last_scanned_ret) - continue; - - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, - ra, NULL, entry->start >> PAGE_SHIFT, - ((entry->start + range_len - 1) >> PAGE_SHIFT) - - (entry->start >> PAGE_SHIFT) + 1); - /* - * Here we may not defrag any range if holes are punched before - * we locked the pages. - * But that's fine, it only affects the @sectors_defragged - * accounting. - */ - ret = defrag_one_range(inode, entry->start, range_len, - extent_thresh, newer_than, do_compress, - last_scanned_ret); - if (ret < 0) - break; - *sectors_defragged += range_len >> - inode->root->fs_info->sectorsize_bits; - } -out: - list_for_each_entry_safe(entry, tmp, &target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } - if (ret >= 0) - *last_scanned_ret = max(*last_scanned_ret, start + len); - return ret; -} - -/* - * Entry point to file defragmentation. - * - * @inode: inode to be defragged - * @ra: readahead state (can be NULL) - * @range: defrag options including range and flags - * @newer_than: minimum transid to defrag - * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode - * will be defragged. - * - * Return <0 for error. - * Return >=0 for the number of sectors defragged, and range->start will be updated - * to indicate the file offset where next defrag should be started at. - * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without - * defragging all the range).
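The entry point documented above is the backend of the BTRFS_IOC_DEFRAG_RANGE ioctl, so the parameters it takes (range, extent_thresh, compress_type, flags) map directly onto struct btrfs_ioctl_defrag_range_args from the uapi header. A hedged userspace sketch that exercises this path (minimal error handling; the fd must refer to a file on btrfs):

/* Drive the defrag path through BTRFS_IOC_DEFRAG_RANGE. */
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
    struct btrfs_ioctl_defrag_range_args args;
    int fd;

    if (argc != 2)
        return 1;
    fd = open(argv[1], O_RDWR);
    if (fd < 0)
        return 1;

    memset(&args, 0, sizeof(args));
    args.start = 0;
    args.len = (__u64)-1;       /* whole file */
    args.extent_thresh = 0;     /* 0 lets the kernel default apply */
    args.flags = BTRFS_DEFRAG_RANGE_START_IO;   /* flush when done */

    if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args) < 0)
        perror("BTRFS_IOC_DEFRAG_RANGE");
    close(fd);
    return 0;
}

Setting len so that start + len overflows (here, to (u64)-1) selects the defrag-until-EOF branch above, and extent_thresh of 0 picks up the 256K default.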
- */ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); - u64 cur; - u64 last_byte; - bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; - bool ra_allocated = false; - int compress_type = BTRFS_COMPRESS_ZLIB; - int ret = 0; - u32 extent_thresh = range->extent_thresh; - pgoff_t start_index; - - if (isize == 0) - return 0; - - if (range->start >= isize) - return -EINVAL; - - if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; - } - - if (extent_thresh == 0) - extent_thresh = SZ_256K; - - if (range->start + range->len > range->start) { - /* Got a specific range */ - last_byte = min(isize, range->start + range->len); - } else { - /* Defrag until file end */ - last_byte = isize; - } - - /* Align the range */ - cur = round_down(range->start, fs_info->sectorsize); - last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - - /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - - /* - * Make writeback start from the beginning of the range, so that the - * defrag range can be written sequentially. - */ - start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; - - while (cur < last_byte) { - const unsigned long prev_sectors_defragged = sectors_defragged; - u64 last_scanned = cur; - u64 cluster_end; - - if (btrfs_defrag_cancelled(fs_info)) { - ret = -EAGAIN; - break; - } - - /* We want the cluster end at page boundary when possible */ - cluster_end = (((cur >> PAGE_SHIFT) + - (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; - cluster_end = min(cluster_end, last_byte); - - btrfs_inode_lock(inode, 0); - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - btrfs_inode_unlock(inode, 0); - break; - } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(inode, 0); - break; - } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, - cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, &sectors_defragged, - max_to_defrag, &last_scanned); - - if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); - - btrfs_inode_unlock(inode, 0); - if (ret < 0) - break; - cur = max(cluster_end + 1, last_scanned); - if (ret > 0) { - ret = 0; - break; - } - cond_resched(); - } - - if (ra_allocated) - kfree(ra); - /* - * Update range.start for autodefrag, this will indicate where to start - * in next run. - */ - range->start = cur; - if (sectors_defragged) { - /* - * We have defragged some sectors, for compression case they - * need to be written back immediately.
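The cluster rounding in the loop above deserves a worked example: the expression pins cluster_end to the last byte of a page, so each 256K cluster covers whole pages. A standalone demo, assuming 4K pages (PAGE_SHIFT is hardcoded here for illustration):

/* Demo of the cluster_end rounding from the removed loop above. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define SZ_256K (256 * 1024)

int main(void)
{
    uint64_t cur = 5 * 4096 + 123;  /* arbitrary offset inside page 5 */
    uint64_t cluster_end = (((cur >> PAGE_SHIFT) +
                             (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;

    /* 5*4096 + 256K - 1 = 282623: ends on a page boundary, the
     * fractional start offset inside page 5 is discarded. */
    printf("cur=%llu cluster_end=%llu\n",
           (unsigned long long)cur, (unsigned long long)cluster_end);
    return 0;
}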
- */ - if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - else if (range->compress_type == BTRFS_COMPRESS_ZSTD) - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - ret = sectors_defragged; - } - if (do_compress) { - btrfs_inode_lock(inode, 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(inode, 0); - } - return ret; -} - -/* * Try to start exclusive operation @type or cancel it if it's running. * * Return: @@ -2119,7 +1228,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret == 0 && new_size != old_size) btrfs_info_in_rcu(fs_info, "resize device %s (devid %llu) from %llu to %llu", - rcu_str_deref(device->name), device->devid, + btrfs_dev_name(device), device->devid, old_size, new_size); out_finish: btrfs_exclop_finish(fs_info); @@ -3105,6 +2214,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) } } + btrfs_free_path(path); + path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; @@ -3194,6 +2305,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, } out: + btrfs_free_path(path); + if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ @@ -3205,7 +2318,6 @@ out: } kfree(rootrefs); - btrfs_free_path(path); return ret; } @@ -3271,7 +2383,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - vol_args2->subvolid, 0, 0); + vol_args2->subvolid, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_drop_write; @@ -3416,16 +2528,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - btrfs_inode_lock(inode, 0); - err = btrfs_delete_subvolume(dir, dentry); - btrfs_inode_unlock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); + err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); + btrfs_inode_unlock(BTRFS_I(inode), 0); if (!err) d_delete_notify(dir, dentry); out_dput: dput(dentry); out_unlock_dir: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: @@ -3747,13 +2859,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) { - strncpy(di_args->path, rcu_str_deref(dev->name), - sizeof(di_args->path) - 1); - di_args->path[sizeof(di_args->path) - 1] = 0; - } else { + if (dev->name) + strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); + else di_args->path[0] = '\0'; - } out: rcu_read_unlock(); @@ -3774,6 +2883,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; + struct fscrypt_str name = FSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; @@ -3817,7 +2927,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, - dir_id, "default", 7, 1); + dir_id, &name, 1); if (IS_ERR_OR_NULL(di)) { 
btrfs_release_path(path); btrfs_end_transaction(trans); @@ -4231,6 +3341,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } + btrfs_free_path(path); + path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { @@ -4281,21 +3393,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, size = min_t(u32, loi->size, SZ_16M); } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); - inodes = NULL; - goto out; + goto out_loi; } + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, inodes, ignore_offset); + btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4307,7 +3418,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, ret = -EFAULT; out: - btrfs_free_path(path); kvfree(inodes); out_loi: kfree(loi); @@ -4338,7 +3448,7 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } -/** +/* * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as * required. * @@ -5283,7 +4393,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } - ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; @@ -5382,7 +4492,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (args.len > args.unencoded_len - args.unencoded_offset) goto out_acct; - ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 000000000000..8a855d5ac2fa --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_IOCTL_H +#define BTRFS_IOCTL_H + +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); +int btrfs_ioctl_get_supported_features(void __user *arg); +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +int __pure btrfs_is_empty_uuid(u8 *uuid); +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs); + +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9063072b399b..870528d87526 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -12,6 +12,7 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" +#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. For a given @@ -286,6 +287,31 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) } /* + * Loop around taking references on and locking the root node of the tree in + * nowait mode until we end up with a lock on the root node or returning to + * avoid blocking. + * + * Return: root extent buffer with read lock held or -EAGAIN. 
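The helper below implements a common nowait pattern: trylock the current root node, then confirm it is still the published root before trusting it. A rough userspace analogue with pthreads (all names hypothetical; the kernel version additionally takes and drops a reference on the buffer, which is omitted here):

/* Trylock a node, then verify it is still the published root; on any
 * failure back off instead of blocking. Refcounting omitted. */
#include <stddef.h>
#include <pthread.h>

struct node {
    pthread_rwlock_t lock;
};

struct tree {
    struct node *root;  /* may be swapped by writers */
};

static struct node *try_read_lock_root(struct tree *t)
{
    for (;;) {
        struct node *n = __atomic_load_n(&t->root, __ATOMIC_ACQUIRE);

        if (pthread_rwlock_tryrdlock(&n->lock) != 0)
            return NULL;    /* would block: report -EAGAIN instead */
        if (n == __atomic_load_n(&t->root, __ATOMIC_ACQUIRE))
            return n;       /* still the root, read lock held */
        /* Root changed under us: drop the lock and retry. */
        pthread_rwlock_unlock(&n->lock);
    }
}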
+ */ +struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + if (!btrfs_try_tree_read_lock(eb)) { + free_extent_buffer(eb); + return ERR_PTR(-EAGAIN); + } + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* * DREW locks * ========== * diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index ab268be09bb5..11c2269b4b6f 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -78,6 +78,82 @@ enum btrfs_lock_nesting { BTRFS_NESTING_MAX, }; +enum btrfs_lockdep_trans_states { + BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_UNBLOCKED, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, + BTRFS_LOCKDEP_TRANS_COMPLETED, +}; + +/* + * Lockdep annotation for wait events. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * This macro is used to annotate a wait event. In this case a thread acquires + * the lockdep map as writer (exclusive lock) because it has to block until all + * the threads that hold the lock as readers signal the condition for the wait + * event and release their locks. + */ +#define btrfs_might_wait_for_event(owner, lock) \ + do { \ + rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ + rwsem_release(&owner->lock##_map, _THIS_IP_); \ + } while (0) + +/* + * Protection for the resource/condition of a wait event. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * Many threads can modify the condition for the wait event at the same time + * and signal the threads that block on the wait event. The threads that modify + * the condition and do the signaling acquire the lock as readers (shared + * lock). + */ +#define btrfs_lockdep_acquire(owner, lock) \ + rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) + +/* + * Used after signaling the condition for a wait event to release the lockdep + * map held by a reader thread. + */ +#define btrfs_lockdep_release(owner, lock) \ + rwsem_release(&owner->lock##_map, _THIS_IP_) + +/* + * Macros for the transaction states wait events, similar to the generic wait + * event macros. + */ +#define btrfs_might_wait_for_state(owner, i) \ + do { \ + rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ + } while (0) + +#define btrfs_trans_state_lockdep_acquire(owner, i) \ + rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) + +#define btrfs_trans_state_lockdep_release(owner, i) \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) + +/* Initialization of the lockdep map */ +#define btrfs_lockdep_init_map(owner, lock) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ + } while (0) + +/* Initialization of the transaction states lockdep maps. 
*/ +#define btrfs_state_lockdep_init_map(owner, lock, state) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ + &lock##_key, 0); \ + } while (0) + static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); @@ -94,6 +170,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 89bc5f825e0a..d5e78cbc8fbc 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -13,8 +13,10 @@ #include <linux/bio.h> #include <linux/lzo.h> #include <linux/refcount.h> +#include "messages.h" #include "compression.h" #include "ctree.h" +#include "super.h" #define LZO_LEN 4 @@ -425,7 +427,7 @@ out: return ret; } -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c new file mode 100644 index 000000000000..625bbbbb2608 --- /dev/null +++ b/fs/btrfs/messages.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "fs.h" +#include "messages.h" +#include "discard.h" +#include "transaction.h" +#include "space-info.h" +#include "super.h" + +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem state. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + +/* + * Generally the error codes correspond to their respective errors, but there + * are a few special cases. + * + * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for + * instance will return EUCLEAN if any of the blocks are corrupted in + * a way that is problematic. We want to reserve EUCLEAN for these + * sort of corruptions. + * + * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we + * need to use EROFS for this case. 
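The state string built by btrfs_state_to_string() above compacts the set bits of fs_state into a short suffix such as ": state EA". A standalone demo of the same encoding (bit numbers here are illustrative, not the kernel's enum values):

/* Each set bit maps to one character; bits mapped to 0 are skipped. */
#include <stdio.h>

static const char fs_state_chars[] = { 'E', 'M', 0, 'A', 'R', 0, 'C', 'L' };

static void state_to_string(unsigned long fs_state, char *buf)
{
    char *curr = buf;

    for (unsigned int bit = 0; bit < sizeof(fs_state_chars); bit++)
        if ((fs_state & (1UL << bit)) && fs_state_chars[bit])
            *curr++ = fs_state_chars[bit];
    *curr = '\0';
}

int main(void)
{
    char buf[16];

    state_to_string((1UL << 0) | (1UL << 3), buf);  /* ERROR + ABORTED */
    printf(": state %s\n", buf);    /* prints ": state EA" */
    return 0;
}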
We will have no idea of the + * original failure, that will have been reported at the time we tripped + * over the error. Each subsequent error that doesn't have any context + * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. + */ +const char * __attribute_const__ btrfs_decode_error(int errno) +{ + char *errstr = "unknown"; + + switch (errno) { + case -ENOENT: /* -2 */ + errstr = "No such entry"; + break; + case -EIO: /* -5 */ + errstr = "IO failure"; + break; + case -ENOMEM: /* -12*/ + errstr = "Out of memory"; + break; + case -EEXIST: /* -17 */ + errstr = "Object already exists"; + break; + case -ENOSPC: /* -28 */ + errstr = "No space left"; + break; + case -EROFS: /* -30 */ + errstr = "Readonly filesystem"; + break; + case -EOPNOTSUPP: /* -95 */ + errstr = "Operation not supported"; + break; + case -EUCLEAN: /* -117 */ + errstr = "Filesystem corrupted"; + break; + case -EDQUOT: /* -122 */ + errstr = "Quota exceeded"; + break; + } + + return errstr; +} + +/* + * __btrfs_handle_fs_error decodes expected errors from the caller and + * invokes the appropriate error response. + */ +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; + const char *errstr; +#endif + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt); +#endif + + /* + * Special case: if the error is EROFS, and we're already under + * SB_RDONLY, then it is safe here. + */ + if (errno == -EROFS && sb_rdonly(sb)) + return; + +#ifdef CONFIG_PRINTK + errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); + va_end(args); + } else { + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); + } +#endif + + /* + * Today we only save the error info to memory. Long term we'll also + * send it down to the disk. + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + + /* Don't go through full error handling during mount. */ + if (!(sb->s_flags & SB_BORN)) + return; + + if (sb_rdonly(sb)) + return; + + btrfs_discard_stop(fs_info); + + /* Handle error by forcing the filesystem readonly. */ + btrfs_set_sb_rdonly(sb); + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not canceled here + * although there is no way to update the progress. It would add the + * risk of a deadlock, therefore the canceling is omitted. The only + * penalty is that some I/O remains active until the procedure + * completes. The next time when the filesystem is mounted writable + * again, the device replace operation continues. + */ +} + +#ifdef CONFIG_PRINTK +static const char * const logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. 
+ */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; + struct va_format vaf; + va_list args; + int kern_level; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + + va_start(args, fmt); + + while ((kern_level = printk_get_level(fmt)) != 0) { + size_t size = printk_skip_level(fmt) - fmt; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } + fmt += size; + } + + vaf.fmt = fmt; + vaf.va = &args; + + if (__ratelimit(ratelimit)) { + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } + } + + va_end(args); +} +#endif + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line) +{ + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +} +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. 
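A typical call site follows directly from these rules: abort at the point of failure so the first stack trace is meaningful, then unwind normally. A schematic sketch against the helpers in this file, not a specific kernel function (do_some_tree_modification() is hypothetical, and this fragment is not compilable on its own):

static int example_update(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
{
    int ret;

    ret = do_some_tree_modification(trans, root);   /* hypothetical */
    if (ret) {
        /* Abort first: the stack trace then points at this site. */
        btrfs_abort_transaction(trans, ret);
        return ret;
    }
    /* Completes normally; returns -EIO if the transaction was aborted. */
    return btrfs_end_transaction(trans);
}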
+ * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an + * alert, and either panics or BUGs, depending on mount options. + */ +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char *s_id = "<unknown>"; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(errno); + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); + + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); + va_end(args); + /* Caller calls BUG() */ +} diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h new file mode 100644 index 000000000000..190af1f698d9 --- /dev/null +++ b/fs/btrfs/messages.h @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MESSAGES_H +#define BTRFS_MESSAGES_H + +#include <linux/types.h> + +struct btrfs_fs_info; +struct btrfs_trans_handle; + +static inline __printf(2, 3) __cold +void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} + +#ifdef CONFIG_PRINTK + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#else + +#define btrfs_printk(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) +#endif + +#define btrfs_emerg(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) 
\ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + fs_info, KERN_DEBUG fmt, ##args) +#elif defined(DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#endif + +#define btrfs_printk_in_rcu(fs_info, fmt, args...) 
\ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_no_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line); + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) (void)(expr) +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); + +__printf(5, 6) +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +const char * __attribute_const__ btrfs_decode_error(int errno); + +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); + +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. + * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + +#endif diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 340f995652f2..768583a440e1 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -10,6 +10,14 @@ #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +/* + * Enumerate bits using enum autoincrement. 
Define the @name as the n-th bit. + */ +#define ENUM_BIT(name) \ + __ ## name ## _BIT, \ + name = (1U << __ ## name ## _BIT), \ + __ ## name ## _SEQ = __ ## name ## _BIT + static inline void cond_wake_up(struct wait_queue_head *wq) { /* @@ -32,22 +40,10 @@ static inline void cond_wake_up_nomb(struct wait_queue_head *wq) wake_up(wq); } -static inline u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - return div_u64(num, 10); -} - -static inline u64 div_factor_fine(u64 num, int factor) +static inline u64 mult_perc(u64 num, u32 percent) { - if (factor == 100) - return num; - num *= factor; - return div_u64(num, 100); + return div_u64(num * percent, 100); } - /* Copy of is_power_of_two that is 64bit safe */ static inline bool is_power_of_two_u64(u64 n) { @@ -88,6 +84,41 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) return NULL; } +/* + * Search @root from an entry that starts or comes after @bytenr. + * + * @root: the root to search. + * @bytenr: bytenr to search from. + * + * Return the rb_node that start at or after @bytenr. If there is no entry at + * or after @bytner return NULL. + */ +static inline struct rb_node *rb_simple_search_first(struct rb_root *root, + u64 bytenr) +{ + struct rb_node *node = root->rb_node, *ret = NULL; + struct rb_simple_node *entry, *ret_entry = NULL; + + while (node) { + entry = rb_entry(node, struct rb_simple_node, rb_node); + + if (bytenr < entry->bytenr) { + if (!ret || entry->bytenr < ret_entry->bytenr) { + ret = node; + ret_entry = entry; + } + + node = node->rb_left; + } else if (bytenr > entry->bytenr) { + node = node->rb_right; + } else { + return node; + } + } + + return ret; +} + static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, struct rb_node *node) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1952ac85222c..57d8c72737e1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -7,6 +7,7 @@ #include <linux/blkdev.h> #include <linux/writeback.h> #include <linux/sched/mm.h> +#include "messages.h" #include "misc.h" #include "ctree.h" #include "transaction.h" @@ -17,6 +18,8 @@ #include "delalloc-space.h" #include "qgroup.h" #include "subpage.h" +#include "file.h" +#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -143,7 +146,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/** +/* * Add an ordered extent to the per-inode tree. * * @inode: Inode that this extent is for. @@ -501,7 +504,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) - btrfs_add_delayed_iput(entry->inode); + btrfs_add_delayed_iput(BTRFS_I(entry->inode)); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -524,7 +527,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; bool pending; + bool freespace_inode; + /* + * If this is a free space inode the thread has not acquired the ordered + * extents lockdep map. + */ + freespace_inode = btrfs_is_free_space_inode(btrfs_inode); + + btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); /* This is paired with btrfs_add_ordered_extent. 
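The ENUM_BIT() helper added to misc.h above can be checked with a quick standalone expansion: each invocation defines the next free bit, and the trailing _SEQ enumerator keeps the counter so the following invocation continues from it.

/* Expansion check for ENUM_BIT(): each name becomes the next bit. */
#include <stdio.h>

#define ENUM_BIT(name)                          \
    __ ## name ## _BIT,                         \
    name = (1U << __ ## name ## _BIT),          \
    __ ## name ## _SEQ = __ ## name ## _BIT

enum {
    ENUM_BIT(FLAG_A),   /* __FLAG_A_BIT = 0, FLAG_A = 1 */
    ENUM_BIT(FLAG_B),   /* __FLAG_B_BIT = 1, FLAG_B = 2 */
    ENUM_BIT(FLAG_C),   /* __FLAG_C_BIT = 2, FLAG_C = 4 */
};

int main(void)
{
    printf("%d %d %d\n", FLAG_A, FLAG_B, FLAG_C);   /* 1 2 4 */
    return 0;
}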
*/ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); @@ -580,6 +591,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, } } + btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered); + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; @@ -594,6 +607,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, } spin_unlock(&root->ordered_extent_lock); wake_up(&entry->wait); + if (!freespace_inode) + btrfs_lockdep_release(fs_info, btrfs_ordered_extent); } static void btrfs_run_ordered_extent_work(struct btrfs_work *work) @@ -712,10 +727,17 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; struct btrfs_inode *inode = BTRFS_I(entry->inode); + bool freespace_inode; trace_btrfs_ordered_extent_start(inode, entry); /* + * If this is a free space inode do not take the ordered extents lockdep + * map. + */ + freespace_inode = btrfs_is_free_space_inode(inode); + + /* * pages in the range can be dirty, clean or writeback. We * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them @@ -723,6 +745,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } @@ -740,11 +764,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *ordered; if (start + len < start) { - orig_end = INT_LIMIT(loff_t); + orig_end = OFFSET_MAX; } else { orig_end = start + len - 1; - if (orig_end > INT_LIMIT(loff_t)) - orig_end = INT_LIMIT(loff_t); + if (orig_end > OFFSET_MAX) + orig_end = OFFSET_MAX; } /* start IO across the range first to instantiate any delalloc @@ -998,17 +1022,18 @@ out: } /* - * btrfs_flush_ordered_range - Lock the passed range and ensures all pending - * ordered extents in it are run to completion. + * Lock the passed range and ensures all pending ordered extents in it are run + * to completion. * * @inode: Inode whose ordered tree is to be searched * @start: Beginning of range to flush * @end: Last byte of range to lock * @cached_state: If passed, will return the extent state responsible for the - * locked range. It's the caller's responsibility to free the cached state. + * locked range. It's the caller's responsibility to free the + * cached state. * - * This function always returns with the given range locked, ensuring after it's - * called no order extent can be pending. + * Always return with the given range locked, ensuring after it's called no + * order extent can be pending. 
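The lock-and-flush loop described above follows a classic shape: take the range lock, look for pending work, and if any exists drop the lock, wait for that work to finish, and retry. A rough userspace analogue using a condition variable (hypothetical names; pthread_cond_wait drops and re-takes the mutex, standing in for the unlock/wait/relock steps):

/* Wait until nothing is pending, then return with the range locked.
 * A completer would clear work_pending and broadcast work_done while
 * holding range_lock; the caller unlocks range_lock when finished. */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work_done = PTHREAD_COND_INITIALIZER;
static bool work_pending;   /* stands in for a pending ordered extent */

static void lock_and_flush_range(void)
{
    pthread_mutex_lock(&range_lock);
    while (work_pending)
        pthread_cond_wait(&work_done, &range_lock);
    /* Returns with the range locked and nothing pending. */
}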
*/ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, @@ -1022,7 +1047,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, cachedp = cached_state; while (1) { - lock_extent_bits(&inode->io_tree, start, end, cachedp); + lock_extent(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -1035,12 +1060,38 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, refcount_dec(&cache->refs); break; } - unlock_extent_cached(&inode->io_tree, start, end, cachedp); + unlock_extent(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } +/* + * Lock the passed range and ensure all pending ordered extents in it are run + * to completion in nowait mode. + * + * Return true if btrfs_lock_ordered_range does not return any extents, + * otherwise false. + */ +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + struct btrfs_ordered_extent *ordered; + + if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) + return false; + + ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); + if (!ordered) + return true; + + btrfs_put_ordered_extent(ordered); + unlock_extent(&inode->io_tree, start, end, cached_state); + + return false; +} + + static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, u64 len) { diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 87792f85e2c4..89f82b78f590 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -160,18 +160,6 @@ struct btrfs_ordered_extent { struct block_device *bdev; }; -/* - * calculates the total size you need to allocate for an ordered sum - * structure spanning 'bytes' in the file - */ -static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, - unsigned long bytes) -{ - int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - - return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; -} - static inline void btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) { @@ -218,6 +206,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state); int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, u64 post); int __init ordered_data_init(void); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index aa534108c1e2..7a1b021b5669 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -5,6 +5,7 @@ #include "ctree.h" #include "disk-io.h" +#include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h new file mode 100644 index 000000000000..3faab5cbb59a --- /dev/null +++ b/fs/btrfs/orphan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ORPHAN_H +#define BTRFS_ORPHAN_H + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +#endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 
dd8777872143..b93c96213304 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -3,9 +3,12 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "accessors.h" +#include "tree-checker.h" struct root_name_map { u64 id; @@ -240,9 +243,9 @@ void btrfs_print_leaf(struct extent_buffer *l) case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu type %u\n", + pr_info("\t\tdir oid %llu flags %u\n", found_key.objectid, - btrfs_dir_type(l, di)); + btrfs_dir_flags(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); @@ -384,14 +387,16 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) if (!follow) return; for (i = 0; i < nr; i++) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { + .level = level - 1, + .transid = btrfs_node_ptr_generation(c, i), + .owner_root = btrfs_header_owner(c), + .has_first_key = true + }; struct extent_buffer *next; - btrfs_node_key_to_cpu(c, &first_key, i); - next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), - btrfs_header_owner(c), - btrfs_node_ptr_generation(c, i), - level - 1, &first_key); + btrfs_node_key_to_cpu(c, &check.first_key, i); + next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check); if (IS_ERR(next)) continue; if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index a2ec8ecae8de..0755af0e53e3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,12 +4,17 @@ */ #include <linux/hashtable.h> +#include "messages.h" #include "props.h" #include "btrfs_inode.h" #include "transaction.h" #include "ctree.h" #include "xattr.h" #include "compression.h" +#include "space-info.h" +#include "fs.h" +#include "accessors.h" +#include "super.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); @@ -270,11 +275,8 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 ino = btrfs_ino(BTRFS_I(inode)); - int ret; - - ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); - return ret; + return iterate_object_props(root, path, ino, inode_prop_iterator, inode); } static int prop_compression_validate(const struct btrfs_inode *inode, @@ -456,7 +458,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return 0; } -void __init btrfs_props_init(void) +int __init btrfs_props_init(void) { int i; @@ -466,5 +468,6 @@ void __init btrfs_props_init(void) hash_add(prop_handlers_ht, &p->node, h); } + return 0; } diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index ca9dd3df129b..6e283196e38a 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -8,7 +8,7 @@ #include "ctree.h" -void __init btrfs_props_init(void); +int __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index db723c0026bd..5c636e00d77d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -24,6 +24,11 @@ #include "block-group.h" #include "sysfs.h" #include "tree-mod-log.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "tree-checker.h" /* * Helpers to access qgroup reservation @@ -275,7 +280,7 @@ static int 
__add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p } /* - * Add relation specified by two qgoup ids. + * Add relation specified by two qgroup ids. * * Must be called with qgroup_lock held. * @@ -333,6 +338,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, } #endif +static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) +{ + fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | + BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); +} + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -401,7 +413,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { - flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } @@ -419,7 +431,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { btrfs_err(fs_info, "inconsistent qgroup config"); - flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); } if (!qgroup) { qgroup = add_qgroup_rb(fs_info, found_key.offset); @@ -878,7 +890,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); - btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); @@ -1052,7 +1065,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); btrfs_mark_buffer_dirty(leaf); @@ -1174,6 +1188,21 @@ out_add_root: fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); + } else { + /* + * We have set both BTRFS_FS_QUOTA_ENABLED and + * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with + * -EINPROGRESS. That can happen because someone started the + * rescan worker by calling quota rescan ioctl before we + * attempted to initialize the rescan worker. Failure due to + * quotas disabled in the meanwhile is not possible, because + * we are holding a write lock on fs_info->subvol_sem, which + * is also acquired when disabling quotas. + * Ignore such error, and any other error would need to undo + * everything we did in the transaction we just committed. 
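The quota-enable transaction above is driven from userspace through BTRFS_IOC_QUOTA_CTL. A hedged sketch using the uapi header (requires CAP_SYS_ADMIN and an fd somewhere on a btrfs filesystem, e.g. the subvolume root):

/* Enable qgroups via BTRFS_IOC_QUOTA_CTL. */
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
    struct btrfs_ioctl_quota_ctl_args args;
    int fd;

    if (argc != 2)
        return 1;
    fd = open(argv[1], O_RDONLY);
    if (fd < 0)
        return 1;

    memset(&args, 0, sizeof(args));
    args.cmd = BTRFS_QUOTA_CTL_ENABLE;
    if (ioctl(fd, BTRFS_IOC_QUOTA_CTL, &args) < 0)
        perror("BTRFS_IOC_QUOTA_CTL");
    close(fd);
    return 0;
}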
+ */ + ASSERT(ret == -EINPROGRESS); + ret = 0; } out_free_path: @@ -1255,6 +1284,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -1717,7 +1747,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, ret = update_qgroup_limit_item(trans, qgroup); if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } @@ -1765,8 +1795,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { - struct ulist *old_root; - u64 bytenr = qrecord->bytenr; + struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; /* @@ -1790,10 +1819,15 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); - ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, - true); + if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + return 0; + + ctx.bytenr = qrecord->bytenr; + ctx.fs_info = trans->fs_info; + + ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(trans->fs_info); btrfs_warn(trans->fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); @@ -1807,12 +1841,12 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * * So modifying qrecord->old_roots is safe here */ - qrecord->old_roots = old_root; + qrecord->old_roots = ctx.roots; return 0; } int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag) + u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; @@ -1822,7 +1856,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - record = kzalloc(sizeof(*record), gfp_flag); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM; @@ -1874,8 +1908,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, - GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); if (ret) return ret; } @@ -2074,12 +2107,11 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, * blocks for qgroup accounting. 
 */
 	ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
-			nodesize, GFP_NOFS);
+			nodesize);
 	if (ret < 0)
 		goto out;
-	ret = btrfs_qgroup_trace_extent(trans,
-			dst_path->nodes[dst_level]->start,
-			nodesize, GFP_NOFS);
+	ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
+			nodesize);
 	if (ret < 0)
 		goto out;
@@ -2269,7 +2301,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
 out:
 	btrfs_free_path(dst_path);
 	if (ret < 0)
-		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		qgroup_mark_inconsistent(fs_info);
 	return ret;
 }
@@ -2280,6 +2312,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int ret = 0;
 	int level;
+	u8 drop_subptree_thres;
 	struct extent_buffer *eb = root_eb;
 	struct btrfs_path *path = NULL;
@@ -2289,8 +2322,31 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 		return 0;
+	spin_lock(&fs_info->qgroup_lock);
+	drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
+	spin_unlock(&fs_info->qgroup_lock);
+
+	/*
+	 * This function only gets called for snapshot drop; if we hit a high
+	 * node here, it means we are going to change ownership for quite a lot
+	 * of extents, which will greatly slow down btrfs_commit_transaction().
+	 *
+	 * So if we find a high tree here, we just skip the accounting and
+	 * mark qgroup inconsistent.
+	 */
+	if (root_level >= drop_subptree_thres) {
+		qgroup_mark_inconsistent(fs_info);
+		return 0;
+	}
+
 	if (!extent_buffer_uptodate(root_eb)) {
-		ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
+		struct btrfs_tree_parent_check check = {
+			.has_first_key = false,
+			.transid = root_gen,
+			.level = root_level
+		};
+
+		ret = btrfs_read_extent_buffer(root_eb, &check);
 		if (ret)
 			goto out;
 	}
@@ -2345,8 +2401,7 @@ walk_down:
 			path->locks[level] = BTRFS_READ_LOCK;
 			ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
-							fs_info->nodesize,
-							GFP_NOFS);
+							fs_info->nodesize);
 			if (ret)
 				goto out;
 		}
@@ -2604,7 +2659,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 	 * If quotas get disabled meanwhile, the resources need to be freed and
 	 * we can't just exit here.
 	 */
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
 		goto out_free;
 
 	if (new_roots) {
@@ -2700,18 +2756,24 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 		num_dirty_extents++;
 		trace_btrfs_qgroup_account_extents(fs_info, record);
 
-		if (!ret) {
+		if (!ret && !(fs_info->qgroup_flags &
+			      BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
+			struct btrfs_backref_walk_ctx ctx = { 0 };
+
+			ctx.bytenr = record->bytenr;
+			ctx.fs_info = fs_info;
+
 			/*
 			 * Old roots should be searched when inserting qgroup
 			 * extent record
 			 */
 			if (WARN_ON(!record->old_roots)) {
 				/* Search commit root to find old_roots */
-				ret = btrfs_find_all_roots(NULL, fs_info,
-						record->bytenr, 0,
-						&record->old_roots, false);
+				ret = btrfs_find_all_roots(&ctx, false);
 				if (ret < 0)
 					goto cleanup;
+				record->old_roots = ctx.roots;
+				ctx.roots = NULL;
 			}
 
 			/* Free the reserved data space */
@@ -2724,10 +2786,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 			 * which doesn't lock tree or delayed_refs and search
 			 * current root. It's safe inside commit_transaction().
*/ - ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); + ctx.trans = trans; + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + new_roots = ctx.roots; if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); ulist_del(record->old_roots, qgroup_to_skip, @@ -2773,12 +2836,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_info_item(trans, qgroup); if (ret) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); ret = update_qgroup_limit_item(trans, qgroup); if (ret) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) @@ -2789,7 +2850,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) ret = update_qgroup_status_item(trans); if (ret) - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -2905,14 +2966,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; - ret = update_qgroup_limit_item(trans, dstgroup); - if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_info(fs_info, - "unable to update quota limit for %llu", - dstgroup->qgroupid); - goto unlock; - } + qgroup_dirty(fs_info, dstgroup); } if (srcid) { @@ -3013,7 +3067,7 @@ out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -3202,7 +3256,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; - struct ulist *roots = NULL; u64 num_bytes; bool done; int slot; @@ -3252,6 +3305,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -3261,13 +3316,15 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, else num_bytes = found.offset; - ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots, false); + ctx.bytenr = found.objectid; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ ret = btrfs_qgroup_account_extent(trans, found.objectid, - num_bytes, NULL, roots); + num_bytes, NULL, ctx.roots); if (ret < 0) goto out; } @@ -3286,7 +3343,8 @@ static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { return btrfs_fs_closing(fs_info) || test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || - !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3351,7 +3409,8 @@ out: } mutex_lock(&fs_info->qgroup_rescan_lock); - if (!stopped) + if (!stopped || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) fs_info->qgroup_flags &= 
~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { ret = update_qgroup_status_item(trans); @@ -3362,6 +3421,7 @@ out: } } fs_info->qgroup_rescan_running = false; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; complete_all(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3372,6 +3432,8 @@ out: if (stopped) { btrfs_info(fs_info, "qgroup scan paused"); + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { + btrfs_info(fs_info, "qgroup scan cancelled"); } else if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", err > 0 ? " (inconsistency flag cleared)" : ""); @@ -3434,6 +3496,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -4231,8 +4295,7 @@ out_unlock: spin_unlock(&blocks->lock); out: if (ret < 0) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -4247,6 +4310,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct extent_buffer *subvol_eb) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; @@ -4295,10 +4359,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, blocks->swapped = swapped; spin_unlock(&blocks->lock); + check.level = block->level; + check.transid = block->reloc_generation; + check.has_first_key = true; + memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); + /* Read out reloc subtree root */ - reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0, - block->reloc_generation, block->level, - &block->first_key); + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); if (IS_ERR(reloc_eb)) { ret = PTR_ERR(reloc_eb); reloc_eb = NULL; @@ -4319,7 +4386,7 @@ out: btrfs_err_rl(fs_info, "failed to account subtree at bytenr %llu: %d", subvol_eb->start, ret); - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); } return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 0c4dd2a9af96..7bffa10589d6 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -11,6 +11,7 @@ #include <linux/kobject.h> #include "ulist.h" #include "delayed-ref.h" +#include "misc.h" /* * Btrfs qgroup overview @@ -100,6 +101,9 @@ * subtree rescan for them. */ +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) + /* * Record a dirty extent, and info qgroup to update quota on it * TODO: Use kmem cache to alloc it. 
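A minimal standalone sketch, not part of the patch, of how the two runtime-only bits defined above interact with the persisted status flags: they share the fs_info->qgroup_flags word, which is why update_qgroup_status_item() and btrfs_quota_enable() now mask with BTRFS_QGROUP_STATUS_FLAGS_MASK before writing the status item. The mask definition and flag values here are assumptions for illustration:

#include <stdint.h>

/* Persisted qgroup status flags (illustrative values). */
#define QG_STATUS_FLAG_ON             (1ULL << 0)
#define QG_STATUS_FLAG_RESCAN         (1ULL << 1)
#define QG_STATUS_FLAG_INCONSISTENT   (1ULL << 2)
/* Runtime-only flags, matching the bits defined in the hunk above. */
#define QG_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 3)
#define QG_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 4)

/* Assumed shape of BTRFS_QGROUP_STATUS_FLAGS_MASK: persistent bits only. */
#define QG_STATUS_FLAGS_MASK (QG_STATUS_FLAG_ON | \
			      QG_STATUS_FLAG_RESCAN | \
			      QG_STATUS_FLAG_INCONSISTENT)

/*
 * What update_qgroup_status_item() effectively stores on disk: the
 * runtime bits stay in memory, only the persistent bits are written.
 */
static inline uint64_t qgroup_flags_to_disk(uint64_t qgroup_flags)
{
	return qgroup_flags & QG_STATUS_FLAGS_MASK;
}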
@@ -239,9 +243,11 @@ static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) /* * For qgroup event trace points only */ -#define QGROUP_RESERVE (1<<0) -#define QGROUP_RELEASE (1<<1) -#define QGROUP_FREE (1<<2) +enum { + ENUM_BIT(QGROUP_RESERVE), + ENUM_BIT(QGROUP_RELEASE), + ENUM_BIT(QGROUP_FREE), +}; int btrfs_quota_enable(struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); @@ -315,7 +321,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * (NULL trans) */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag); + u64 num_bytes); /* * Inform qgroup to trace all leaf items of data diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2feb5c20641a..2d90a6b5eb00 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,12 +13,15 @@ #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" #include "raid56.h" #include "async-thread.h" +#include "file-item.h" +#include "btrfs_inode.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -63,19 +66,45 @@ struct sector_ptr { unsigned int uptodate:8; }; -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); -static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct work_struct *work); -static void read_rebuild_work(struct work_struct *work); -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); -static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void rmw_rbio_work(struct work_struct *work); +static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check); -static void scrub_parity_work(struct work_struct *work); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static void scrub_rbio_work_locked(struct work_struct *work); + +static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) +{ + bitmap_free(rbio->error_bitmap); + kfree(rbio->stripe_pages); + kfree(rbio->bio_sectors); + kfree(rbio->stripe_sectors); + kfree(rbio->finish_pointers); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + if (!refcount_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + + btrfs_put_bioc(rbio->bioc); + free_raid_bio_pointers(rbio); + kfree(rbio); +} static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { @@ -146,8 +175,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) + if (!rbio->bio_sectors[i].page) { + /* + * Even if the sector is not covered by bio, if it is + * a data sector it should still be uptodate as it is + * read from disk. 
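+			 * (Sector indexes run stripe by stripe, so the data
+			 * stripe sectors are exactly the indexes below
+			 * nr_data * stripe_nsectors, which is what the
+			 * assertion below checks.)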
+ */ + if (i < rbio->nr_data * rbio->stripe_nsectors) + ASSERT(rbio->stripe_sectors[i].uptodate); continue; + } ASSERT(rbio->stripe_sectors[i].page); memcpy_page(rbio->stripe_sectors[i].page, @@ -234,6 +271,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, dest->stripe_sectors[i].uptodate = true; } +static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) +{ + const int sector_nr = (page_nr << PAGE_SHIFT) >> + rbio->bioc->fs_info->sectorsize_bits; + + /* + * We have ensured PAGE_SIZE is aligned with sectorsize, thus + * we won't have a page which is half data half parity. + * + * Thus if the first sector of the page belongs to data stripes, then + * the full page belongs to data stripes. + */ + return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -244,16 +296,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; - struct page *s; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { - s = src->stripe_pages[i]; - if (!s || !full_page_sectors_uptodate(src, i)) + struct page *p = src->stripe_pages[i]; + + /* + * We don't need to steal P/Q pages as they will always be + * regenerated for RMW or full write anyway. + */ + if (!is_data_stripe_page(src, i)) continue; + /* + * If @src already has RBIO_CACHE_READY_BIT, it should have + * all data stripe pages present and uptodate. + */ + ASSERT(p); + ASSERT(full_page_sectors_uptodate(src, i)); steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); @@ -275,7 +337,6 @@ static void merge_rbio(struct btrfs_raid_bio *dest, /* Also inherit the bitmaps from @victim. */ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); - dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -337,7 +398,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) spin_unlock(&h->lock); if (freeit) - __free_raid_bio(rbio); + free_raid_bio(rbio); } /* @@ -527,28 +588,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + last->operation == BTRFS_RBIO_READ_REBUILD) return 0; - if (last->operation == BTRFS_RBIO_READ_REBUILD) { - int fa = last->faila; - int fb = last->failb; - int cur_fa = cur->faila; - int cur_fb = cur->failb; - - if (last->faila >= last->failb) { - fa = last->failb; - fb = last->faila; - } - - if (cur->faila >= cur->failb) { - cur_fa = cur->failb; - cur_fb = cur->faila; - } - - if (fa != cur_fa || fb != cur_fb) - return 0; - } return 1; } @@ -685,10 +728,12 @@ out: if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) - __free_raid_bio(freeit); + free_raid_bio(freeit); return ret; } +static void recover_rbio_work_locked(struct work_struct *work); + /* * called as rmw or parity rebuild is completed. 
If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
@@ -746,16 +791,16 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			spin_unlock_irqrestore(&h->lock, flags);
 
 			if (next->operation == BTRFS_RBIO_READ_REBUILD)
-				start_async_work(next, read_rebuild_work);
+				start_async_work(next, recover_rbio_work_locked);
 			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
 				steal_rbio(rbio, next);
-				start_async_work(next, read_rebuild_work);
+				start_async_work(next, recover_rbio_work_locked);
 			} else if (next->operation == BTRFS_RBIO_WRITE) {
 				steal_rbio(rbio, next);
-				start_async_work(next, rmw_work);
+				start_async_work(next, rmw_rbio_work_locked);
 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 				steal_rbio(rbio, next);
-				start_async_work(next, scrub_parity_work);
+				start_async_work(next, scrub_rbio_work_locked);
 			}
 
 			goto done_nolock;
@@ -770,28 +815,6 @@ done_nolock:
 		remove_rbio_from_cache(rbio);
 }
 
-static void __free_raid_bio(struct btrfs_raid_bio *rbio)
-{
-	int i;
-
-	if (!refcount_dec_and_test(&rbio->refs))
-		return;
-
-	WARN_ON(!list_empty(&rbio->stripe_cache));
-	WARN_ON(!list_empty(&rbio->hash_list));
-	WARN_ON(!bio_list_empty(&rbio->bio_list));
-
-	for (i = 0; i < rbio->nr_pages; i++) {
-		if (rbio->stripe_pages[i]) {
-			__free_page(rbio->stripe_pages[i]);
-			rbio->stripe_pages[i] = NULL;
-		}
-	}
-
-	btrfs_put_bioc(rbio->bioc);
-	kfree(rbio);
-}
-
 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
 {
 	struct bio *next;
@@ -814,8 +837,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 	struct bio *cur = bio_list_get(&rbio->bio_list);
 	struct bio *extra;
 
-	if (rbio->generic_bio_cnt)
-		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
+	kfree(rbio->csum_buf);
+	bitmap_free(rbio->csum_bitmap);
+	rbio->csum_buf = NULL;
+	rbio->csum_bitmap = NULL;
+
 	/*
 	 * Clear the data bitmap, as the rbio may be cached for later usage.
 	 * do this before unlock_stripe() so there will be no new bio
 	 */
 	unlock_stripe(rbio);
 	extra = bio_list_get(&rbio->bio_list);
-	__free_raid_bio(rbio);
+	free_raid_bio(rbio);
 
 	rbio_endio_bio_list(cur, err);
 	if (extra)
@@ -841,36 +867,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 }
 
 /*
- * end io function used by finish_rmw. When we finally
- * get here, we've written a full stripe
- */
-static void raid_write_end_io(struct bio *bio)
-{
-	struct btrfs_raid_bio *rbio = bio->bi_private;
-	blk_status_t err = bio->bi_status;
-	int max_errors;
-
-	if (err)
-		fail_bio_stripe(rbio, bio);
-
-	bio_put(bio);
-
-	if (!atomic_dec_and_test(&rbio->stripes_pending))
-		return;
-
-	err = BLK_STS_OK;
-
-	/* OK, we have read all the stripes we need to. */
-	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
-		     0 : rbio->bioc->max_errors;
-	if (atomic_read(&rbio->error) > max_errors)
-		err = BLK_STS_IOERR;
-
-	rbio_orig_end_io(rbio, err);
-}
-
-/**
- * Get a sector pointer specified by its @stripe_nr and @sector_nr
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripe)
@@ -922,7 +919,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
 	const unsigned int num_sectors = stripe_nsectors * real_stripes;
 	struct btrfs_raid_bio *rbio;
-	void *p;
 
 	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
 	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
@@ -932,47 +928,41 @@
 	 */
 	ASSERT(stripe_nsectors <= BITS_PER_LONG);
 
-	rbio = kzalloc(sizeof(*rbio) +
-		       sizeof(*rbio->stripe_pages) * num_pages +
-		       sizeof(*rbio->bio_sectors) * num_sectors +
-		       sizeof(*rbio->stripe_sectors) * num_sectors +
-		       sizeof(*rbio->finish_pointers) * real_stripes,
-		       GFP_NOFS);
+	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
 	if (!rbio)
 		return ERR_PTR(-ENOMEM);
+	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
+				     GFP_NOFS);
+	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
+				    GFP_NOFS);
+	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
+				       GFP_NOFS);
+	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
+	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
+
+	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
+	    !rbio->finish_pointers || !rbio->error_bitmap) {
+		free_raid_bio_pointers(rbio);
+		kfree(rbio);
+		return ERR_PTR(-ENOMEM);
+	}
 	bio_list_init(&rbio->bio_list);
+	init_waitqueue_head(&rbio->io_wait);
 	INIT_LIST_HEAD(&rbio->plug_list);
 	spin_lock_init(&rbio->bio_list_lock);
 	INIT_LIST_HEAD(&rbio->stripe_cache);
 	INIT_LIST_HEAD(&rbio->hash_list);
+	btrfs_get_bioc(bioc);
 	rbio->bioc = bioc;
 	rbio->nr_pages = num_pages;
 	rbio->nr_sectors = num_sectors;
 	rbio->real_stripes = real_stripes;
 	rbio->stripe_npages = stripe_npages;
 	rbio->stripe_nsectors = stripe_nsectors;
-	rbio->faila = -1;
-	rbio->failb = -1;
 	refcount_set(&rbio->refs, 1);
-	atomic_set(&rbio->error, 0);
 	atomic_set(&rbio->stripes_pending, 0);
 
-	/*
-	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
-	 * we allocated past the end of the rbio.
-	 */
-	p = rbio + 1;
-#define CONSUME_ALLOC(ptr, count) do { \
-	ptr = p; \
-	p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
-	} while (0)
-	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
-	CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
-	CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
-	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
-#undef CONSUME_ALLOC
-
 	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
 	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
 
@@ -1008,6 +998,45 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 }
 
 /*
+ * Return the total number of errors found in the vertical stripe of @sector_nr.
+ *
+ * @faila and @failb will also be updated to the first and second stripe
+ * number of the errors.
+ */
+static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
+				     int *faila, int *failb)
+{
+	int stripe_nr;
+	int found_errors = 0;
+
+	if (faila || failb) {
+		/*
+		 * Both @faila and @failb should be valid pointers if any of
+		 * them is specified.
+		 */
+		ASSERT(faila && failb);
+		*faila = -1;
+		*failb = -1;
+	}
+
+	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
+
+		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
+			found_errors++;
+			if (faila) {
+				/* Update faila and failb.
 */
+				if (*faila < 0)
+					*faila = stripe_nr;
+				else if (*failb < 0)
+					*failb = stripe_nr;
+			}
+		}
+	}
+	return found_errors;
+}
+
+/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
@@ -1040,8 +1069,19 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
 	disk_start = stripe->physical + sector_nr * sectorsize;
 
 	/* if the device is missing, just fail this stripe */
-	if (!stripe->dev->bdev)
-		return fail_rbio_index(rbio, stripe_nr);
+	if (!stripe->dev->bdev) {
+		int found_errors;
+
+		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
+			rbio->error_bitmap);
+
+		/* Check if we have reached tolerance early. */
+		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+							 NULL, NULL);
+		if (found_errors > rbio->bioc->max_errors)
+			return -EIO;
+		return 0;
+	}
 
 	/* see if we can add this page onto our existing bio */
 	if (last) {
@@ -1073,23 +1113,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
 	return 0;
 }
 
-/*
- * while we're doing the read/modify/write cycle, we could
- * have errors in reading pages off the disk. This checks
- * for errors and if we're not able to read the page it'll
- * trigger parity reconstruction. The rmw will be finished
- * after we've reconstructed the failed stripes
- */
-static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
-{
-	if (rbio->faila >= 0 || rbio->failb >= 0) {
-		BUG_ON(rbio->faila == rbio->real_stripes - 1);
-		__raid56_parity_recover(rbio);
-	} else {
-		finish_rmw(rbio);
-	}
-}
-
 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
 {
 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
@@ -1160,109 +1183,71 @@ not_found:
 	trace_info->stripe_nr = -1;
 }
 
-/*
- * this is called from one of two situations. We either
- * have a full stripe from the higher layers, or we've read all
- * the missing bits off disk.
- *
- * This will calculate the parity and then send down any
- * changed blocks.
- */
-static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+/* Generate PQ for one vertical stripe.
 */
+static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
 {
-	struct btrfs_io_context *bioc = rbio->bioc;
-	const u32 sectorsize = bioc->fs_info->sectorsize;
 	void **pointers = rbio->finish_pointers;
-	int nr_data = rbio->nr_data;
+	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+	struct sector_ptr *sector;
+	int stripe;
+	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
+
+	/* First collect one sector from each data stripe */
+	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+		pointers[stripe] = kmap_local_page(sector->page) +
+				   sector->pgoff;
+	}
+
+	/* Then add the parity stripe */
+	sector = rbio_pstripe_sector(rbio, sectornr);
+	sector->uptodate = 1;
+	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+
+	if (has_qstripe) {
+		/*
+		 * RAID6, add the qstripe and call the library function
+		 * to fill in our p/q
+		 */
+		sector = rbio_qstripe_sector(rbio, sectornr);
+		sector->uptodate = 1;
+		pointers[stripe++] = kmap_local_page(sector->page) +
+				     sector->pgoff;
+
+		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
+					pointers);
+	} else {
+		/* raid5 */
+		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
+		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
+	}
+	for (stripe = stripe - 1; stripe >= 0; stripe--)
+		kunmap_local(pointers[stripe]);
+}
+
+static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
+				   struct bio_list *bio_list)
+{
+	struct bio *bio;
 	/* The total sector number inside the full stripe. */
 	int total_sector_nr;
-	int stripe;
-	/* Sector number inside a stripe. */
 	int sectornr;
-	bool has_qstripe;
-	struct bio_list bio_list;
-	struct bio *bio;
+	int stripe;
 	int ret;
 
-	bio_list_init(&bio_list);
-
-	if (rbio->real_stripes - rbio->nr_data == 1)
-		has_qstripe = false;
-	else if (rbio->real_stripes - rbio->nr_data == 2)
-		has_qstripe = true;
-	else
-		BUG();
+	ASSERT(bio_list_size(bio_list) == 0);
 
 	/* We should have at least one data sector. */
 	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
 
-	/* at this point we either have a full stripe,
-	 * or we've read the full stripe from the drive.
-	 * recalculate the parity and write the new results.
-	 *
-	 * We're not allowed to add any new bios to the
-	 * bio list here, anyone else that wants to
-	 * change this stripe needs to do their own rmw.
-	 */
-	spin_lock_irq(&rbio->bio_list_lock);
-	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
-	spin_unlock_irq(&rbio->bio_list_lock);
-
-	atomic_set(&rbio->error, 0);
-
 	/*
-	 * now that we've set rmw_locked, run through the
-	 * bio list one last time and map the page pointers
-	 *
-	 * We don't cache full rbios because we're assuming
-	 * the higher layers are unlikely to use this area of
-	 * the disk again soon. If they do use it again,
-	 * hopefully they will send another full bio.
+	 * Reset errors, as we may have errors inherited from degraded
+	 * write.
*/ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - } - - /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; - - if (has_qstripe) { - /* - * RAID6, add the qstripe and call the library function - * to fill in our p/q - */ - sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; - - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - for (stripe = stripe - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* - * Start writing. Make bios for everything from the higher layers (the + * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; @@ -1284,15 +1269,16 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; + goto error; } - if (likely(!bioc->num_tgtdevs)) - goto write_data; + if (likely(!rbio->bioc->num_tgtdevs)) + return 0; + /* Make a copy for the replace target device. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; @@ -1300,7 +1286,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; - if (!bioc->tgtdev_map[stripe]) { + if (!rbio->bioc->tgtdev_map[stripe]) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. @@ -1322,125 +1308,52 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, + ret = rbio_add_io_sector(rbio, bio_list, sector, rbio->bioc->tgtdev_map[stripe], sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; - } - -write_data: - atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); + goto error; } - return; -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) + return 0; +error: + while ((bio = bio_list_pop(bio_list))) bio_put(bio); + return -EIO; } -/* - * helper to find the stripe number for a given bio. 
Used to figure out which - * stripe has failed. This expects the bio to correspond to a physical disk, - * so it looks up based on physical sector numbers. - */ -static int find_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 physical = bio->bi_iter.bi_sector; - int i; - struct btrfs_io_stripe *stripe; - - physical <<= 9; - - for (i = 0; i < rbio->bioc->num_stripes; i++) { - stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && - stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { - return i; - } - } - return -1; -} - -/* - * helper to find the stripe number for a given - * bio (before mapping). Used to figure out which stripe has - * failed. This looks up based on logical block numbers. - */ -static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - int i; - - for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bioc->raid_map[i]; - - if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) - return i; - } - return -1; -} - -/* - * returns -EIO if we had too many failures - */ -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { - unsigned long flags; - int ret = 0; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + int total_nr_sector = offset >> fs_info->sectorsize_bits; - spin_lock_irqsave(&rbio->bio_list_lock, flags); + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); - /* we already know this stripe is bad, move on */ - if (rbio->faila == failed || rbio->failb == failed) - goto out; + bitmap_set(rbio->error_bitmap, total_nr_sector, + bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - if (rbio->faila == -1) { - /* first failure on this rbio */ - rbio->faila = failed; - atomic_inc(&rbio->error); - } else if (rbio->failb == -1) { - /* second failure on this rbio */ - rbio->failb = failed; - atomic_inc(&rbio->error); - } else { - ret = -EIO; + /* + * Special handling for raid56_alloc_missing_rbio() used by + * scrub/replace. Unlike call path in raid56_parity_recover(), they + * pass an empty bio here. Thus we have to find out the missing device + * and mark the stripe error instead. + */ + if (bio->bi_iter.bi_size == 0) { + bool found_missing = false; + int stripe_nr; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { + found_missing = true; + bitmap_set(rbio->error_bitmap, + stripe_nr * rbio->stripe_nsectors, + rbio->stripe_nsectors); + } + } + ASSERT(found_missing); } -out: - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); - - return ret; -} - -/* - * helper to fail a stripe based on a physical disk - * bio. 
- */ -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - int failed = find_bio_stripe(rbio, bio); - - if (failed < 0) - return -EIO; - - return fail_rbio_index(rbio, failed); } /* @@ -1488,193 +1401,163 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } -static void raid56_bio_end_io(struct bio *bio) +static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); + struct bio_vec *bv = bio_first_bvec_all(bio); + int i; - bio_put(bio); + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector; - if (atomic_dec_and_test(&rbio->stripes_pending)) - queue_work(rbio->bioc->fs_info->endio_raid56_workers, - &rbio->end_io_work); + sector = &rbio->stripe_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + sector = &rbio->bio_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + } + ASSERT(i < rbio->nr_sectors); + return i; } -/* - * End io handler for the read phase of the RMW cycle. All the bios here are - * physical stripe bios we've read from the disk so we can recalculate the - * parity of the stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_rmw_end_io_work(struct work_struct *work) +static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - rbio_orig_end_io(rbio, BLK_STS_IOERR); - return; - } + bio_for_each_segment_all(bvec, bio, iter_all) + bio_size += bvec->bv_len; - /* - * This will normally call finish_rmw to start our write but if there - * are any failed stripes we'll reconstruct from parity first. - */ - validate_rbio_for_rmw(rbio); + bitmap_set(rbio->error_bitmap, total_sector_nr, + bio_size >> rbio->bioc->fs_info->sectorsize_bits); } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +/* Verify the data sectors at read time. */ +static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, + struct bio *bio) { - int bios_to_read = 0; - struct bio_list bio_list; - const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; - int ret; - int total_sector_nr; - struct bio *bio; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + int total_sector_nr = get_bio_sector_nr(rbio, bio); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_list_init(&bio_list); + /* No data csum for the whole stripe, no need to verify. */ + if (!rbio->csum_bitmap || !rbio->csum_buf) + return; - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; + /* P/Q stripes, they have no data csum to verify against. 
*/ + if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) + return; - index_rbio_pages(rbio); + bio_for_each_segment_all(bvec, bio, iter_all) { + int bv_offset; + + for (bv_offset = bvec->bv_offset; + bv_offset < bvec->bv_offset + bvec->bv_len; + bv_offset += fs_info->sectorsize, total_sector_nr++) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + + total_sector_nr * fs_info->csum_size; + int ret; - atomic_set(&rbio->error, 0); - /* Build a list of bios to read all the missing data sectors. */ - for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; + /* No csum for this sector, skip to the next sector. */ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; - /* - * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a page - * in the bio list we don't need to read it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, + bv_offset, csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + } + } +} - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. If so, - * use it. - */ - if (sector->uptodate) - continue; +static void raid_wait_read_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; + if (bio->bi_status) { + rbio_update_error_bitmap(rbio, bio); + } else { + set_bio_pages_uptodate(rbio, bio); + verify_bio_data_sectors(rbio, bio); } - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. 
- */
-	atomic_set(&rbio->stripes_pending, bios_to_read);
-	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
-	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid56_bio_end_io;
+static void submit_read_bios(struct btrfs_raid_bio *rbio,
+			     struct bio_list *bio_list)
+{
+	struct bio *bio;
+
+	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
+	while ((bio = bio_list_pop(bio_list))) {
+		bio->bi_end_io = raid_wait_read_end_io;
 
-		if (trace_raid56_read_partial_enabled()) {
+		if (trace_raid56_scrub_read_recover_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };
 
			bio_get_trace_info(rbio, bio, &trace_info);
-			trace_raid56_read_partial(rbio, bio, &trace_info);
+			trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
 		}
 		submit_bio(bio);
 	}
-
-	/* the actual write will happen once the reads are done */
-	return 0;
+}
 
-cleanup:
-	rbio_orig_end_io(rbio, BLK_STS_IOERR);
+static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
+				  struct bio_list *bio_list)
+{
+	struct bio *bio;
+	int total_sector_nr;
+	int ret = 0;
 
-	while ((bio = bio_list_pop(&bio_list)))
-		bio_put(bio);
+	ASSERT(bio_list_size(bio_list) == 0);
 
-	return -EIO;
+	/*
+	 * Build a list of bios to read all sectors (including data and P/Q).
+	 *
+	 * This behavior is to compensate for the later csum verification and
+	 * recovery.
+	 */
+	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+	     total_sector_nr++) {
+		struct sector_ptr *sector;
+		int stripe = total_sector_nr / rbio->stripe_nsectors;
+		int sectornr = total_sector_nr % rbio->stripe_nsectors;
 
-finish:
-	validate_rbio_for_rmw(rbio);
+		sector = rbio_stripe_sector(rbio, stripe, sectornr);
+		ret = rbio_add_io_sector(rbio, bio_list, sector,
+					 stripe, sectornr, REQ_OP_READ);
+		if (ret)
+			goto cleanup;
+	}
 	return 0;
+
+cleanup:
+	while ((bio = bio_list_pop(bio_list)))
+		bio_put(bio);
+	return ret;
 }
 
-/*
- * if the upper layers pass in a full stripe, we thank them by only allocating
- * enough pages to hold the parity, and sending it all down quickly.
- */
-static int full_stripe_write(struct btrfs_raid_bio *rbio)
+static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
 {
+	const int data_pages = rbio->nr_data * rbio->stripe_npages;
 	int ret;
 
-	ret = alloc_rbio_parity_pages(rbio);
-	if (ret) {
-		__free_raid_bio(rbio);
+	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+	if (ret < 0)
 		return ret;
-	}
-
-	ret = lock_stripe_add(rbio);
-	if (ret == 0)
-		finish_rmw(rbio);
-	return 0;
-}
 
-/*
- * partial stripe writes get handed over to async helpers.
- * We're really hoping to merge a few more writes into this
- * rbio before calculating new parity
- */
-static int partial_stripe_write(struct btrfs_raid_bio *rbio)
-{
-	int ret;
-
-	ret = lock_stripe_add(rbio);
-	if (ret == 0)
-		start_async_work(rbio, rmw_work);
+	index_stripe_sectors(rbio);
 	return 0;
 }
 
 /*
- * sometimes while we were reading from the drive to
- * recalculate parity, enough new bios come into create
- * a full stripe. So we do a check here to see if we can
- * go directly to finish_rmw
- */
-static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
-{
-	/* head off into rmw land if we don't have a full stripe */
-	if (!rbio_is_full(rbio))
-		return partial_stripe_write(rbio);
-	return full_stripe_write(rbio);
-}
-
-/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.
When the unplug comes down,
@@ -1708,71 +1591,39 @@ static int plug_cmp(void *priv, const struct list_head *a,
 		return 0;
 }
 
-static void run_plug(struct btrfs_plug_cb *plug)
+static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
 {
+	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
 	struct btrfs_raid_bio *cur;
 	struct btrfs_raid_bio *last = NULL;
 
-	/*
-	 * sort our plug list then try to merge
-	 * everything we can in hopes of creating full
-	 * stripes.
-	 */
 	list_sort(NULL, &plug->rbio_list, plug_cmp);
+
 	while (!list_empty(&plug->rbio_list)) {
 		cur = list_entry(plug->rbio_list.next,
 				 struct btrfs_raid_bio, plug_list);
 		list_del_init(&cur->plug_list);
 
 		if (rbio_is_full(cur)) {
-			int ret;
-
-			/* we have a full stripe, send it down */
-			ret = full_stripe_write(cur);
-			BUG_ON(ret);
+			/* We have a full stripe, queue it down. */
+			start_async_work(cur, rmw_rbio_work);
 			continue;
 		}
 		if (last) {
 			if (rbio_can_merge(last, cur)) {
 				merge_rbio(last, cur);
-				__free_raid_bio(cur);
+				free_raid_bio(cur);
 				continue;
 			}
-			__raid56_parity_write(last);
+			start_async_work(last, rmw_rbio_work);
 		}
 		last = cur;
 	}
-	if (last) {
-		__raid56_parity_write(last);
-	}
+	if (last)
+		start_async_work(last, rmw_rbio_work);
 	kfree(plug);
 }
 
-/*
- * if the unplug comes from schedule, we have to push the
- * work off to a helper thread
- */
-static void unplug_work(struct work_struct *work)
-{
-	struct btrfs_plug_cb *plug;
-	plug = container_of(work, struct btrfs_plug_cb, work);
-	run_plug(plug);
-}
-
-static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
-{
-	struct btrfs_plug_cb *plug;
-	plug = container_of(cb, struct btrfs_plug_cb, cb);
-
-	if (from_schedule) {
-		INIT_WORK(&plug->work, unplug_work);
-		queue_work(plug->info->rmw_workers, &plug->work);
-		return;
-	}
-	run_plug(plug);
-}
-
 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
 {
@@ -1813,27 +1664,20 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
 
 	rbio = alloc_rbio(fs_info, bioc);
 	if (IS_ERR(rbio)) {
-		btrfs_put_bioc(bioc);
 		ret = PTR_ERR(rbio);
-		goto out_dec_counter;
+		goto fail;
 	}
 	rbio->operation = BTRFS_RBIO_WRITE;
 	rbio_add_bio(rbio, bio);
 
-	rbio->generic_bio_cnt = 1;
-
 	/*
-	 * don't plug on full rbios, just get them out the door
+	 * Don't plug on full rbios, just get them out the door
 	 * as quickly as we can
 	 */
-	if (rbio_is_full(rbio)) {
-		ret = full_stripe_write(rbio);
-		if (ret)
-			goto out_dec_counter;
-		return;
-	}
+	if (rbio_is_full(rbio))
+		goto queue_rbio;
 
-	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
+	cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
 	if (cb) {
 		plug = container_of(cb, struct btrfs_plug_cb, cb);
 		if (!plug->info) {
@@ -1841,282 +1685,270 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
			INIT_LIST_HEAD(&plug->rbio_list);
 		}
 		list_add_tail(&rbio->plug_list, &plug->rbio_list);
-	} else {
-		ret = __raid56_parity_write(rbio);
-		if (ret)
-			goto out_dec_counter;
+		return;
 	}
+queue_rbio:
+	/*
+	 * Either we don't have any existing plug, or we're doing a full stripe,
+	 * can queue the rmw work now.
+	 */
+	start_async_work(rbio, rmw_rbio_work);
 
 	return;
 
-out_dec_counter:
-	btrfs_bio_counter_dec(fs_info);
+fail:
 	bio->bi_status = errno_to_blk_status(ret);
 	bio_endio(bio);
 }
 
-/*
- * all parity reconstruction happens here.
We've read in everything
- * we can find from the drives and this does the heavy lifting of
- * sorting the good from the bad.
- */
-static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+static int verify_one_sector(struct btrfs_raid_bio *rbio,
+			     int stripe_nr, int sector_nr)
 {
-	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
-	int sectornr, stripe;
-	void **pointers;
-	void **unmap_array;
-	int faila = -1, failb = -1;
-	blk_status_t err;
-	int i;
+	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+	struct sector_ptr *sector;
+	u8 csum_buf[BTRFS_CSUM_SIZE];
+	u8 *csum_expected;
+	int ret;
 
-	/*
-	 * This array stores the pointer for each sector, thus it has the extra
-	 * pgoff value added from each sector
-	 */
-	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
-	if (!pointers) {
-		err = BLK_STS_RESOURCE;
-		goto cleanup_io;
-	}
+	if (!rbio->csum_bitmap || !rbio->csum_buf)
+		return 0;
 
+	/* No way to verify P/Q as they are not covered by data csum. */
+	if (stripe_nr >= rbio->nr_data)
+		return 0;
 	/*
-	 * Store copy of pointers that does not get reordered during
-	 * reconstruction so that kunmap_local works.
+	 * If we're rebuilding a read, we have to use pages from the
+	 * bio list if possible.
 	 */
-	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
-	if (!unmap_array) {
-		err = BLK_STS_RESOURCE;
-		goto cleanup_pointers;
+	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+	     rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+	} else {
+		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
 	}
 
-	faila = rbio->faila;
-	failb = rbio->failb;
+	ASSERT(sector->page);
 
-	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
-	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
-		spin_lock_irq(&rbio->bio_list_lock);
-		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
-		spin_unlock_irq(&rbio->bio_list_lock);
-	}
+	csum_expected = rbio->csum_buf +
+			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
+			fs_info->csum_size;
+	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
+				      csum_buf, csum_expected);
+	return ret;
+}
 
-	index_rbio_pages(rbio);
+/*
+ * Recover a vertical stripe specified by @sector_nr.
+ * @*pointers are the pre-allocated pointers by the caller, so we don't
+ * need to allocate/free the pointers again and again.
+ */
+static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+			    void **pointers, void **unmap_array)
+{
+	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+	struct sector_ptr *sector;
+	const u32 sectorsize = fs_info->sectorsize;
+	int found_errors;
+	int faila;
+	int failb;
+	int stripe_nr;
+	int ret = 0;
 
-	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
-		struct sector_ptr *sector;
+	/*
+	 * Now we just use bitmap to mark the horizontal stripes in
+	 * which we have data when doing parity scrub.
+	 */
+	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+	    !test_bit(sector_nr, &rbio->dbitmap))
+		return 0;
 
-		/*
-		 * Now we just use bitmap to mark the horizontal stripes in
-		 * which we have data when doing parity scrub.
-		 */
-		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
-		    !test_bit(sectornr, &rbio->dbitmap))
-			continue;
+	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
						 &failb);
+	/*
+	 * No errors in the vertical stripe, skip it. This can happen for a
+	 * recovery where only part of a stripe failed the csum check.
+ */ + if (!found_errors) + return 0; + + if (found_errors > rbio->bioc->max_errors) + return -EIO; + /* + * Setup our array of pointers with sectors from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order. + */ + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* - * Setup our array of pointers with sectors from each stripe - * - * NOTE: store a duplicate array of pointers to preserve the - * pointer order + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * If we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } - ASSERT(sector->page); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - unmap_array[stripe] = pointers[stripe]; + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } + ASSERT(sector->page); + pointers[stripe_nr] = kmap_local_page(sector->page) + + sector->pgoff; + unmap_array[stripe_nr] = pointers[stripe_nr]; + } - /* All raid6 handling here */ - if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* Single failure, rebuild from parity raid5 style */ - if (failb < 0) { - if (faila == rbio->nr_data) { - /* - * Just the P stripe has failed, without - * a bad data or Q stripe. - * TODO, we should redo the xor here. - */ - err = BLK_STS_IOERR; - goto cleanup; - } + /* All raid6 handling here */ + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { + /* Single failure, rebuild from parity raid5 style */ + if (failb < 0) { + if (faila == rbio->nr_data) /* - * a single failure in raid6 is rebuilt - * in the pstripe code below + * Just the P stripe has failed, without + * a bad data or Q stripe. + * We have nothing to do, just skip the + * recovery for this stripe. */ - goto pstripe; - } - - /* make sure our ps and qs are in order */ - if (faila > failb) - swap(faila, failb); - - /* if the q stripe is failed, do a pstripe reconstruction - * from the xors. - * If both the q stripe and the P stripe are failed, we're - * here due to a crc mismatch and we can't give them the - * data they want + goto cleanup; + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) { - err = BLK_STS_IOERR; - goto cleanup; - } + goto pstripe; + } + + /* + * If the q stripe is failed, do a pstripe reconstruction from + * the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want. + */ + if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bioc->raid_map[faila] == + RAID5_P_STRIPE) /* - * otherwise we have one bad data stripe and - * a good P stripe. raid5! + * Only P and Q are corrupted. + * We only care about data stripes recovery, + * can skip this vertical stripe. */ - goto pstripe; - } + goto cleanup; + /* + * Otherwise we have one bad data stripe and + * a good P stripe. raid5! 
+ */
+			goto pstripe;
+		}

-		if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
-			raid6_datap_recov(rbio->real_stripes,
-					  sectorsize, faila, pointers);
-		} else {
-			raid6_2data_recov(rbio->real_stripes,
-					  sectorsize, faila, failb,
-					  pointers);
-		}
+		if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+			raid6_datap_recov(rbio->real_stripes, sectorsize,
+					  faila, pointers);
 		} else {
-			void *p;
+			raid6_2data_recov(rbio->real_stripes, sectorsize,
+					  faila, failb, pointers);
+		}
+	} else {
+		void *p;

-			/* rebuild from P stripe here (raid5 or raid6) */
-			BUG_ON(failb != -1);
+		/* Rebuild from P stripe here (raid5 or raid6). */
+		ASSERT(failb == -1);
 pstripe:
-			/* Copy parity block into failed block to start with */
-			memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
+		/* Copy parity block into failed block to start with */
+		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);

-			/* rearrange the pointer array */
-			p = pointers[faila];
-			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
-				pointers[stripe] = pointers[stripe + 1];
-			pointers[rbio->nr_data - 1] = p;
+		/* Rearrange the pointer array */
+		p = pointers[faila];
+		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
+		     stripe_nr++)
+			pointers[stripe_nr] = pointers[stripe_nr + 1];
+		pointers[rbio->nr_data - 1] = p;

-			/* xor in the rest */
-			run_xor(pointers, rbio->nr_data - 1, sectorsize);
-		}
-		/* if we're doing this rebuild as part of an rmw, go through
-		 * and set all of our private rbio pages in the
-		 * failed stripes as uptodate. This way finish_rmw will
-		 * know they can be trusted. If this was a read reconstruction,
-		 * other endio functions will fiddle the uptodate bits
-		 */
-		if (rbio->operation == BTRFS_RBIO_WRITE) {
-			for (i = 0; i < rbio->stripe_nsectors; i++) {
-				if (faila != -1) {
-					sector = rbio_stripe_sector(rbio, faila, i);
-					sector->uptodate = 1;
-				}
-				if (failb != -1) {
-					sector = rbio_stripe_sector(rbio, failb, i);
-					sector->uptodate = 1;
-				}
-			}
-		}
-		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
-			kunmap_local(unmap_array[stripe]);
+		/* Xor in the rest */
+		run_xor(pointers, rbio->nr_data - 1, sectorsize);
+
+	}
+
+	/*
+	 * No matter if this is a RMW or recovery, we should have all
+	 * failed sectors repaired in the vertical stripe, thus they are now
+	 * uptodate.
+	 * Especially if we determine to cache the rbio, we need to
+	 * have at least all data sectors uptodate.
+	 *
+	 * If possible, also check if the repaired sector matches its data
+	 * checksum.
+	 */
+	if (faila >= 0) {
+		ret = verify_one_sector(rbio, faila, sector_nr);
+		if (ret < 0)
+			goto cleanup;
+
+		sector = rbio_stripe_sector(rbio, faila, sector_nr);
+		sector->uptodate = 1;
+	}
+	if (failb >= 0) {
+		ret = verify_one_sector(rbio, failb, sector_nr);
+		if (ret < 0)
+			goto cleanup;
+
+		sector = rbio_stripe_sector(rbio, failb, sector_nr);
+		sector->uptodate = 1;
 	}
-	err = BLK_STS_OK;
 cleanup:
-	kfree(unmap_array);
-cleanup_pointers:
-	kfree(pointers);
+	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
+		kunmap_local(unmap_array[stripe_nr]);
+	return ret;
+}
+
+static int recover_sectors(struct btrfs_raid_bio *rbio)
+{
+	void **pointers = NULL;
+	void **unmap_array = NULL;
+	int sectornr;
+	int ret = 0;
 
-cleanup_io:
 	/*
-	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
-	 * valid rbio which is consistent with ondisk content, thus such a
-	 * valid rbio can be cached to avoid further disk reads.
+	 * @pointers array stores the pointer for each sector.

-cleanup_io:
 /*
- * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
- * valid rbio which is consistent with ondisk content, thus such a
- * valid rbio can be cached to avoid further disk reads.
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores a copy of the pointers that does not get reordered
+ * during reconstruction, so that kunmap_local() works on the original
+ * mapping addresses.
 */
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- /*
- * - In case of two failures, where rbio->failb != -1:
- *
- * Do not cache this rbio since the above read reconstruction
- * (raid6_datap_recov() or raid6_2data_recov()) may have
- * changed some content of stripes which are not identical to
- * on-disk content any more, otherwise, a later write/recover
- * may steal stripe_pages from this rbio and end up with
- * corruptions or rebuild failures.
- *
- * - In case of single failure, where rbio->failb == -1:
- *
- * Cache this rbio iff the above read reconstruction is
- * executed without problems.
- */
- if (err == BLK_STS_OK && rbio->failb < 0)
- cache_rbio_pages(rbio);
- else
- clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }

- rbio_orig_end_io(rbio, err);
- } else if (err == BLK_STS_OK) {
- rbio->faila = -1;
- rbio->failb = -1;
+ index_rbio_pages(rbio);

- if (rbio->operation == BTRFS_RBIO_WRITE)
- finish_rmw(rbio);
- else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
- finish_parity_scrub(rbio, 0);
- else
- BUG();
- } else {
- rbio_orig_end_io(rbio, err);
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
+ if (ret < 0)
+ break;
 }
-}

-/*
- * This is called only for stripes we've read from disk to reconstruct the
- * parity.
- */
-static void raid_recover_end_io_work(struct work_struct *work)
-{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
-
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
- else
- __raid_recover_end_io(rbio);
+out:
+ kfree(pointers);
+ kfree(unmap_array);
+ return ret;
 }

-/*
- * reads everything we need off the disk to reconstruct
- * the parity. endio handlers trigger final reconstruction
- * when the IO is done.
- *
- * This is used both for reads from the higher layers and for
- * parity construction required to finish a rmw cycle.
- */
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
 {
- int bios_to_read = 0;
- struct bio_list bio_list;
- int ret;
- int total_sector_nr;
 struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;

- bio_list_init(&bio_list);
-
- ret = alloc_rbio_pages(rbio);
- if (ret)
- goto cleanup;
-
- atomic_set(&rbio->error, 0);
-
+ ASSERT(bio_list_size(bio_list) == 0);
 /*
 * Read everything that hasn't failed. However this time we will
 * not trust any cached sector.
@@ -2131,64 +1963,139 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 int sectornr = total_sector_nr % rbio->stripe_nsectors;
 struct sector_ptr *sector;

- if (rbio->faila == stripe || rbio->failb == stripe) {
- atomic_inc(&rbio->error);
- /* Skip the current stripe. */
- ASSERT(sectornr == 0);
- total_sector_nr += rbio->stripe_nsectors - 1;
+ /*
+ * Skip any range which has an error. It can be a range which is
+ * marked as error (for a csum mismatch), or it can be a missing
+ * device.
+ */
+ if (!rbio->bioc->stripes[stripe].dev->bdev ||
+ test_bit(total_sector_nr, rbio->error_bitmap)) {
+ /*
+ * Also set the error bit for a missing device, which
+ * may not yet have its error bit set.
+ */
+ set_bit(total_sector_nr, rbio->error_bitmap);
 continue;
 }
+
 sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
 sectornr, REQ_OP_READ);
 if (ret < 0)
- goto cleanup;
+ goto error;
 }
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);

- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * we might have no bios to read just because the pages
- * were up to date, or we might have no bios to read because
- * the devices were gone.
- */
- if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
- __raid_recover_end_io(rbio);
- return 0;
- } else {
- goto cleanup;
- }
- }
+ return -EIO;
+}
+
+static int recover_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;

 /*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
+ * Either we're doing recovery for a read failure or a degraded write;
+ * the caller should have set the error bitmap correctly.
 */
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
+ bio_list_init(&bio_list);

- if (trace_raid56_scrub_read_recover_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
+ /* For recovery, we need to read all sectors including P/Q. */
+ ret = alloc_rbio_pages(rbio);
+ if (ret < 0)
+ goto out;

- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
- }
- submit_bio(bio);
- }
+ index_rbio_pages(rbio);

- return 0;
+ ret = recover_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto out;

-cleanup:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ ret = recover_sectors(rbio);
+
+out:
 while ((bio = bio_list_pop(&bio_list)))
 bio_put(bio);

- return -EIO;
+ return ret;
+}
+
+static void recover_rbio_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0) {
+ ret = recover_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ }
+}
+
+static void recover_rbio_work_locked(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+
+ ret = recover_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+}
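
Both recover_assemble_read_bios() above and the helper that follows index the new error_bitmap stripe-major: the bit for a sector is (stripe_nr * stripe_nsectors + sector_nr). A self-contained sketch of that layout and of counting the errors in one vertical stripe (the geometry constants are made up, and the kernel uses the bitmap helpers rather than raw shifts):

    #include <stdio.h>

    #define REAL_STRIPES    4   /* 2 data stripes + P + Q */
    #define STRIPE_NSECTORS 8

    static unsigned long error_bitmap; /* holds 4 * 8 = 32 sector bits */

    static void set_error(int stripe_nr, int sector_nr)
    {
        error_bitmap |= 1UL << (stripe_nr * STRIPE_NSECTORS + sector_nr);
    }

    /* Count errors in one vertical stripe, like the kernel helper does. */
    static int vertical_errors(int sector_nr)
    {
        int errors = 0;

        for (int stripe_nr = 0; stripe_nr < REAL_STRIPES; stripe_nr++)
            if (error_bitmap &
                (1UL << (stripe_nr * STRIPE_NSECTORS + sector_nr)))
                errors++;
        return errors;
    }

    int main(void)
    {
        set_error(0, 3);    /* data stripe 0, sector 3 */
        set_error(2, 3);    /* P stripe, sector 3 */

        printf("vertical stripe 3 has %d errors\n", vertical_errors(3));
        printf("vertical stripe 0 has %d errors\n", vertical_errors(0));
        return 0;
    }
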
+
+static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
+{
+ bool found = false;
+ int sector_nr;
+
+ /*
+ * This is for RAID6 extra recovery tries, thus the mirror number
+ * should be larger than 2.
+ * Mirror 1 means read from data stripes. Mirror 2 means rebuild
+ * using RAID5 methods.
+ */
+ ASSERT(mirror_num > 2);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+ int faila;
+ int failb;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ /* This vertical stripe doesn't have errors. */
+ if (!found_errors)
+ continue;
+
+ /*
+ * If we found errors, there should be only one error marked
+ * by the previous set_rbio_range_error().
+ */
+ ASSERT(found_errors == 1);
+ found = true;
+
+ /* Now select another stripe to mark as error. */
+ failb = rbio->real_stripes - (mirror_num - 1);
+ if (failb <= faila)
+ failb--;
+
+ /* Set the extra bit in the error bitmap. */
+ if (failb >= 0)
+ set_bit(failb * rbio->stripe_nsectors + sector_nr,
+ rbio->error_bitmap);
+ }
+
+ /* We should have found at least one vertical stripe with errors. */
+ ASSERT(found);
 }

 /*
@@ -2198,88 +2105,292 @@ cleanup:
 * of the drive.
 */
 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- int mirror_num, bool generic_io)
+ int mirror_num)
 {
 struct btrfs_fs_info *fs_info = bioc->fs_info;
 struct btrfs_raid_bio *rbio;

- if (generic_io) {
- ASSERT(bioc->mirror_num == mirror_num);
- btrfs_bio(bio)->mirror_num = mirror_num;
- } else {
- btrfs_get_bioc(bioc);
- }
-
 rbio = alloc_rbio(fs_info, bioc);
 if (IS_ERR(rbio)) {
 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
- goto out_end_bio;
+ bio_endio(bio);
+ return;
 }

 rbio->operation = BTRFS_RBIO_READ_REBUILD;
 rbio_add_bio(rbio, bio);

- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
- btrfs_warn(fs_info,
-"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
- __func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bioc->map_type);
- kfree(rbio);
- bio->bi_status = BLK_STS_IOERR;
- goto out_end_bio;
- }
-
- if (generic_io)
- rbio->generic_bio_cnt = 1;
+ set_rbio_range_error(rbio, bio);

 /*
 * Loop retry:
 * for 'mirror == 2', reconstruct from all other stripes.
 * for 'mirror_num > 2', select a stripe to fail on every retry.
 */
- if (mirror_num > 2) {
+ if (mirror_num > 2)
+ set_rbio_raid6_extra_error(rbio, mirror_num);
+
+ start_async_work(rbio, recover_rbio_work);
+}
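
For the 'mirror_num > 2' retries above, set_rbio_raid6_extra_error() picks one extra stripe to treat as failed on each retry: mirror 3 marks the P stripe, and higher mirror numbers walk back through the data stripes, skipping the stripe that already failed. A small sketch of just that mapping (the geometry values are illustrative):

    #include <stdio.h>

    int main(void)
    {
        const int real_stripes = 6; /* 4 data stripes + P + Q */
        const int faila = 2;        /* stripe that already failed */

        /*
         * mirror 3 -> P stripe (index real_stripes - 2); each further
         * retry moves one stripe back, stepping over faila.
         */
        for (int mirror_num = 3; mirror_num <= real_stripes; mirror_num++) {
            int failb = real_stripes - (mirror_num - 1);

            if (failb <= faila)
                failb--;
            printf("mirror %d -> extra failed stripe %d\n",
                   mirror_num, failb);
        }
        return 0;
    }
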
+
+static void fill_data_csums(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
+ rbio->bioc->raid_map[0]);
+ const u64 start = rbio->bioc->raid_map[0];
+ const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
+ fs_info->sectorsize_bits;
+ int ret;
+
+ /* The rbio should not have its csum buffer initialized. */
+ ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
+
+ /*
+ * Skip the csum search if:
+ *
+ * - The rbio doesn't belong to data block groups
+ * Then we are doing IO for tree blocks, no need to search csums.
+ *
+ * - The rbio belongs to mixed block groups
+ * This is to avoid a deadlock: we're already holding the full
+ * stripe lock, and if we trigger a metadata read that itself
+ * needs to do raid56 recovery, we will deadlock.
+ */
+ if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
+ rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
+ return;
+
+ rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
+ fs_info->csum_size, GFP_NOFS);
+ rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
+ GFP_NOFS);
+ if (!rbio->csum_buf || !rbio->csum_bitmap) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
+ rbio->csum_buf, rbio->csum_bitmap);
+ if (ret < 0)
+ goto error;
+ if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
+ goto no_csum;
+ return;
+
+error:
+ /*
+ * We failed to allocate memory or grab the csum, but it's not fatal,
+ * we can still continue. But it's better to warn users that RMW is no
+ * longer safe for this particular sub-stripe write.
+ */
+ btrfs_warn_rl(fs_info,
+"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
+ rbio->bioc->raid_map[0], ret);
+no_csum:
+ kfree(rbio->csum_buf);
+ bitmap_free(rbio->csum_bitmap);
+ rbio->csum_buf = NULL;
+ rbio->csum_bitmap = NULL;
+}
+
+static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ /*
+ * Fill the data csums we need for data verification. We need to fill
+ * the csum_bitmap/csum_buf first, as our endio function will try to
+ * verify the data sectors.
+ */
+ fill_data_csums(rbio);
+
+ ret = rmw_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto out;
+
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /*
+ * We may or may not have any corrupted sectors (including missing dev
+ * and csum mismatch), just let recover_sectors() handle them all.
+ */
+ ret = recover_sectors(rbio);
+ return ret;
+out:
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
+ return ret;
+}
+
+static void raid_wait_write_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+ blk_status_t err = bio->bi_status;
+
+ if (err)
+ rbio_update_error_bitmap(rbio, bio);
+ bio_put(bio);
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ wake_up(&rbio->io_wait);
+}
+
+static void submit_write_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+
+ atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
+ while ((bio = bio_list_pop(bio_list))) {
+ bio->bi_end_io = raid_wait_write_end_io;
+
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+}
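
submit_write_bios() and raid_wait_write_end_io() above form a simple fan-out/fan-in: the submitter preloads stripes_pending with the number of in-flight bios, each completion decrements it, and the final decrement wakes the waiter sleeping on io_wait. The same pattern in portable user-space form (pthreads stand in for bios and the kernel wait queue; nothing here is kernel API):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_BIOS 4

    static atomic_int stripes_pending;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t io_wait = PTHREAD_COND_INITIALIZER;

    static void *bio_end_io(void *arg)
    {
        (void)arg;
        /* The last completion wakes the waiter, as in the end_io handler. */
        if (atomic_fetch_sub(&stripes_pending, 1) == 1) {
            pthread_mutex_lock(&lock);
            pthread_cond_signal(&io_wait);
            pthread_mutex_unlock(&lock);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t bios[NR_BIOS];

        atomic_store(&stripes_pending, NR_BIOS);
        for (int i = 0; i < NR_BIOS; i++)
            pthread_create(&bios[i], NULL, bio_end_io, NULL);

        /* Equivalent of wait_event(io_wait, stripes_pending == 0). */
        pthread_mutex_lock(&lock);
        while (atomic_load(&stripes_pending) != 0)
            pthread_cond_wait(&io_wait, &lock);
        pthread_mutex_unlock(&lock);

        for (int i = 0; i < NR_BIOS; i++)
            pthread_join(bios[i], NULL);
        printf("all writes completed\n");
        return 0;
    }
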
+
+/*
+ * Determine if we need to read any sector from the disk.
+ * Should only be used in the RMW path, to skip a cached rbio.
+ */
+static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
 /*
- * 'mirror == 3' is to fail the p stripe and
- * reconstruct from the q stripe. 'mirror > 3' is to
- * fail a data stripe and reconstruct from p+q stripe.
+ * We have a sector which doesn't have its page allocated or
+ * is not uptodate, thus this rbio can not be a cached one,
+ * as a cached one must have all its data sectors present
+ * and uptodate.
 */
- rbio->failb = rbio->real_stripes - (mirror_num - 1);
- ASSERT(rbio->failb > 0);
- if (rbio->failb <= rbio->faila)
- rbio->failb--;
+ if (!sector->page || !sector->uptodate)
+ return true;
 }
+ return false;
+}

- if (lock_stripe_add(rbio))
- return;
+static int rmw_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ int sectornr;
+ int ret = 0;

 /*
- * This adds our rbio to the list of rbios that will be handled after
- * the current lock owner is done.
+ * Allocate the pages for parity first, as P/Q pages will always be
+ * needed for both full-stripe and sub-stripe writes.
 */
- __raid56_parity_recover(rbio);
- return;
+ ret = alloc_rbio_parity_pages(rbio);
+ if (ret < 0)
+ return ret;

-out_end_bio:
- btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(bioc);
- bio_endio(bio);
+ /*
+ * Either this is a full stripe write, or we have every data sector
+ * already cached, so we can go to the write path immediately.
+ */
+ if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio))
+ goto write;
+
+ /*
+ * Now we're doing a sub-stripe write, and we also need all the data
+ * stripes to do the full RMW.
+ */
+ ret = alloc_rbio_data_pages(rbio);
+ if (ret < 0)
+ return ret;
+
+ index_rbio_pages(rbio);
+
+ ret = rmw_read_wait_recover(rbio);
+ if (ret < 0)
+ return ret;
+
+write:
+ /*
+ * At this stage we're not allowed to add any new bios to the
+ * bio list any more; anyone else that wants to change this stripe
+ * needs to do their own RMW.
+ */
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
+
+ index_rbio_pages(rbio);
+
+ /*
+ * We don't cache full rbios because we're assuming
+ * the higher layers are unlikely to use this area of
+ * the disk again soon. If they do use it again,
+ * hopefully they will send another full bio.
+ */
+ if (!rbio_is_full(rbio))
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
+ generate_pq_vertical(rbio, sectornr);
+
+ bio_list_init(&bio_list);
+ ret = rmw_assemble_write_bios(rbio, &bio_list);
+ if (ret < 0)
+ return ret;
+
+ /* We should have at least one bio assembled. */
+ ASSERT(bio_list_size(&bio_list));
+ submit_write_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /* We may have more errors than our tolerance during the read. 
*/ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } + return ret; } -static void rmw_work(struct work_struct *work) +static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_rmw_stripe(rbio); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } } -static void read_rebuild_work(struct work_struct *work) +static void rmw_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); + + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } /* @@ -2326,13 +2437,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, ASSERT(i < rbio->real_stripes); bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); - - /* - * We have already increased bio_counter when getting bioc, record it - * so we can free it at rbio_orig_end_io(). - */ - rbio->generic_bio_cnt = 1; - return rbio; } @@ -2381,8 +2485,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2425,7 +2528,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) - goto cleanup; + return -ENOMEM; p_sector.pgoff = 0; p_sector.uptodate = 1; @@ -2435,14 +2538,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!q_sector.page) { __free_page(p_sector.page); p_sector.page = NULL; - goto cleanup; + return -ENOMEM; } q_sector.pgoff = 0; q_sector.uptodate = 1; pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } - atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); @@ -2522,33 +2625,13 @@ writeback: } submit_write: - nr_data = bio_list_size(&bio_list); - if (!nr_data) { - /* Every parity is right */ - rbio_orig_end_io(rbio, BLK_STS_OK); - return; - } - - atomic_set(&rbio->stripes_pending, nr_data); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_scrub_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; + submit_write_bios(rbio, &bio_list); + return 0; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); + return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2558,102 +2641,99 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) return 0; } -/* - * While we're doing the parity check and repair, we could have errors - * in reading pages off the disk. This checks for errors and if we're - * not able to read the page it'll trigger parity reconstruction. 
The
- * parity scrub will be finished after we've reconstructed the failed
- * stripes
- */
-static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
 {
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- goto cleanup;
+ void **pointers = NULL;
+ void **unmap_array = NULL;
+ int sector_nr;
+ int ret;

- if (rbio->faila >= 0 || rbio->failb >= 0) {
+ /*
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores a copy of the pointers that does not get
+ * reordered during reconstruction, so that kunmap_local() works.
+ */
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
 int dfail = 0, failp = -1;
+ int faila;
+ int failb;
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ goto out;
+ }
+ if (found_errors == 0)
+ continue;

- if (is_data_stripe(rbio, rbio->faila))
- dfail++;
- else if (is_parity_stripe(rbio->faila))
- failp = rbio->faila;
+ /* We should have at least one error here. */
+ ASSERT(faila >= 0 || failb >= 0);

- if (is_data_stripe(rbio, rbio->failb))
+ if (is_data_stripe(rbio, faila))
 dfail++;
- else if (is_parity_stripe(rbio->failb))
- failp = rbio->failb;
+ else if (is_parity_stripe(faila))
+ failp = faila;
+ if (is_data_stripe(rbio, failb))
+ dfail++;
+ else if (is_parity_stripe(failb))
+ failp = failb;
 /*
- * Because we can not use a scrubbing parity to repair
- * the data, so the capability of the repair is declined.
- * (In the case of RAID5, we can not repair anything)
+ * Because we can not use a scrubbing parity to repair the
+ * data, the repair capability is reduced. (In the case of
+ * RAID5, we can not repair anything.)
 */
- if (dfail > rbio->bioc->max_errors - 1)
- goto cleanup;
-
+ if (dfail > rbio->bioc->max_errors - 1) {
+ ret = -EIO;
+ goto out;
+ }
 /*
- * If all data is good, only parity is correctly, just
- * repair the parity.
+ * If all data is good and only the parity is corrupted, just
+ * repair the parity, no need to recover the data stripes.
 */
- if (dfail == 0) {
- finish_parity_scrub(rbio, 0);
- return;
- }
+ if (dfail == 0)
+ continue;

 /*
 * Here means we got one corrupted data stripe and one
- * corrupted parity on RAID6, if the corrupted parity
- * is scrubbing parity, luckily, use the other one to repair
- * the data, or we can not repair the data stripe.
+ * corrupted parity on RAID6. If the corrupted parity is the
+ * scrubbing parity, luckily we can use the other one to repair
+ * the data; otherwise we can not repair the data stripe.
 */
- if (failp != rbio->scrubp)
- goto cleanup;
+ if (failp != rbio->scrubp) {
+ ret = -EIO;
+ goto out;
+ }

- __raid_recover_end_io(rbio);
- } else {
- finish_parity_scrub(rbio, 1);
+ ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
+ if (ret < 0)
+ goto out;
 }
- return;
-
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-}
-
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe. 
- * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_parity_scrub_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - /* - * This will normally call finish_rmw to start our write, but if there - * are any failed stripes we'll reconstruct from parity first - */ - validate_rbio_for_parity_scrub(rbio); +out: + kfree(pointers); + kfree(unmap_array); + return ret; } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_essential_pages(rbio); - if (ret) - goto cleanup; + ASSERT(bio_list_size(bio_list) == 0); - atomic_set(&rbio->error, 0); /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2682,67 +2762,84 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; +} - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } +static int scrub_rbio(struct btrfs_raid_bio *rbio) +{ + bool need_check = false; + struct bio_list bio_list; + int sector_nr; + int ret; + struct bio *bio; - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; + bio_list_init(&bio_list); - if (trace_raid56_scrub_read_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read(rbio, bio, &trace_info); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + ret = scrub_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We may have some failures, recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) + goto cleanup; + + /* + * We have every sector properly prepared. Can finish the scrub + * and writeback the good content. 
+ */
+ ret = finish_parity_scrub(rbio, need_check);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ break;
 }
- submit_bio(bio);
 }
- /* the actual write will happen once the reads are done */
- return;
+ return ret;

cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
 while ((bio = bio_list_pop(&bio_list)))
 bio_put(bio);

- return;
-
-finish:
- validate_rbio_for_parity_scrub(rbio);
+ return ret;
 }

-static void scrub_parity_work(struct work_struct *work)
+static void scrub_rbio_work_locked(struct work_struct *work)
 {
 struct btrfs_raid_bio *rbio;
+ int ret;

 rbio = container_of(work, struct btrfs_raid_bio, work);
- raid56_parity_scrub_stripe(rbio);
+ ret = scrub_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
 }

 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
 {
 if (!lock_stripe_add(rbio))
- start_async_work(rbio, scrub_parity_work);
+ start_async_work(rbio, scrub_rbio_work_locked);
 }

 /* The following code is used for dev replace of a missing RAID 5/6 device. */
@@ -2765,24 +2862,12 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
 */
 ASSERT(!bio->bi_iter.bi_size);

- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
- BUG();
- kfree(rbio);
- return NULL;
- }
-
- /*
- * When we get bioc, we have already increased bio_counter, record it
- * so we can free it at rbio_orig_end_io()
- */
- rbio->generic_bio_cnt = 1;
+ set_rbio_range_error(rbio, bio);

 return rbio;
 }

 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
 {
- if (!lock_stripe_add(rbio))
- start_async_work(rbio, read_rebuild_work);
+ start_async_work(rbio, recover_rbio_work);
 }
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 6f48f9e4c869..7c73a443939e 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -74,12 +74,6 @@ struct btrfs_raid_bio {
 /* How many sectors there are for each stripe */
 u8 stripe_nsectors;

- /* First bad stripe, -1 means no corruption */
- s8 faila;
-
- /* Second bad stripe (for RAID6 use) */
- s8 failb;
-
 /* Stripe number that we're scrubbing */
 u8 scrubp;

@@ -89,15 +83,11 @@ struct btrfs_raid_bio {
 */
 int bio_list_bytes;

- int generic_bio_cnt;
-
 refcount_t refs;

 atomic_t stripes_pending;

- atomic_t error;
-
- struct work_struct end_io_work;
+ wait_queue_head_t io_wait;

 /* Bitmap to record which horizontal stripe has data */
 unsigned long dbitmap;
@@ -128,6 +118,29 @@ struct btrfs_raid_bio {

 /* Allocated with real_stripes-many pointers for finish_*() calls */
 void **finish_pointers;
+
+ /*
+ * The bitmap recording where IO errors happened.
+ * Each bit corresponds to one sector in either the bio_sectors[] or
+ * the stripe_sectors[] array.
+ *
+ * The reason we don't use another bit in sector_ptr is that we have
+ * two arrays of sectors, and a lot of IO can use sectors in both
+ * arrays, which makes it much harder to iterate.
+ */
+ unsigned long *error_bitmap;
+
+ /*
+ * Checksum buffer if the rbio is for data. The buffer should cover
+ * all data sectors (excluding P/Q sectors).
+ */
+ u8 *csum_buf;
+
+ /*
+ * Each bit records whether the corresponding sector has a data csum
+ * found.
+ * Should only cover data sectors (excluding P/Q sectors). 
+ */ + unsigned long *csum_bitmap; }; /* @@ -166,7 +179,7 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_device; void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - int mirror_num, bool generic_io); + int mirror_num); void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c1a617eb25d..5c2b66d155ef 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) (len * sizeof(char)), mask); if (!ret) return ret; - strncpy(ret->str, src, len); + /* Warn if the source got unexpectedly truncated. */ + if (WARN_ON(strscpy(ret->str, src, len) < 0)) { + kfree(ret); + return NULL; + } return ret; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index a248f46cfe72..95d28497de7c 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -5,11 +5,14 @@ #include <linux/sched.h> #include <linux/stacktrace.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" #include "delayed-ref.h" #include "ref-verify.h" +#include "fs.h" +#include "accessors.h" /* * Used to keep track the roots and number of refs each root has for a given diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9acf47b11fe6..0474bbe39da7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -2,13 +2,19 @@ #include <linux/blkdev.h> #include <linux/iversion.h> -#include "compression.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "compression.h" #include "delalloc-space.h" #include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "super.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M @@ -92,7 +98,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, NULL); + NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -318,16 +324,16 @@ copy_to_page: goto out; } -/** - * btrfs_clone() - clone a range from inode file to another +/* + * Clone a range from inode file to another. 
* - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, @@ -615,8 +621,8 @@ out: static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL); } static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, @@ -634,8 +640,8 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, swap(range1_end, range2_end); } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end); + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL); btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); @@ -887,7 +893,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { lock_two_nondirectories(src_inode, dst_inode); btrfs_double_mmap_lock(src_inode, dst_inode); @@ -905,7 +911,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, out_unlock: if (same_inode) { - btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 45c02aba2492..31ec4a7658ce 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -27,6 +27,15 @@ #include "subpage.h" #include "zoned.h" #include "inode-item.h" +#include "space-info.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "file-item.h" +#include "relocation.h" +#include "super.h" +#include "tree-checker.h" /* * Relocation overview @@ -470,7 +479,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( int ret; int err = 0; - iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS); + iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); @@ -1109,10 +1118,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, inode = find_next_inode(root, key.objectid); first = 0; } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); inode = 
find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { + struct extent_state *cached_state = NULL; + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, @@ -1120,14 +1131,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); + key.offset, end, + &cached_state); if (!ret) continue; - btrfs_drop_extent_cache(BTRFS_I(inode), - key.offset, end, 1); + btrfs_drop_extent_map_range(BTRFS_I(inode), + key.offset, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); + key.offset, end, &cached_state); } } @@ -1170,7 +1182,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (dirty) btrfs_mark_buffer_dirty(leaf); if (inode) - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } @@ -1516,6 +1528,8 @@ static int invalidate_extent_cache(struct btrfs_root *root, objectid = min_key->objectid; while (1) { + struct extent_state *cached_state = NULL; + cond_resched(); iput(inode); @@ -1566,9 +1580,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); } return 0; } @@ -2597,10 +2611,14 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct tree_block *block) { + struct btrfs_tree_parent_check check = { + .level = block->level, + .owner_root = block->owner, + .transid = block->key.offset + }; struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, block->owner, - block->key.offset, block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { @@ -2861,25 +2879,27 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; - btrfs_inode_lock(&inode->vfs_inode, 0); + btrfs_inode_lock(inode, 0); for (nr = 0; nr < cluster->nr; nr++) { + struct extent_state *cached_state = NULL; + start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) end = cluster->boundary[nr + 1] - 1 - offset; else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end); + lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } - btrfs_inode_unlock(&inode->vfs_inode, 0); + btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, @@ -2890,8 +2910,8 @@ static noinline_for_stack int prealloc_file_extent_cluster( static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, u64 start, u64 end, u64 block_start) { - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; + struct extent_state *cached_state = NULL; int ret 
= 0; em = alloc_extent_map(); @@ -2904,18 +2924,11 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->block_start = block_start; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); - } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + free_extent_map(em); + return ret; } @@ -2991,6 +3004,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, */ cur = max(page_start, cluster->boundary[*cluster_nr] - offset); while (cur <= page_end) { + struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; @@ -3006,13 +3020,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, - clamped_end, 0, NULL); + clamped_end, 0, &cached_state); if (ret) { - clear_extent_bits(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY); + clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -3039,7 +3055,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, boundary_start, boundary_end, EXTENT_BOUNDARY); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; @@ -3396,24 +3413,28 @@ int add_data_references(struct reloc_control *rc, struct btrfs_path *path, struct rb_root *blocks) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct ulist *leaves = NULL; + struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; struct ulist_node *ref_node = NULL; - const u32 blocksize = fs_info->nodesize; + const u32 blocksize = rc->extent_root->fs_info->nodesize; int ret = 0; btrfs_release_path(path); - ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, - 0, &leaves, NULL, true); + + ctx.bytenr = extent_key->objectid; + ctx.ignore_extent_item_pos = true; + ctx.fs_info = rc->extent_root->fs_info; + + ret = btrfs_find_all_leafs(&ctx); if (ret < 0) return ret; ULIST_ITER_INIT(&leaf_uiter); - while ((ref_node = ulist_next(leaves, &leaf_uiter))) { + while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); + eb = read_tree_block(ctx.fs_info, ref_node->val, &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; @@ -3429,7 +3450,7 @@ int 
add_data_references(struct reloc_control *rc, } if (ret < 0) free_block_list(blocks); - ulist_free(leaves); + ulist_free(ctx.refs); return ret; } @@ -3913,8 +3934,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->dirty_subvol_roots); btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(fs_info, &rc->processed_blocks, - IO_TREE_RELOC_BLOCKS, NULL); + extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -4338,8 +4358,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + inode->index_cnt; csum_root = btrfs_csum_root(fs_info, disk_bytenr); - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + len - 1, &list, 0, false); if (ret) goto out; diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h new file mode 100644 index 000000000000..2041a86186de --- /dev/null +++ b/fs/btrfs/relocation.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_RELOCATION_H +#define BTRFS_RELOCATION_H + +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); +int btrfs_should_ignore_reloc_root(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index d647cb2938c0..859874579456 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -6,11 +6,16 @@ #include <linux/err.h> #include <linux/uuid.h> #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "qgroup.h" #include "space-info.h" +#include "accessors.h" +#include "root-tree.h" +#include "orphan.h" /* * Read a root item from the tree. 
In case we detect a root item smaller then @@ -327,9 +332,8 @@ out: } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len) - + u64 ref_id, u64 dirid, u64 *sequence, + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; @@ -337,7 +341,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, struct extent_buffer *leaf; struct btrfs_key key; unsigned long ptr; - int err = 0; int ret; path = btrfs_alloc_path(); @@ -350,7 +353,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) { - err = ret; goto out; } else if (ret == 0) { leaf = path->nodes[0]; @@ -358,20 +360,20 @@ again: struct btrfs_root_ref); ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || - (btrfs_root_ref_name_len(leaf, ref) != name_len) || - memcmp_extent_buffer(leaf, name, ptr, name_len)) { - err = -ENOENT; + (btrfs_root_ref_name_len(leaf, ref) != name->len) || + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { + ret = -ENOENT; goto out; } *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); - if (ret) { - err = ret; + if (ret) goto out; - } - } else - err = -ENOENT; + } else { + ret = -ENOENT; + goto out; + } if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); @@ -383,7 +385,7 @@ again: out: btrfs_free_path(path); - return err; + return ret; } /* @@ -402,8 +404,8 @@ out: * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len) + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; @@ -422,7 +424,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, - sizeof(*ref) + name_len); + sizeof(*ref) + name->len); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); @@ -433,9 +435,9 @@ again: ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); btrfs_set_root_ref_dirid(leaf, ref, dirid); btrfs_set_root_ref_sequence(leaf, ref, sequence); - btrfs_set_root_ref_name_len(leaf, ref, name_len); + btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); - write_extent_buffer(leaf, name, ptr, name_len); + write_extent_buffer(leaf, name->name, ptr, name->len); btrfs_mark_buffer_dirty(leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h new file mode 100644 index 000000000000..cbbaca32126e --- /dev/null +++ b/fs/btrfs/root-tree.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ROOT_TREE_H +#define BTRFS_ROOT_TREE_H + +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, bool use_global_rsv); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 *sequence, + const 
struct fscrypt_str *name); +int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_root_item *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3afe5fa50a63..52b346795f66 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,13 @@ #include "extent_io.h" #include "dev-replace.h" #include "check-integrity.h" -#include "rcu-string.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "scrub.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -54,6 +57,19 @@ struct scrub_ctx; */ #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) +#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) + +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1C4 with 4 copies. + */ +#define BTRFS_MAX_MIRRORS (4 + 1) + struct scrub_recover { refcount_t refs; struct btrfs_io_context *bioc; @@ -62,16 +78,12 @@ struct scrub_recover { struct scrub_sector { struct scrub_block *sblock; - struct page *page; - struct btrfs_device *dev; struct list_head list; u64 flags; /* extent flags */ u64 generation; - u64 logical; - u64 physical; - u64 physical_for_dev_replace; + /* Offset in bytes to @sblock. */ + u32 offset; atomic_t refs; - u8 mirror_num; unsigned int have_csum:1; unsigned int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; @@ -94,8 +106,22 @@ struct scrub_bio { }; struct scrub_block { + /* + * Each page will have its page::private used to record the logical + * bytenr. 
+ */
+ struct page *pages[SCRUB_MAX_PAGES];
 struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+ struct btrfs_device *dev;
+ /* Logical bytenr of the sblock */
+ u64 logical;
+ u64 physical;
+ u64 physical_for_dev_replace;
+ /* Length of sblock in bytes */
+ u32 len;
 int sector_count;
+ int mirror_num;
+
+ atomic_t outstanding_sectors;
 refcount_t refs; /* free mem on transition to zero */
 struct scrub_ctx *sctx;
@@ -202,8 +228,174 @@ struct full_stripe_lock {
 struct mutex mutex;
 };

+#ifndef CONFIG_64BIT
+/* This structure is for architectures whose (void *) is smaller than u64 */
+struct scrub_page_private {
+ u64 logical;
+};
+#endif
+
+static int attach_scrub_page_private(struct page *page, u64 logical)
+{
+#ifdef CONFIG_64BIT
+ attach_page_private(page, (void *)logical);
+ return 0;
+#else
+ struct scrub_page_private *spp;
+
+ spp = kmalloc(sizeof(*spp), GFP_KERNEL);
+ if (!spp)
+ return -ENOMEM;
+ spp->logical = logical;
+ attach_page_private(page, (void *)spp);
+ return 0;
+#endif
+}
+
+static void detach_scrub_page_private(struct page *page)
+{
+#ifdef CONFIG_64BIT
+ detach_page_private(page);
+ return;
+#else
+ struct scrub_page_private *spp;
+
+ spp = detach_page_private(page);
+ kfree(spp);
+ return;
+#endif
+}
+
+static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
+ struct btrfs_device *dev,
+ u64 logical, u64 physical,
+ u64 physical_for_dev_replace,
+ int mirror_num)
+{
+ struct scrub_block *sblock;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ if (!sblock)
+ return NULL;
+ refcount_set(&sblock->refs, 1);
+ sblock->sctx = sctx;
+ sblock->logical = logical;
+ sblock->physical = physical;
+ sblock->physical_for_dev_replace = physical_for_dev_replace;
+ sblock->dev = dev;
+ sblock->mirror_num = mirror_num;
+ sblock->no_io_error_seen = 1;
+ /*
+ * Scrub_block::pages will be allocated at alloc_scrub_sector() when
+ * the corresponding page is not allocated.
+ */
+ return sblock;
+}
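
alloc_scrub_block() leaves the pages array empty; alloc_scrub_sector() below fills the slots on demand, derives a sector's page index and in-page offset purely from its byte offset into the block, and parks the logical bytenr in page::private (directly on 64-bit, boxed on 32-bit). A user-space sketch of both tricks (fake_page, the 2 KiB sector size and all constants are invented for the example):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Stand-in for struct page with a pointer-sized private field. */
    struct fake_page {
        void *private;
    };

    /* On 64-bit, store the value directly in the pointer; else box it. */
    static int attach_logical(struct fake_page *page, uint64_t logical)
    {
        if (sizeof(void *) >= sizeof(uint64_t)) {
            page->private = (void *)(uintptr_t)logical;
            return 0;
        }
        uint64_t *boxed = malloc(sizeof(*boxed));

        if (!boxed)
            return -1;
        *boxed = logical;
        page->private = boxed;
        return 0;
    }

    static uint64_t read_logical(const struct fake_page *page)
    {
        if (sizeof(void *) >= sizeof(uint64_t))
            return (uint64_t)(uintptr_t)page->private;
        return *(const uint64_t *)page->private;
    }

    int main(void)
    {
        const uint64_t block_logical = 298844160; /* arbitrary bytenr */
        const uint32_t sectorsize = 2048;         /* subpage sector size */
        struct fake_page pages[4] = { 0 };

        for (int i = 0; i < 6; i++) {
            uint64_t logical = block_logical + (uint64_t)i * sectorsize;
            uint32_t offset = (uint32_t)(logical - block_logical);
            unsigned long page_index = offset >> PAGE_SHIFT;
            unsigned long page_offset = offset & (PAGE_SIZE - 1);

            /* Attach the page-aligned logical once per page slot. */
            if (!pages[page_index].private &&
                attach_logical(&pages[page_index],
                               block_logical + (page_index << PAGE_SHIFT)))
                return 1;
            printf("sector %d: page %lu (logical %llu), offset %lu\n",
                   i, page_index,
                   (unsigned long long)read_logical(&pages[page_index]),
                   page_offset);
        }
        return 0;
    }
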
+
+/*
+ * Allocate a new scrub sector and attach it to @sblock.
+ *
+ * Will also allocate new pages for @sblock if needed.
+ */
+static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
+ u64 logical)
+{
+ const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
+ struct scrub_sector *ssector;
+
+ /* We must never have scrub_block exceed U32_MAX in size. */
+ ASSERT(logical - sblock->logical < U32_MAX);
+
+ ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
+ if (!ssector)
+ return NULL;
+
+ /* Allocate a new page if the slot is not allocated */
+ if (!sblock->pages[page_index]) {
+ int ret;
+
+ sblock->pages[page_index] = alloc_page(GFP_KERNEL);
+ if (!sblock->pages[page_index]) {
+ kfree(ssector);
+ return NULL;
+ }
+ ret = attach_scrub_page_private(sblock->pages[page_index],
+ sblock->logical + (page_index << PAGE_SHIFT));
+ if (ret < 0) {
+ kfree(ssector);
+ __free_page(sblock->pages[page_index]);
+ sblock->pages[page_index] = NULL;
+ return NULL;
+ }
+ }
+
+ atomic_set(&ssector->refs, 1);
+ ssector->sblock = sblock;
+ /* The sector to be added should not be used */
+ ASSERT(sblock->sectors[sblock->sector_count] == NULL);
+ ssector->offset = logical - sblock->logical;
+
+ /* The sector count must be smaller than the limit */
+ ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+ sblock->sectors[sblock->sector_count] = ssector;
+ sblock->sector_count++;
+ sblock->len += sblock->sctx->fs_info->sectorsize;
+
+ return ssector;
+}
+
+static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+ pgoff_t index;
+ /*
+ * When calling this function, ssector must be already attached to the
+ * parent sblock. 
+ */ + ASSERT(sblock); + + /* The range should be inside the sblock range */ + ASSERT(ssector->offset < sblock->len); + + return offset_in_page(ssector->offset); +} + +static char *scrub_sector_get_kaddr(struct scrub_sector *ssector) +{ + return page_address(scrub_sector_get_page(ssector)) + + scrub_sector_get_page_offset(ssector); +} + +static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector, + unsigned int len) +{ + return bio_add_page(bio, scrub_sector_get_page(ssector), len, + scrub_sector_get_page_offset(ssector)); +} + static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck); + struct scrub_block *sblocks_for_recheck[]); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int retry_failed_mirror); @@ -533,10 +725,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; - for (i = 0; i < sbio->sector_count; i++) { - WARN_ON(!sbio->sectors[i]->page); + for (i = 0; i < sbio->sector_count; i++) scrub_block_put(sbio->sectors[i]->sblock); - } bio_put(sbio->bio); } @@ -618,8 +808,8 @@ nomem: return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, - void *warn_ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) { u32 nlink; int ret; @@ -686,7 +876,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, fs_info->sectorsize, nlink, @@ -700,7 +890,7 @@ err: btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, ret); @@ -718,7 +908,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) struct btrfs_extent_item *ei; struct scrub_warning swarn; unsigned long ptr = 0; - u64 extent_item_pos; u64 flags = 0; u64 ref_root; u32 item_size; @@ -726,15 +915,21 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) int ret; WARN_ON(sblock->sector_count < 1); - dev = sblock->sectors[0]->dev; + dev = sblock->dev; fs_info = sblock->sctx->fs_info; + /* Super block error, no need to search extent tree. 
*/ + if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + errstr, btrfs_dev_name(dev), sblock->physical); + return; + } path = btrfs_alloc_path(); if (!path) return; - swarn.physical = sblock->sectors[0]->physical; - swarn.logical = sblock->sectors[0]->logical; + swarn.physical = sblock->physical; + swarn.logical = sblock->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -743,7 +938,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (ret < 0) goto out; - extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; @@ -758,7 +952,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, - rcu_str_deref(dev->name), + btrfs_dev_name(dev), swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, @@ -766,12 +960,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); btrfs_release_path(path); } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_release_path(path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = swarn.logical - found_key.objectid; + ctx.fs_info = fs_info; + swarn.path = path; swarn.dev = dev; - iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, - scrub_print_warning_inode, &swarn, false); + + iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } out: @@ -804,13 +1004,14 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { struct scrub_ctx *sctx = sblock_to_check->sctx; - struct btrfs_device *dev; + struct btrfs_device *dev = sblock_to_check->dev; struct btrfs_fs_info *fs_info; u64 logical; unsigned int failed_mirror_index; unsigned int is_metadata; unsigned int have_csum; - struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + /* One scrub_block for each mirror */ + struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 }; struct scrub_block *sblock_bad; int ret; int mirror_index; @@ -825,22 +1026,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fs_info = sctx->fs_info; if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { /* - * if we find an error in a super block, we just report it. + * If we find an error in a super block, we just report it. 
* They will get written with the next transaction commit * anyway */ + scrub_print_warning("super block error", sblock_to_check); spin_lock(&sctx->stat_lock); ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); return 0; } - logical = sblock_to_check->sectors[0]->logical; - BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1); - failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1; + logical = sblock_to_check->logical; + ASSERT(sblock_to_check->mirror_num); + failed_mirror_index = sblock_to_check->mirror_num - 1; is_metadata = !(sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA); have_csum = sblock_to_check->sectors[0]->have_csum; - dev = sblock_to_check->sectors[0]->dev; if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -902,17 +1104,28 @@ * repaired area is verified in order to correctly maintain * the statistics. */ - - sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, - sizeof(*sblocks_for_recheck), GFP_KERNEL); - if (!sblocks_for_recheck) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { + /* + * Note: the two members refs and outstanding_sectors are not + * used in the blocks that are used for the recheck procedure. + * + * But alloc_scrub_block() will initialize sblock::refs anyway, + * so we can use scrub_block_put() to clean them up. + * + * And here we don't set up the physical/dev for the sblock yet; + * they will be correctly initialized in scrub_setup_recheck_block(). 
+ */ + sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL, + logical, 0, 0, mirror_index); + if (!sblocks_for_recheck[mirror_index]) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + goto out; + } } /* Setup the context, map the logical blocks and alloc the sectors */ @@ -926,7 +1139,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); - sblock_bad = sblocks_for_recheck + failed_mirror_index; + sblock_bad = sblocks_for_recheck[failed_mirror_index]; /* build and submit the bios for the failed mirror, check checksums */ scrub_recheck_block(fs_info, sblock_bad, 1); @@ -1011,22 +1224,22 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { if (mirror_index >= BTRFS_MAX_MIRRORS) break; - if (!sblocks_for_recheck[mirror_index].sector_count) + if (!sblocks_for_recheck[mirror_index]->sector_count) break; - sblock_other = sblocks_for_recheck + mirror_index; + sblock_other = sblocks_for_recheck[mirror_index]; } else { struct scrub_recover *r = sblock_bad->sectors[0]->recover; int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; - if (!sblocks_for_recheck[1].sector_count) + if (!sblocks_for_recheck[1]->sector_count) break; ASSERT(failed_mirror_index == 0); - sblock_other = sblocks_for_recheck + 1; - sblock_other->sectors[0]->mirror_num = 1 + mirror_index; + sblock_other = sblocks_for_recheck[1]; + sblock_other->mirror_num = 1 + mirror_index; } /* build and submit the bios, check checksums */ @@ -1097,12 +1310,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* Try to find no-io-error sector in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].sector_count > 0; + sblocks_for_recheck[mirror_index]->sector_count > 0; mirror_index++) { - if (!sblocks_for_recheck[mirror_index]. 
+ if (!sblocks_for_recheck[mirror_index]-> sectors[sector_num]->io_error) { - sblock_other = sblocks_for_recheck + - mirror_index; + sblock_other = sblocks_for_recheck[mirror_index]; break; } } @@ -1163,7 +1375,7 @@ corrected_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } } else { did_not_correct_error: spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } out: - if (sblocks_for_recheck) { - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; - mirror_index++) { - struct scrub_block *sblock = sblocks_for_recheck + - mirror_index; - struct scrub_recover *recover; - int i; - - for (i = 0; i < sblock->sector_count; i++) { - sblock->sectors[i]->sblock = NULL; - recover = sblock->sectors[i]->recover; - if (recover) { - scrub_put_recover(fs_info, recover); - sblock->sectors[i]->recover = NULL; - } - scrub_sector_put(sblock->sectors[i]); + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck[mirror_index]; + struct scrub_recover *recover; + int sector_index; + + /* Not allocated, continue checking the next mirror */ + if (!sblock) + continue; + + for (sector_index = 0; sector_index < sblock->sector_count; + sector_index++) { + /* + * Here we just clean up the recover; each sector will be + * properly cleaned up by a later scrub_block_put() + */ + recover = sblock->sectors[sector_index]->recover; + if (recover) { + scrub_put_recover(fs_info, recover); + sblock->sectors[sector_index]->recover = NULL; } } - kfree(sblocks_for_recheck); + scrub_block_put(sblock); } ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); @@ -1244,12 +1459,12 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, } static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck) + struct scrub_block *sblocks_for_recheck[]) { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; + u64 logical = original_sblock->logical; u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; - u64 logical = original_sblock->sectors[0]->logical; u64 generation = original_sblock->sectors[0]->generation; u64 flags = original_sblock->sectors[0]->flags; u64 have_csum = original_sblock->sectors[0]->have_csum; @@ -1264,11 +1479,6 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, int nmirrors; int ret; - /* - * Note: the two members refs and outstanding_sectors are not used (and - * not set) in the blocks that are used for the recheck procedure. 
- */ - while (length > 0) { sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; @@ -1287,7 +1497,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, return -EIO; } - recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); + recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); if (!recover) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1307,24 +1517,19 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblock; struct scrub_sector *sector; - sblock = sblocks_for_recheck + mirror_index; + sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = kzalloc(sizeof(*sector), GFP_NOFS); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { -leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_put_recover(fs_info, recover); return -ENOMEM; } - scrub_sector_get(sector); - sblock->sectors[sector_index] = sector; - sector->sblock = sblock; sector->flags = flags; sector->generation = generation; - sector->logical = logical; sector->have_csum = have_csum; if (have_csum) memcpy(sector->csum, @@ -1339,21 +1544,20 @@ leave_nomem: mirror_index, &stripe_index, &stripe_offset); - sector->physical = bioc->stripes[stripe_index].physical + - stripe_offset; - sector->dev = bioc->stripes[stripe_index].dev; + /* + * We're at the first sector, also populate @sblock + * physical and dev. + */ + if (sector_index == 0) { + sblock->physical = + bioc->stripes[stripe_index].physical + + stripe_offset; + sblock->dev = bioc->stripes[stripe_index].dev; + sblock->physical_for_dev_replace = + original_sblock->physical_for_dev_replace; + } BUG_ON(sector_index >= original_sblock->sector_count); - sector->physical_for_dev_replace = - original_sblock->sectors[sector_index]-> - physical_for_dev_replace; - /* For missing devices, dev->bdev is NULL */ - sector->mirror_num = mirror_index + 1; - sblock->sector_count++; - sector->page = alloc_page(GFP_NOFS); - if (!sector->page) - goto leave_nomem; - scrub_get_recover(recover); sector->recover = recover; } @@ -1377,11 +1581,11 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, { DECLARE_COMPLETION_ONSTACK(done); - bio->bi_iter.bi_sector = sector->logical >> 9; + bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >> + SECTOR_SHIFT; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - raid56_parity_recover(bio, sector->recover->bioc, - sector->sblock->sectors[0]->mirror_num, false); + raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num); wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); @@ -1395,17 +1599,16 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, int i; /* All sectors in sblock belong to the same stripe on the same device. 
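Thus a single read bio against sblock->dev can carry all of them.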
*/ - ASSERT(first_sector->dev); - if (!first_sector->dev->bdev) + ASSERT(sblock->dev); + if (!sblock->dev->bdev) goto out; - bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); + bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); for (i = 0; i < sblock->sector_count; i++) { struct scrub_sector *sector = sblock->sectors[i]; - WARN_ON(!sector->page); - bio_add_page(bio, sector->page, PAGE_SIZE, 0); + bio_add_scrub_sector(bio, sector, fs_info->sectorsize); } if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { @@ -1449,16 +1652,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct bio bio; struct bio_vec bvec; - if (sector->dev->bdev == NULL) { + if (sblock->dev->bdev == NULL) { sector->io_error = 1; sblock->no_io_error_seen = 0; continue; } - WARN_ON(!sector->page); - bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ); - bio_add_page(&bio, sector->page, fs_info->sectorsize, 0); - bio.bi_iter.bi_sector = sector->physical >> 9; + bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ); + bio_add_scrub_sector(&bio, sector, fs_info->sectorsize); + bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >> + SECTOR_SHIFT; btrfsic_check_bio(&bio); if (submit_bio_wait(&bio)) { @@ -1475,7 +1678,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) { - struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices; + struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices; int ret; ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -1521,30 +1724,29 @@ static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; const u32 sectorsize = fs_info->sectorsize; - BUG_ON(sector_bad->page == NULL); - BUG_ON(sector_good->page == NULL); if (force_write || sblock_bad->header_error || sblock_bad->checksum_error || sector_bad->io_error) { struct bio bio; struct bio_vec bvec; int ret; - if (!sector_bad->dev->bdev) { + if (!sblock_bad->dev->bdev) { btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } - bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); - bio.bi_iter.bi_sector = sector_bad->physical >> 9; - __bio_add_page(&bio, sector_good->page, sectorsize, 0); + bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); + bio.bi_iter.bi_sector = (sblock_bad->physical + + sector_bad->offset) >> SECTOR_SHIFT; + ret = bio_add_scrub_sector(&bio, sector_good, sectorsize); btrfsic_check_bio(&bio); ret = submit_bio_wait(&bio); bio_uninit(&bio); if (ret) { - btrfs_dev_stat_inc_and_print(sector_bad->dev, + btrfs_dev_stat_inc_and_print(sblock_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); atomic64_inc(&fs_info->dev_replace.num_write_errors); return -EIO; @@ -1577,11 +1779,11 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) { + const u32 sectorsize = sblock->sctx->fs_info->sectorsize; struct scrub_sector *sector = sblock->sectors[sector_num]; - BUG_ON(sector->page == NULL); if (sector->io_error) - clear_page(page_address(sector->page)); + memset(scrub_sector_get_kaddr(sector), 0, sectorsize); return scrub_add_sector_to_wr_bio(sblock->sctx, sector); } @@ -1608,9 +1810,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } +static void 
scrub_block_get(struct scrub_block *sblock) +{ + refcount_inc(&sblock->refs); +} + static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, struct scrub_sector *sector) { + struct scrub_block *sblock = sector->sblock; struct scrub_bio *sbio; int ret; const u32 sectorsize = sctx->fs_info->sectorsize; @@ -1629,14 +1837,15 @@ again: } sbio = sctx->wr_curr_bio; if (sbio->sector_count == 0) { - ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace); + ret = fill_writer_pointer_gap(sctx, sector->offset + + sblock->physical_for_dev_replace); if (ret) { mutex_unlock(&sctx->wr_lock); return ret; } - sbio->physical = sector->physical_for_dev_replace; - sbio->logical = sector->logical; + sbio->physical = sblock->physical_for_dev_replace + sector->offset; + sbio->logical = sblock->logical + sector->offset; sbio->dev = sctx->wr_tgtdev; if (!sbio->bio) { sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, @@ -1647,14 +1856,14 @@ again: sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; } else if (sbio->physical + sbio->sector_count * sectorsize != - sector->physical_for_dev_replace || + sblock->physical_for_dev_replace + sector->offset || sbio->logical + sbio->sector_count * sectorsize != - sector->logical) { + sblock->logical + sector->offset) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); + ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); if (ret != sectorsize) { if (sbio->sector_count < 1) { bio_put(sbio->bio); @@ -1668,6 +1877,13 @@ again: sbio->sectors[sbio->sector_count] = sector; scrub_sector_get(sector); + /* + * Since ssector no longer holds a page, but uses sblock::pages, we + * have to ensure the sblock has not been freed before our write bio + * finishes. + */ + scrub_block_get(sector->sblock); + sbio->sector_count++; if (sbio->sector_count == sctx->sectors_per_bio) scrub_wr_submit(sctx); @@ -1729,8 +1945,14 @@ static void scrub_wr_bio_end_io_worker(struct work_struct *work) } } - for (i = 0; i < sbio->sector_count; i++) + /* + * In scrub_add_sector_to_wr_bio() we grab an extra ref for the sblock; + * now in the endio we should put the sblock. + */ + for (i = 0; i < sbio->sector_count; i++) { + scrub_block_put(sbio->sectors[i]->sblock); scrub_sector_put(sbio->sectors[i]); + } bio_put(sbio->bio); kfree(sbio); @@ -1762,7 +1984,7 @@ static int scrub_checksum(struct scrub_block *sblock) else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = scrub_checksum_tree_block(sblock); else if (flags & BTRFS_EXTENT_FLAG_SUPER) - (void)scrub_checksum_super(sblock); + ret = scrub_checksum_super(sblock); else WARN_ON(1); if (ret) @@ -1785,15 +2007,11 @@ static int scrub_checksum_data(struct scrub_block *sblock) if (!sector->have_csum) return 0; - kaddr = page_address(sector->page); + kaddr = scrub_sector_get_kaddr(sector); shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - /* - * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector - * only contains one sector of data. 
- */ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); if (memcmp(csum, sector->csum, fs_info->csum_size)) @@ -1826,7 +2044,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) ASSERT(sblock->sector_count == num_sectors); sector = sblock->sectors[0]; - kaddr = page_address(sector->page); + kaddr = scrub_sector_get_kaddr(sector); h = (struct btrfs_header *)kaddr; memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); @@ -1835,7 +2053,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sector->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) sblock->header_error = 1; if (sector->generation != btrfs_stack_header_generation(h)) { @@ -1856,7 +2074,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) sectorsize - BTRFS_CSUM_SIZE); for (i = 1; i < num_sectors; i++) { - kaddr = page_address(sblock->sectors[i]->page); + kaddr = scrub_sector_get_kaddr(sblock->sectors[i]); crypto_shash_update(shash, kaddr, sectorsize); } @@ -1881,10 +2099,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) BUG_ON(sblock->sector_count < 1); sector = sblock->sectors[0]; - kaddr = page_address(sector->page); + kaddr = scrub_sector_get_kaddr(sector); s = (struct btrfs_super_block *)kaddr; - if (sector->logical != btrfs_super_bytenr(s)) + if (sblock->logical != btrfs_super_bytenr(s)) ++fail_cor; if (sector->generation != btrfs_super_generation(s)) @@ -1901,31 +2119,9 @@ static int scrub_checksum_super(struct scrub_block *sblock) if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) ++fail_cor; - if (fail_cor + fail_gen) { - /* - * if we find an error in a super block, we just report it. 
- * They will get written with the next transaction commit - * anyway - */ - spin_lock(&sctx->stat_lock); - ++sctx->stat.super_errors; - spin_unlock(&sctx->stat_lock); - if (fail_cor) - btrfs_dev_stat_inc_and_print(sector->dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - else - btrfs_dev_stat_inc_and_print(sector->dev, - BTRFS_DEV_STAT_GENERATION_ERRS); - } - return fail_cor + fail_gen; } -static void scrub_block_get(struct scrub_block *sblock) -{ - refcount_inc(&sblock->refs); -} - static void scrub_block_put(struct scrub_block *sblock) { if (refcount_dec_and_test(&sblock->refs)) { @@ -1936,6 +2132,12 @@ static void scrub_block_put(struct scrub_block *sblock) for (i = 0; i < sblock->sector_count; i++) scrub_sector_put(sblock->sectors[i]); + for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) { + if (sblock->pages[i]) { + detach_scrub_page_private(sblock->pages[i]); + __free_page(sblock->pages[i]); + } + } kfree(sblock); } } @@ -1947,11 +2149,8 @@ static void scrub_sector_get(struct scrub_sector *sector) static void scrub_sector_put(struct scrub_sector *sector) { - if (atomic_dec_and_test(&sector->refs)) { - if (sector->page) - __free_page(sector->page); + if (atomic_dec_and_test(&sector->refs)) kfree(sector); - } } /* @@ -2056,9 +2255,9 @@ again: } sbio = sctx->bios[sctx->curr]; if (sbio->sector_count == 0) { - sbio->physical = sector->physical; - sbio->logical = sector->logical; - sbio->dev = sector->dev; + sbio->physical = sblock->physical + sector->offset; + sbio->logical = sblock->logical + sector->offset; + sbio->dev = sblock->dev; if (!sbio->bio) { sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, REQ_OP_READ, GFP_NOFS); @@ -2068,16 +2267,16 @@ again: sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; } else if (sbio->physical + sbio->sector_count * sectorsize != - sector->physical || + sblock->physical + sector->offset || sbio->logical + sbio->sector_count * sectorsize != - sector->logical || - sbio->dev != sector->dev) { + sblock->logical + sector->offset || + sbio->dev != sblock->dev) { scrub_submit(sctx); goto again; } sbio->sectors[sbio->sector_count] = sector; - ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); + ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); if (ret != sectorsize) { if (sbio->sector_count < 1) { bio_put(sbio->bio); @@ -2102,6 +2301,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; + btrfs_bio_counter_dec(fs_info); if (bio->bi_status) sblock->no_io_error_seen = 0; @@ -2118,8 +2318,8 @@ static void scrub_missing_raid56_worker(struct work_struct *work) u64 logical; struct btrfs_device *dev; - logical = sblock->sectors[0]->logical; - dev = sblock->sectors[0]->dev; + logical = sblock->logical; + dev = sblock->dev; if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); @@ -2130,14 +2330,14 @@ static void scrub_missing_raid56_worker(struct work_struct *work) spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "IO error rebuilding logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "failed to rebuild valid logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else { scrub_write_block_to_dev_replace(sblock); } @@ -2157,7 
+2357,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct scrub_ctx *sctx = sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; u64 length = sblock->sector_count << fs_info->sectorsize_bits; - u64 logical = sblock->sectors[0]->logical; + u64 logical = sblock->logical; struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; @@ -2193,17 +2393,16 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) for (i = 0; i < sblock->sector_count; i++) { struct scrub_sector *sector = sblock->sectors[i]; - /* - * For now, our scrub is still one page per sector, so pgoff - * is always 0. - */ - raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical); + raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector), + scrub_sector_get_page_offset(sector), + sector->offset + sector->sblock->logical); } INIT_WORK(&sblock->work, scrub_missing_raid56_worker); scrub_block_get(sblock); scrub_pending_bio_inc(sctx); raid56_submit_missing_rbio(rbio); + btrfs_put_bioc(bioc); return; rbio_out: @@ -2225,7 +2424,8 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, const u32 sectorsize = sctx->fs_info->sectorsize; int index; - sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); + sblock = alloc_scrub_block(sctx, dev, logical, physical, + physical_for_dev_replace, mirror_num); if (!sblock) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2233,12 +2433,6 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, return -ENOMEM; } - /* one ref inside this function, plus one for each page added to - * a bio later on */ - refcount_set(&sblock->refs, 1); - sblock->sctx = sctx; - sblock->no_io_error_seen = 1; - for (index = 0; len > 0; index++) { struct scrub_sector *sector; /* @@ -2248,36 +2442,22 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - sector = kzalloc(sizeof(*sector), GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { -leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); - scrub_sector_get(sector); - sblock->sectors[index] = sector; - sector->sblock = sblock; - sector->dev = dev; sector->flags = flags; sector->generation = gen; - sector->logical = logical; - sector->physical = physical; - sector->physical_for_dev_replace = physical_for_dev_replace; - sector->mirror_num = mirror_num; if (csum) { sector->have_csum = 1; memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { sector->have_csum = 0; } - sblock->sector_count++; - sector->page = alloc_page(GFP_KERNEL); - if (!sector->page) - goto leave_nomem; len -= l; logical += l; physical += l; @@ -2423,8 +2603,9 @@ static void scrub_block_complete(struct scrub_block *sblock) } if (sblock->sparity && corrupted && !sblock->data_corrected) { - u64 start = sblock->sectors[0]->logical; - u64 end = sblock->sectors[sblock->sector_count - 1]->logical + + u64 start = sblock->logical; + u64 end = sblock->logical + + sblock->sectors[sblock->sector_count - 1]->offset + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); @@ -2578,7 +2759,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity, ASSERT(IS_ALIGNED(len, sectorsize)); - sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); + sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num); if (!sblock) { 
spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2586,51 +2767,32 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity, return -ENOMEM; } - /* one ref inside this function, plus one for each page added to - * a bio later on */ - refcount_set(&sblock->refs, 1); - sblock->sctx = sctx; - sblock->no_io_error_seen = 1; sblock->sparity = sparity; scrub_parity_get(sparity); for (index = 0; len > 0; index++) { struct scrub_sector *sector; - sector = kzalloc(sizeof(*sector), GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { -leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); - /* For scrub block */ - scrub_sector_get(sector); sblock->sectors[index] = sector; /* For scrub parity */ scrub_sector_get(sector); list_add_tail(&sector->list, &sparity->sectors_list); - sector->sblock = sblock; - sector->dev = dev; sector->flags = flags; sector->generation = gen; - sector->logical = logical; - sector->physical = physical; - sector->mirror_num = mirror_num; if (csum) { sector->have_csum = 1; memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { sector->have_csum = 0; } - sblock->sector_count++; - sector->page = alloc_page(GFP_KERNEL); - if (!sector->page) - goto leave_nomem; - /* Iterate over the stripe range in sectorsize steps */ len -= sectorsize; @@ -2774,6 +2936,7 @@ static void scrub_parity_bio_endio_worker(struct work_struct *work) work); struct scrub_ctx *sctx = sparity->sctx; + btrfs_bio_counter_dec(sctx->fs_info); scrub_free_parity(sparity); scrub_pending_bio_dec(sctx); } @@ -2824,6 +2987,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) sparity->scrub_dev, &sparity->dbitmap, sparity->nsectors); + btrfs_put_bioc(bioc); if (!rbio) goto rbio_out; @@ -2835,7 +2999,6 @@ rbio_out: bio_put(bio); bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bioc(bioc); bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); @@ -3075,9 +3238,9 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, extent_dev = bioc->stripes[0].dev; btrfs_put_bioc(bioc); - ret = btrfs_lookup_csums_range(csum_root, extent_start, - extent_start + extent_size - 1, - &sctx->csum_list, 1); + ret = btrfs_lookup_csums_list(csum_root, extent_start, - extent_start + extent_size - 1, + &sctx->csum_list, 1, false); if (ret) { scrub_parity_mark_sectors_error(sparity, extent_start, extent_size); @@ -3266,7 +3429,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } /* Block group removed? */ spin_lock(&bg->lock); - if (bg->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { spin_unlock(&bg->lock); ret = 0; break; } @@ -3301,9 +3464,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_logical; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, cur_logical, + ret = btrfs_lookup_csums_list(csum_root, cur_logical, cur_logical + scrub_len - 1, - &sctx->csum_list, 1); + &sctx->csum_list, 1, false); if (ret) break; } @@ -3606,7 +3769,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, * kthread or relocation. 
*/ spin_lock(&bg->lock); - if (!bg->removed) + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) ret = -EINVAL; spin_unlock(&bg->lock); @@ -3764,13 +3927,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, } if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { - spin_lock(&cache->lock); - if (!cache->to_copy) { - spin_unlock(&cache->lock); + if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { btrfs_put_block_group(cache); goto skip; } - spin_unlock(&cache->lock); } /* @@ -3782,7 +3942,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * repair extents. */ spin_lock(&cache->lock); - if (cache->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { spin_unlock(&cache->lock); btrfs_put_block_group(cache); goto skip; @@ -3942,8 +4102,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * balance is triggered or it becomes used and unused again. */ spin_lock(&cache->lock); - if (!cache->removed && !cache->ro && cache->reserved == 0 && - cache->used == 0) { + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && + !cache->ro && cache->reserved == 0 && cache->used == 0) { spin_unlock(&cache->lock); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_queue_work(&fs_info->discard_ctl, @@ -4102,36 +4262,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, int ret; struct btrfs_device *dev; unsigned int nofs_flag; + bool need_commit = false; if (btrfs_fs_closing(fs_info)) return -EAGAIN; - if (fs_info->nodesize > BTRFS_STRIPE_LEN) { - /* - * in this case scrub is unable to calculate the checksum - * the way scrub is implemented. Do not handle this - * situation at all because it won't ever happen. - */ - btrfs_err(fs_info, - "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", - fs_info->nodesize, - BTRFS_STRIPE_LEN); - return -EINVAL; - } + /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ + ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); - if (fs_info->nodesize > - SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits || - fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) { - /* - * Would exhaust the array bounds of sectorv member in - * struct scrub_block - */ - btrfs_err(fs_info, -"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails", - fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK, - fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK); - return -EINVAL; - } + /* + * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible + * value (max nodesize / min sectorsize), thus nodesize should always + * be fine. 
+ */ + ASSERT(fs_info->nodesize <= + SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); /* Allocate outside of device_list_mutex */ sctx = scrub_setup_ctx(fs_info, is_dev_replace); @@ -4156,7 +4301,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub on devid %llu: filesystem on %s is not writable", - devid, rcu_str_deref(dev->name)); + devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; } @@ -4205,6 +4350,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, */ nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { + u64 old_super_errors; + + spin_lock(&sctx->stat_lock); + old_super_errors = sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + btrfs_info(fs_info, "scrub: started on devid %llu", devid); /* * by holding device list mutex, we can */ mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + spin_lock(&sctx->stat_lock); + /* + * Super block errors found, but we cannot commit the transaction + * in the current context, since btrfs_commit_transaction() needs + * to pause the current running scrub (held by ourselves). + */ + if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) + need_commit = true; + spin_unlock(&sctx->stat_lock); } if (!ret) @@ -4239,6 +4400,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); scrub_put_ctx(sctx); + /* + * We found some super block errors before; now try to force a + * transaction commit, as scrub has finished. + */ + if (need_commit) { + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_err(fs_info, + "scrub: failed to start transaction to fix super block errors: %d", ret); + return ret; + } + ret = btrfs_commit_transaction(trans); + if (ret < 0) + btrfs_err(fs_info, + "scrub: failed to commit transaction to fix super block errors: %d", ret); + } return ret; out: scrub_workers_put(fs_info); diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h new file mode 100644 index 000000000000..7639103ebf9d --- /dev/null +++ b/fs/btrfs/scrub.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SCRUB_H +#define BTRFS_SCRUB_H + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_device *dev); +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + struct btrfs_scrub_progress *progress); + +#endif diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e7671afcee4f..e65e6b6600a7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -15,6 +15,7 @@ #include <linux/string.h> #include <linux/compat.h> #include <linux/crc32c.h> +#include <linux/fsverity.h> #include "send.h" #include "ctree.h" @@ -26,6 +27,11 @@ #include "compression.h" #include "xattr.h" #include "print-tree.h" +#include "accessors.h" +#include "dir-item.h" +#include "file-item.h" +#include "ioctl.h" +#include "verity.h" /* * Maximum number of references an extent can have 
in order for us to attempt to @@ -33,7 +39,7 @@ * avoid hitting limitations of the backreference walking code (taking a lot of * time and using too much memory for extents with a large number of references). */ -#define SEND_MAX_EXTENT_REFS 64 +#define SEND_MAX_EXTENT_REFS 1024 /* * A fs_path is a helper to dynamically build path names with unknown size. @@ -70,13 +76,46 @@ struct clone_root { struct btrfs_root *root; u64 ino; u64 offset; - - u64 found_refs; + u64 num_bytes; + bool found_ref; }; #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +/* + * Limit the root_ids array of struct backref_cache_entry to 12 elements. + * This makes the size of a cache entry exactly 128 bytes on x86_64. + * The most common case is to have a single root for cloning, which corresponds + * to the send root. Having the user specify more than 11 clone roots is not + * common, and in such rare cases we simply don't use caching if the number of + * cloning roots that lead down to a leaf is more than 12. + */ +#define SEND_MAX_BACKREF_CACHE_ROOTS 12 + +/* + * Max number of entries in the cache. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding + * maple tree's internal nodes, is 16K. + */ +#define SEND_MAX_BACKREF_CACHE_SIZE 128 + +/* + * A backref cache entry maps a leaf to a list of IDs of roots from which the + * leaf is accessible and which we can use for clone operations. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on + * x86_64). + */ +struct backref_cache_entry { + /* List to link to the cache's lru list. */ + struct list_head list; + /* The key for this entry in the cache. */ + u64 key; + u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; + /* Number of valid elements in the root_ids array. */ + int num_roots; +}; + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -127,6 +166,8 @@ struct send_ctx { bool cur_inode_new_gen; bool cur_inode_deleted; bool ignore_cur_inode; + bool cur_inode_needs_verity; + void *verity_descriptor; u64 send_progress; @@ -243,6 +284,14 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; + + struct { + u64 last_reloc_trans; + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. */ + int size; + } backref_cache; }; struct pending_dir_move { @@ -345,6 +394,7 @@ static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) switch (sctx->proto) { case 1: return cmd <= BTRFS_SEND_C_MAX_V1; case 2: return cmd <= BTRFS_SEND_C_MAX_V2; + case 3: return cmd <= BTRFS_SEND_C_MAX_V3; default: return false; } } @@ -436,6 +486,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) old_buf_len = p->buf_len; /* + * Allocate to the next largest kmalloc bucket size, to let + * the fast path happen most of the time. 
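+ * kmalloc_size_roundup() returns the size of the kmalloc bucket that an + * allocation of this length would actually occupy, so the extra space + * can be used before any further reallocation is needed.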
+ */ + len = kmalloc_size_roundup(len); + /* * First time the inline_buf does not suffice */ if (p->buf == p->inline_buf) { @@ -448,11 +503,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (!tmp_buf) return -ENOMEM; p->buf = tmp_buf; - /* - * The real size of the buffer is bigger, this will let the fast path - * happen most of the time - */ - p->buf_len = ksize(p->buf); + p->buf_len = len; if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; @@ -624,6 +675,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(8) TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) @@ -842,17 +894,32 @@ out: return ret; } +struct btrfs_inode_info { + u64 size; + u64 gen; + u64 mode; + u64 uid; + u64 gid; + u64 rdev; + u64 fileattr; + u64 nlink; +}; + /* * Helper function to retrieve some fields from an inode item. */ -static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, - u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, - u64 *gid, u64 *rdev, u64 *fileattr) +static int get_inode_info(struct btrfs_root *root, u64 ino, + struct btrfs_inode_info *info) { int ret; + struct btrfs_path *path; struct btrfs_inode_item *ii; struct btrfs_key key; + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; @@ -860,47 +927,43 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, if (ret) { if (ret > 0) ret = -ENOENT; - return ret; + goto out; } + if (!info) + goto out; + ii = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - if (size) - *size = btrfs_inode_size(path->nodes[0], ii); - if (gen) - *gen = btrfs_inode_generation(path->nodes[0], ii); - if (mode) - *mode = btrfs_inode_mode(path->nodes[0], ii); - if (uid) - *uid = btrfs_inode_uid(path->nodes[0], ii); - if (gid) - *gid = btrfs_inode_gid(path->nodes[0], ii); - if (rdev) - *rdev = btrfs_inode_rdev(path->nodes[0], ii); + info->size = btrfs_inode_size(path->nodes[0], ii); + info->gen = btrfs_inode_generation(path->nodes[0], ii); + info->mode = btrfs_inode_mode(path->nodes[0], ii); + info->uid = btrfs_inode_uid(path->nodes[0], ii); + info->gid = btrfs_inode_gid(path->nodes[0], ii); + info->rdev = btrfs_inode_rdev(path->nodes[0], ii); + info->nlink = btrfs_inode_nlink(path->nodes[0], ii); /* * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's * otherwise logically split to 32/32 parts. 
*/ - if (fileattr) - *fileattr = btrfs_inode_flags(path->nodes[0], ii); + info->fileattr = btrfs_inode_flags(path->nodes[0], ii); +out: + btrfs_free_path(path); return ret; } -static int get_inode_info(struct btrfs_root *root, - u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid, - u64 *rdev, u64 *fileattr) +static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { - struct btrfs_path *path; int ret; + struct btrfs_inode_info info; - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, - rdev, fileattr); - btrfs_free_path(path); + if (!gen) + return -EPERM; + + ret = get_inode_info(root, ino, &info); + if (!ret) + *gen = info.gen; return ret; } @@ -1077,7 +1140,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, data_len = btrfs_dir_data_len(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) { + if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) { ret = -ENAMETOOLONG; goto out; @@ -1220,8 +1283,12 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - /* Just to check for bugs in backref resolving */ - int found_itself; + /* The bytenr the file extent item we are processing refers to. */ + u64 bytenr; + /* The owner (root id) of the data backref for the current extent. */ + u64 backref_owner; + /* The offset of the data backref for the current extent. */ + u64 backref_offset; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1250,32 +1317,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) /* * Called for every backref that is found for the current extent. - * Results are collected in sctx->clone_roots->ino/offset/found_refs + * Results are collected in sctx->clone_roots->ino/offset. */ -static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) +static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, + void *ctx_) { struct backref_ctx *bctx = ctx_; - struct clone_root *found; + struct clone_root *clone_root; /* First check if the root is in the list of accepted clone sources */ - found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, - bctx->sctx->clone_roots_cnt, - sizeof(struct clone_root), - __clone_root_cmp_bsearch); - if (!found) + clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!clone_root) return 0; - if (found->root == bctx->sctx->send_root && + /* This is our own reference, bail out as we can't clone from it. */ + if (clone_root->root == bctx->sctx->send_root && ino == bctx->cur_objectid && - offset == bctx->cur_offset) { - bctx->found_itself = 1; - } + offset == bctx->cur_offset) + return 0; /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. 
*/ - if (found->root == bctx->sctx->send_root) { + if (clone_root->root == bctx->sctx->send_root) { /* * If the source inode was not yet processed we can't issue a * clone operation, as the source extent does not exist yet at @@ -1296,21 +1364,217 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) } bctx->found++; - found->found_refs++; - if (ino < found->ino) { - found->ino = ino; - found->offset = offset; - } else if (found->ino == ino) { + clone_root->found_ref = true; + + /* + * If the given backref refers to a file extent item with a larger + * number of bytes than what we found before, use the new one so that + * we clone more optimally and end up doing fewer writes and getting + * fewer exclusive, non-shared extents at the destination. + */ + if (num_bytes > clone_root->num_bytes) { + clone_root->ino = ino; + clone_root->offset = offset; + clone_root->num_bytes = num_bytes; + /* - * same extent found more then once in the same file. + * Found a perfect candidate, so there's no need to continue + * backref walking. */ - if (found->offset > offset + bctx->extent_len) - found->offset = offset; + if (num_bytes >= bctx->extent_len) + return BTRFS_ITERATE_EXTENT_INODES_STOP; } return 0; } +static void empty_backref_cache(struct send_ctx *sctx) +{ + struct backref_cache_entry *entry; + struct backref_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) + kfree(entry); + + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mtree_destroy(&sctx->backref_cache.entries); + sctx->backref_cache.size = 0; +} + +static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + const u64 **root_ids_ret, int *root_count_ret) +{ + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct backref_cache_entry *entry; + + if (sctx->backref_cache.size == 0) + return false; + + /* + * If relocation happened since we first filled the cache, then we must + * empty the cache and cannot use it, because even though we operate on + * read-only roots, their leaves and nodes may have been reallocated and + * now be used for different nodes/leaves of the same tree or some other + * tree. + * + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { + empty_backref_cache(sctx); + return false; + } + + entry = mtree_load(&sctx->backref_cache.entries, key); + if (!entry) + return false; + + *root_ids_ret = entry->root_ids; + *root_count_ret = entry->num_roots; + list_move_tail(&entry->list, &sctx->backref_cache.lru_list); + + return true; +} + +static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + void *ctx) +{ + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct backref_cache_entry *new_entry; + struct ulist_iterator uiter; + struct ulist_node *node; + int ret; + + /* + * We're called while holding a transaction handle or while holding + * fs_info->commit_root_sem (at iterate_extent_inodes()), so we must do + * a NOFS allocation. + */ + new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); + /* No worries, cache is optional. 
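On allocation failure we simply skip caching this leaf.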
*/ + if (!new_entry) + return; + + new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->num_roots = 0; + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(root_ids, &uiter)) != NULL) { + const u64 root_id = node->val; + struct clone_root *root; + + root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, + sctx->clone_roots_cnt, sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!root) + continue; + + /* Too many roots, just exit, no worries as caching is optional. */ + if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { + kfree(new_entry); + return; + } + + new_entry->root_ids[new_entry->num_roots] = root_id; + new_entry->num_roots++; + } + + /* + * We may not have added any roots to the new cache entry, which means + * none of the roots is part of the list of roots from which we are + * allowed to clone. Cache the new entry as it's still useful to avoid + * backref walking to determine which roots have a path to the leaf. + */ + + if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { + struct backref_cache_entry *lru_entry; + struct backref_cache_entry *mt_entry; + + lru_entry = list_first_entry(&sctx->backref_cache.lru_list, + struct backref_cache_entry, list); + mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); + ASSERT(mt_entry == lru_entry); + list_del(&mt_entry->list); + kfree(mt_entry); + sctx->backref_cache.size--; + } + + ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, + new_entry, GFP_NOFS); + ASSERT(ret == 0 || ret == -ENOMEM); + if (ret) { + /* Caching is optional, no worries. */ + kfree(new_entry); + return; + } + + list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); + + /* + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (sctx->backref_cache.size == 0) + sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; + + sctx->backref_cache.size++; +} + +static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *ctx) +{ + const u64 refs = btrfs_extent_refs(leaf, ei); + const struct backref_ctx *bctx = ctx; + const struct send_ctx *sctx = bctx->sctx; + + if (bytenr == bctx->bytenr) { + const u64 flags = btrfs_extent_flags(leaf, ei); + + if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return -EUCLEAN; + + /* + * If we have only one reference and only the send root as a + * clone source - meaning no clone roots were given in the + * struct btrfs_ioctl_send_args passed to the send ioctl - then + * it's our reference and there's no point in doing backref + * walking which is expensive, so exit early. + */ + if (refs == 1 && sctx->clone_roots_cnt == 1) + return -ENOENT; + } + + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fall back to write + * operations instead of clone operations when an extent has more than + * a certain number of references. 
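+ * The -ENOENT is treated by the caller as "no clone source found", so + * the extent data is sent as a regular write instead.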
+ */ + if (refs > SEND_MAX_EXTENT_REFS) + return -ENOENT; + + return 0; +} + +static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) +{ + const struct backref_ctx *bctx = ctx; + + if (ino == bctx->cur_objectid && + root == bctx->backref_owner && + offset == bctx->backref_offset) + return true; + + return false; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1332,79 +1596,36 @@ static int find_extent_clone(struct send_ctx *sctx, u64 logical; u64 disk_byte; u64 num_bytes; - u64 extent_item_pos; - u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx backref_ctx = {0}; + struct backref_ctx backref_ctx = { 0 }; + struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; - struct btrfs_key found_key; - struct btrfs_path *tmp_path; - struct btrfs_extent_item *ei; int compressed; u32 i; - tmp_path = alloc_path_for_send(); - if (!tmp_path) - return -ENOMEM; + /* + * With fallocate we can get prealloc extents beyond the inode's i_size, + * so we don't do anything here because clone operations can not clone + * to a range beyond i_size without increasing the i_size of the + * destination inode. + */ + if (data_offset >= ino_size) + return 0; - /* We only use this path under the commit sem */ - tmp_path->need_commit_sem = 0; + fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + return -ENOENT; - if (data_offset >= ino_size) { - /* - * There may be extents that lie behind the file's size. - * I at least had this in combination with snapshotting while - * writing large files. - */ - ret = 0; - goto out; - } + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == 0) + return -ENOENT; - fi = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -ENOENT; - goto out; - } compressed = btrfs_file_extent_compression(eb, fi); - num_bytes = btrfs_file_extent_num_bytes(eb, fi); - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == 0) { - ret = -ENOENT; - goto out; - } logical = disk_byte + btrfs_file_extent_offset(eb, fi); - down_read(&fs_info->commit_root_sem); - ret = extent_from_logical(fs_info, disk_byte, tmp_path, - &found_key, &flags); - up_read(&fs_info->commit_root_sem); - - if (ret < 0) - goto out; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = -EIO; - goto out; - } - - ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], - struct btrfs_extent_item); - /* - * Backreference walking (iterate_extent_inodes() below) is currently - * too expensive when an extent has a large number of references, both - * in time spent and used memory. So for now just fallback to write - * operations instead of clone operations when an extent has more than - * a certain amount of references. - */ - if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) { - ret = -ENOENT; - goto out; - } - btrfs_release_path(tmp_path); - /* * Setup the clone roots. 
*/ @@ -1412,37 +1633,59 @@ cur_clone_root = sctx->clone_roots + i; cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; - cur_clone_root->found_refs = 0; + cur_clone_root->num_bytes = 0; + cur_clone_root->found_ref = false; } backref_ctx.sctx = sctx; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; - backref_ctx.found_itself = 0; - backref_ctx.extent_len = num_bytes; + backref_ctx.bytenr = disk_byte; + /* + * Use the header owner and not the send root's id, because in case of a + * snapshot we can have shared subtrees. + */ + backref_ctx.backref_owner = btrfs_header_owner(eb); + backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. * We need to adjust extent_len in this case so that the checks in - * __iterate_backrefs work. + * iterate_backrefs() work. */ if (data_offset + num_bytes >= ino_size) backref_ctx.extent_len = ino_size - data_offset; + else + backref_ctx.extent_len = num_bytes; /* * Now collect all backrefs. */ + backref_walk_ctx.bytenr = disk_byte; if (compressed == BTRFS_COMPRESS_NONE) - extent_item_pos = logical - found_key.objectid; - else - extent_item_pos = 0; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, __iterate_backrefs, - &backref_ctx, false); + backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); + backref_walk_ctx.fs_info = fs_info; + backref_walk_ctx.cache_lookup = lookup_backref_cache; + backref_walk_ctx.cache_store = store_backref_cache; + backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.check_extent_item = check_extent_item; + backref_walk_ctx.user_ctx = &backref_ctx; + + /* + * If we have a single clone root, then it's the send root and we can tell + * the backref walking code to skip our own backref and not resolve it, + * since we cannot use it for cloning - the source and destination + * ranges can't overlap and in case the leaf is shared through a subtree + * due to snapshots, we can't use those other roots since they are not + * in the list of clone roots. + */ + if (sctx->clone_roots_cnt == 1) + backref_walk_ctx.skip_data_ref = skip_self_data_ref; + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, + &backref_ctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -1459,37 +1702,42 @@ static int find_extent_clone(struct send_ctx *sctx, * was already reallocated after the relocation. */ up_read(&fs_info->commit_root_sem); - ret = -ENOENT; - goto out; + return -ENOENT; } up_read(&fs_info->commit_root_sem); - if (!backref_ctx.found_itself) { - /* found a bug in backref code? */ - ret = -EIO; - btrfs_err(fs_info, - "did not find backref in send_root. 
inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu", - ino, data_offset, disk_byte, found_key.objectid); - goto out; - } - btrfs_debug(fs_info, "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", data_offset, ino, num_bytes, logical); - if (!backref_ctx.found) + if (!backref_ctx.found) { btrfs_debug(fs_info, "no clones found"); + return -ENOENT; + } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { - if (sctx->clone_roots[i].found_refs) { - if (!cur_clone_root) - cur_clone_root = sctx->clone_roots + i; - else if (sctx->clone_roots[i].root == sctx->send_root) - /* prefer clones from send_root over others */ - cur_clone_root = sctx->clone_roots + i; - } + struct clone_root *clone_root = &sctx->clone_roots[i]; + + if (!clone_root->found_ref) + continue; + + /* + * Choose the root from which we can clone more bytes, to + * minimize write operations and therefore have more extent + * sharing at the destination (the same as in the source). + */ + if (!cur_clone_root || + clone_root->num_bytes > cur_clone_root->num_bytes) { + cur_clone_root = clone_root; + /* + * We found an optimal clone candidate (any inode from + * any root is fine), so we're done. + */ + if (clone_root->num_bytes >= backref_ctx.extent_len) + break; + } } if (cur_clone_root) { @@ -1499,8 +1747,6 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -ENOENT; } -out: - btrfs_free_path(tmp_path); return ret; } @@ -1580,13 +1826,17 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { + struct fscrypt_str tmp_name; + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); ASSERT(len < sizeof(tmp)); + tmp_name.name = tmp; + tmp_name.len = strlen(tmp); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1606,7 +1856,7 @@ static int gen_unique_name(struct send_ctx *sctx, di = btrfs_lookup_dir_item(NULL, sctx->parent_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1643,21 +1893,22 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) int right_ret; u64 left_gen; u64 right_gen; + struct btrfs_inode_info info; - ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL, NULL, NULL); + ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) goto out; - left_ret = ret; + left_ret = (info.nlink == 0) ? -ENOENT : ret; + left_gen = info.gen; if (!sctx->parent_root) { right_ret = -ENOENT; } else { - ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL, NULL, NULL); + ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) goto out; - right_ret = ret; + right_ret = (info.nlink == 0) ? -ENOENT : ret; + right_gen = info.gen; } if (!left_ret && !right_ret) { @@ -1735,13 +1986,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root, struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_path *path; + struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, - dir, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -1816,8 +2067,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, btrfs_release_path(path); if (dir_gen) { - ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) goto out; } @@ -1874,6 +2124,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, int ret = 0; u64 gen; u64 other_inode = 0; + struct btrfs_inode_info info; if (!sctx->parent_root) goto out; @@ -1888,8 +2139,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * and we can just unlink this entry. */ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, dir, &gen); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1916,13 +2166,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, */ if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { - ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, who_mode, NULL, NULL, NULL, NULL); + ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) goto out; ret = 1; *who_ino = other_inode; + *who_gen = info.gen; + *who_mode = info.mode; } else { ret = 0; } @@ -1955,8 +2206,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, dir, &gen); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1978,8 +2228,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, ow_inode, &gen); if (ret < 0) goto out; @@ -2645,6 +2894,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) int ret = 0; struct fs_path *p; int cmd; + struct btrfs_inode_info info; u64 gen; u64 mode; u64 rdev; @@ -2656,10 +2906,12 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) return -ENOMEM; if (ino != sctx->cur_ino) { - ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, - NULL, NULL, &rdev, NULL); + ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0) goto out; + gen = info.gen; + mode = info.mode; + rdev = info.rdev; } else { gen = sctx->cur_inode_gen; mode = sctx->cur_inode_mode; @@ -3359,8 +3611,7 @@ finish: /* * The parent inode might have been deleted in the send snapshot */ - ret = get_inode_info(sctx->send_root, cur->dir, NULL, - NULL, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_info(sctx->send_root, cur->dir, NULL); if (ret == -ENOENT) { ret = 0; continue; @@ -3534,12 +3785,10 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, - &left_gen, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); if (ret < 0) goto out; - ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, - &right_gen, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -3669,8 +3918,7 @@ static int is_ancestor(struct btrfs_root *root, cur_offset = item_size; } - ret = get_inode_info(root, parent, NULL, &parent_gen, - NULL, 
NULL, NULL, NULL, NULL); + ret = get_inode_gen(root, parent, &parent_gen); if (ret < 0) goto out; ret = check_ino_in_path(root, ino1, ino1_gen, @@ -3760,9 +4008,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, memcmp(path_before->start, path_after->start, len1))) { u64 parent_ino_gen; - ret = get_inode_info(sctx->parent_root, ino, NULL, - &parent_ino_gen, NULL, NULL, NULL, - NULL, NULL); + ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen); if (ret < 0) goto out; if (ino_gen == parent_ino_gen) { @@ -4441,8 +4687,7 @@ static int record_new_ref_if_needed(int num, u64 dir, int index, struct recorded_ref *ref; u64 dir_gen; - ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) goto out; @@ -4472,8 +4717,7 @@ static int record_deleted_ref_if_needed(int num, u64 dir, int index, struct recorded_ref *ref; u64 dir_gen; - ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) goto out; @@ -4886,6 +5130,84 @@ static int process_all_new_xattrs(struct send_ctx *sctx) return ret; } +static int send_verity(struct send_ctx *sctx, struct fs_path *path, + struct fsverity_descriptor *desc) +{ + int ret; + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, + le8_to_cpu(desc->hash_algorithm)); + TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE, + 1U << le8_to_cpu(desc->log_blocksize)); + TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt, + le8_to_cpu(desc->salt_size)); + TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature, + le32_to_cpu(desc->sig_size)); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int process_verity(struct send_ctx *sctx) +{ + int ret = 0; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct inode *inode; + struct fs_path *p; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = btrfs_get_verity_descriptor(inode, NULL, 0); + if (ret < 0) + goto iput; + + if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + ret = -EMSGSIZE; + goto iput; + } + if (!sctx->verity_descriptor) { + sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE, + GFP_KERNEL); + if (!sctx->verity_descriptor) { + ret = -ENOMEM; + goto iput; + } + } + + ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + if (ret < 0) + goto iput; + + p = fs_path_alloc(); + if (!p) { + ret = -ENOMEM; + goto iput; + } + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto free_path; + + ret = send_verity(sctx, p, sctx->verity_descriptor); + if (ret < 0) + goto free_path; + +free_path: + fs_path_free(p); +iput: + iput(inode); + return ret; +} + static inline u64 max_send_read_size(const struct send_ctx *sctx) { return sctx->send_max_size - SZ_16K; @@ -5056,8 +5378,7 @@ static int send_clone(struct send_ctx *sctx, TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); if (clone_root->root == sctx->send_root) { - ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); @@ -5536,6 +5857,7 @@ static int clone_range(struct send_ctx *sctx, 
struct btrfs_path *dst_path, struct btrfs_path *path; struct btrfs_key key; int ret; + struct btrfs_inode_info info; u64 clone_src_i_size = 0; /* @@ -5565,12 +5887,11 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = __get_inode_info(clone_root->root, path, clone_root->ino, - &clone_src_i_size, NULL, NULL, NULL, NULL, NULL, - NULL); + ret = get_inode_info(clone_root->root, clone_root->ino, &info); btrfs_release_path(path); if (ret < 0) goto out; + clone_src_i_size = info.size; /* * We can't send a clone operation for the entire range if we find @@ -5615,6 +5936,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, u64 ext_len; u64 clone_len; u64 clone_data_offset; + bool crossed_src_i_size = false; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5672,8 +5994,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (key.offset >= clone_src_i_size) break; - if (key.offset + ext_len > clone_src_i_size) + if (key.offset + ext_len > clone_src_i_size) { ext_len = clone_src_i_size - key.offset; + crossed_src_i_size = true; + } clone_data_offset = btrfs_file_extent_offset(leaf, ei); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { @@ -5734,6 +6058,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, clone_len, clone_root); } + } else if (crossed_src_i_size && clone_len < len) { + /* + * If we are at i_size of the clone source inode and we + * can not clone from it, terminate the loop. This is + * to avoid sending two write operations, one with a + * length matching clone_len and the final one after + * this loop with a length of len - clone_len. + * + * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED + * was passed to the send ioctl), this helps avoid + * sending an encoded write for an offset that is not + * sector size aligned, in case the i_size of the source + * inode is not sector size aligned. That will make the + * receiver fall back to decompression of the data and + * writing it using regular buffered IO, therefore while + * not incorrect, it's not optimal due to decompression and + * possible re-compression at the receiver. 
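+ * + * For example (illustrative numbers), with a 4K sector size and a + * clone source whose i_size is 10000 bytes: cloning up to byte 10000 + * and then writing the rest would issue two operations, with the + * second starting at an offset that is not sector size aligned, + * while breaking out here sends the whole remaining range as one + * regular write.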
+ */ + break; } else { ret = send_extent_data(sctx, dst_path, offset, clone_len); @@ -6259,6 +6602,7 @@ out: static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) { int ret = 0; + struct btrfs_inode_info info; u64 left_mode; u64 left_uid; u64 left_gid; @@ -6301,11 +6645,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) goto out; - - ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid, NULL, &left_fileattr); + ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info); if (ret < 0) goto out; + left_mode = info.mode; + left_uid = info.uid; + left_gid = info.gid; + left_fileattr = info.fileattr; if (!sctx->parent_root || sctx->cur_inode_new) { need_chown = 1; @@ -6316,11 +6662,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } else { u64 old_size; - ret = get_inode_info(sctx->parent_root, sctx->cur_ino, - &old_size, NULL, &right_mode, &right_uid, - &right_gid, NULL, &right_fileattr); + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info); if (ret < 0) goto out; + old_size = info.size; + right_mode = info.mode; + right_uid = info.uid; + right_gid = info.gid; + right_fileattr = info.fileattr; if (left_uid != right_uid || left_gid != right_gid) need_chown = 1; @@ -6378,6 +6727,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; } + if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) + && sctx->cur_inode_needs_verity) { + ret = process_verity(sctx); + if (ret < 0) + goto out; + } + ret = send_capabilities(sctx); if (ret < 0) goto out; @@ -6407,86 +6763,6 @@ out: return ret; } -struct parent_paths_ctx { - struct list_head *refs; - struct send_ctx *sctx; -}; - -static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, - void *ctx) -{ - struct parent_paths_ctx *ppctx = ctx; - - /* - * Pass 0 as the generation for the directory, we don't care about it - * here as we have no new references to add, we just want to delete all - * references for an inode. - */ - return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs, - name, dir, 0, ppctx->sctx); -} - -/* - * Issue unlink operations for all paths of the current inode found in the - * parent snapshot. 
- */ -static int btrfs_unlink_all_paths(struct send_ctx *sctx) -{ - LIST_HEAD(deleted_refs); - struct btrfs_path *path; - struct btrfs_root *root = sctx->parent_root; - struct btrfs_key key; - struct btrfs_key found_key; - struct parent_paths_ctx ctx; - int iter_ret = 0; - int ret; - - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_REF_KEY; - key.offset = 0; - - ctx.refs = &deleted_refs; - ctx.sctx = sctx; - - btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { - if (found_key.objectid != key.objectid) - break; - if (found_key.type != key.type && - found_key.type != BTRFS_INODE_EXTREF_KEY) - break; - - ret = iterate_inode_ref(root, path, &found_key, 1, - record_parent_ref, &ctx); - if (ret < 0) - goto out; - } - /* Catch error found during iteration */ - if (iter_ret < 0) { - ret = iter_ret; - goto out; - } - - while (!list_empty(&deleted_refs)) { - struct recorded_ref *ref; - - ref = list_first_entry(&deleted_refs, struct recorded_ref, list); - ret = send_unlink(sctx, ref->full_path); - if (ret < 0) - goto out; - recorded_ref_free(ref); - } - ret = 0; -out: - btrfs_free_path(path); - if (ret) - __free_recorded_refs(&deleted_refs); - return ret; -} - static void close_current_inode(struct send_ctx *sctx) { u64 i_size; @@ -6577,25 +6853,36 @@ static int changed_inode(struct send_ctx *sctx, * file descriptor against it or turning a RO snapshot into RW mode, * keep an open file descriptor against a file, delete it and then * turn the snapshot back to RO mode before using it for a send - * operation. So if we find such cases, ignore the inode and all its - * items completely if it's a new inode, or if it's a changed inode - * make sure all its previous paths (from the parent snapshot) are all - * unlinked and all other the inode items are ignored. + * operation. The former is what the receiver operation does. + * Therefore, if we want to send these snapshots soon after they're + * received, we need to handle orphan inodes as well. Moreover, orphans + * can appear not only in the send snapshot but also in the parent + * snapshot. Here are several cases: + * + * Case 1: BTRFS_COMPARE_TREE_NEW + * | send snapshot | action + * -------------------------------- + * nlink | 0 | ignore + * + * Case 2: BTRFS_COMPARE_TREE_DELETED + * | parent snapshot | action + * ---------------------------------- + * nlink | 0 | as usual + * Note: No unlinks will be sent because there are no paths for it. 
+ * + * Case 3: BTRFS_COMPARE_TREE_CHANGED + * | | parent snapshot | send snapshot | action + * ----------------------------------------------------------------------- + * subcase 1 | nlink | 0 | 0 | ignore + * subcase 2 | nlink | >0 | 0 | new_gen(deletion) + * subcase 3 | nlink | 0 | >0 | new_gen(creation) + * */ - if (result == BTRFS_COMPARE_TREE_NEW || - result == BTRFS_COMPARE_TREE_CHANGED) { - u32 nlinks; - - nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); - if (nlinks == 0) { + if (result == BTRFS_COMPARE_TREE_NEW) { + if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { sctx->ignore_cur_inode = true; - if (result == BTRFS_COMPARE_TREE_CHANGED) - ret = btrfs_unlink_all_paths(sctx); goto out; } - } - - if (result == BTRFS_COMPARE_TREE_NEW) { sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = true; sctx->cur_inode_deleted = false; @@ -6616,6 +6903,16 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_mode = btrfs_inode_mode( sctx->right_path->nodes[0], right_ii); } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + u32 new_nlinks, old_nlinks; + + new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); + old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); + if (new_nlinks == 0 && old_nlinks == 0) { + sctx->ignore_cur_inode = true; + goto out; + } else if (new_nlinks == 0 || old_nlinks == 0) { + sctx->cur_inode_new_gen = 1; + } /* * We need to do some special handling in case the inode was * reported as changed with a changed generation number. This @@ -6627,53 +6924,61 @@ static int changed_inode(struct send_ctx *sctx, /* * First, process the inode as if it was deleted. */ - sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = false; - sctx->cur_inode_deleted = true; - sctx->cur_inode_size = btrfs_inode_size( - sctx->right_path->nodes[0], right_ii); - sctx->cur_inode_mode = btrfs_inode_mode( - sctx->right_path->nodes[0], right_ii); - ret = process_all_refs(sctx, - BTRFS_COMPARE_TREE_DELETED); - if (ret < 0) - goto out; + if (old_nlinks > 0) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + } /* * Now process the inode as if it was new. */ - sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = true; - sctx->cur_inode_deleted = false; - sctx->cur_inode_size = btrfs_inode_size( - sctx->left_path->nodes[0], left_ii); - sctx->cur_inode_mode = btrfs_inode_mode( - sctx->left_path->nodes[0], left_ii); - sctx->cur_inode_rdev = btrfs_inode_rdev( - sctx->left_path->nodes[0], left_ii); - ret = send_create_inode_if_needed(sctx); - if (ret < 0) - goto out; + if (new_nlinks > 0) { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], + left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], + left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], + left_ii); + ret = send_create_inode_if_needed(sctx); + if (ret < 0) + goto out; - ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); - if (ret < 0) - goto out; - /* - * Advance send_progress now as we did not get into - * process_recorded_refs_if_needed in the new_gen case. 
- */ - sctx->send_progress = sctx->cur_ino + 1; + ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); + if (ret < 0) + goto out; + /* + * Advance send_progress now as we did not get + * into process_recorded_refs_if_needed in the + * new_gen case. + */ + sctx->send_progress = sctx->cur_ino + 1; - /* - * Now process all extents and xattrs of the inode as if - * they were all new. - */ - ret = process_all_extents(sctx); - if (ret < 0) - goto out; - ret = process_all_new_xattrs(sctx); - if (ret < 0) - goto out; + /* + * Now process all extents and xattrs of the + * inode as if they were all new. + */ + ret = process_all_extents(sctx); + if (ret < 0) + goto out; + ret = process_all_new_xattrs(sctx); + if (ret < 0) + goto out; + } } else { sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = false; @@ -6785,18 +7090,27 @@ static int changed_extent(struct send_ctx *sctx, return ret; } +static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) +{ + int ret = 0; + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result == BTRFS_COMPARE_TREE_NEW) + sctx->cur_inode_needs_verity = true; + } + return ret; +} + static int dir_changed(struct send_ctx *sctx, u64 dir) { u64 orig_gen, new_gen; int ret; - ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, - NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, dir, &new_gen); if (ret) return ret; - ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, dir, &orig_gen); if (ret) return ret; @@ -6939,6 +7253,9 @@ static int changed_cb(struct btrfs_path *left_path, ret = changed_xattr(sctx, result); else if (key->type == BTRFS_EXTENT_DATA_KEY) ret = changed_extent(sctx, result); + else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY && + key->offset == 0) + ret = changed_verity(sctx, result); } out: @@ -7780,6 +8097,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mt_init(&sctx->backref_cache.entries); + sctx->flags = arg->flags; if (arg->flags & BTRFS_SEND_FLAG_VERSION) { @@ -7818,7 +8138,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (sctx->proto >= 2) { u32 send_buf_num_pages; - sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2; sctx->send_buf = vmalloc(sctx->send_max_size); if (!sctx->send_buf) { ret = -ENOMEM; @@ -8036,11 +8356,14 @@ out: kvfree(sctx->clone_roots); kfree(sctx->send_buf_pages); kvfree(sctx->send_buf); + kvfree(sctx->verity_descriptor); name_cache_free(sctx); close_current_inode(sctx); + empty_backref_cache(sctx); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 4bb4e6a638cb..4f5509cb1803 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -10,13 +10,20 @@ #include <linux/types.h> #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" +/* Conditional support for the upcoming protocol version. */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_SEND_STREAM_VERSION 3 +#else #define BTRFS_SEND_STREAM_VERSION 2 +#endif /* - * In send stream v1, no command is larger than 64K. In send stream v2, no limit - * should be assumed. + * In send stream v1, no command is larger than 64K. In send stream v2, no + * limit should be assumed, the buffer size is set to be a header with + * compressed extent size. 
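+ * (For example, with BTRFS_MAX_COMPRESSED of 128K and 4K pages, the v2 + * buffer size below works out to ALIGN(16K + 128K, 4K) = 144K.)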
*/ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K +#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) struct inode; struct btrfs_ioctl_send_args; @@ -92,8 +99,11 @@ enum btrfs_send_cmd { BTRFS_SEND_C_ENCODED_WRITE = 25, BTRFS_SEND_C_MAX_V2 = 25, + /* Version 3 */ + BTRFS_SEND_C_ENABLE_VERITY = 26, + BTRFS_SEND_C_MAX_V3 = 26, /* End */ - BTRFS_SEND_C_MAX = 25, + BTRFS_SEND_C_MAX = 26, }; /* attributes in send stream */ @@ -160,8 +170,14 @@ enum { BTRFS_SEND_A_ENCRYPTION = 31, BTRFS_SEND_A_MAX_V2 = 31, - /* End */ - BTRFS_SEND_A_MAX = 31, + /* Version 3 */ + BTRFS_SEND_A_VERITY_ALGORITHM = 32, + BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33, + BTRFS_SEND_A_VERITY_SALT_DATA = 34, + BTRFS_SEND_A_VERITY_SIG_DATA = 35, + BTRFS_SEND_A_MAX_V3 = 35, + + __BTRFS_SEND_A_MAX = 35, }; long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 435559ba94fa..d28ee4e36f3d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,9 @@ #include "transaction.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" /* * HOW DOES SPACE RESERVATION WORK @@ -293,32 +296,36 @@ out: return ret; } -void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, u64 bytes_zone_unusable, - bool active, struct btrfs_space_info **space_info) +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group) { struct btrfs_space_info *found; - int factor; + int factor, index; - factor = btrfs_bg_type_to_factor(flags); + factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, flags); + found = btrfs_find_space_info(info, block_group->flags); ASSERT(found); spin_lock(&found->lock); - found->total_bytes += total_bytes; - if (active) - found->active_total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - found->bytes_zone_unusable += bytes_zone_unusable; - if (total_bytes > 0) + found->total_bytes += block_group->length; + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) + found->active_total_bytes += block_group->length; + found->disk_total += block_group->length * factor; + found->bytes_used += block_group->used; + found->disk_used += block_group->used * factor; + found->bytes_readonly += block_group->bytes_super; + found->bytes_zone_unusable += block_group->zone_unusable; + if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); spin_unlock(&found->lock); - *space_info = found; + + block_group->space_info = found; + + index = btrfs_bg_flags_to_raid_index(block_group->flags); + down_write(&found->groups_sem); + list_add_tail(&block_group->list, &found->block_groups[index]); + up_write(&found->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -472,28 +479,47 @@ do { \ spin_unlock(&__rsv->lock); \ } while (0) +static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) +{ + switch (space_info->flags) { + case BTRFS_BLOCK_GROUP_SYSTEM: + return "SYSTEM"; + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "DATA+METADATA"; + case BTRFS_BLOCK_GROUP_DATA: + return "DATA"; + case BTRFS_BLOCK_GROUP_METADATA: + return "METADATA"; + default: + return 
"UNKNOWN"; + } +} + +static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); +} + static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info) { + const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull", - info->flags, + btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", + flag_str, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", +"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, info->bytes_readonly, info->bytes_zone_unusable); - - DUMP_BLOCK_RSV(fs_info, global_block_rsv); - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); - } void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -505,6 +531,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, spin_lock(&info->lock); __btrfs_dump_space_info(fs_info, info); + dump_global_block_rsv(fs_info); spin_unlock(&info->lock); if (!dump_block_groups) @@ -832,7 +859,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 thresh; u64 used; - thresh = div_factor_fine(total, 90); + thresh = mult_perc(total, 90); lockdep_assert_held(&space_info->lock); @@ -950,7 +977,7 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, return false; spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, 1); + min_bytes = mult_perc(global_rsv->size, 10); if (global_rsv->reserved < min_bytes + ticket->bytes) { spin_unlock(&global_rsv->lock); return false; @@ -1466,8 +1493,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -/** - * Do the appropriate flushing and waiting for a ticket +/* + * Do the appropriate flushing and waiting for a ticket. * * @fs_info: the filesystem * @space_info: space info for the reservation @@ -1559,8 +1586,18 @@ static inline bool can_steal(enum btrfs_reserve_flush_enum flush) flush == BTRFS_RESERVE_FLUSH_EVICT); } -/** - * Try to reserve bytes from the block_rsv's space +/* + * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to + * fail as quickly as possible. + */ +static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) +{ + return (flush != BTRFS_RESERVE_NO_FLUSH && + flush != BTRFS_RESERVE_FLUSH_EMERGENCY); +} + +/* + * Try to reserve bytes from the block_rsv's space. * * @fs_info: the filesystem * @space_info: space info we want to allocate from @@ -1621,13 +1658,28 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } /* + * Things are dire, we need to make a reservation so we don't abort. We + * will let this reservation go through as long as we have actual space + * left to allocate for the block. 
+ */ + if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { + used = btrfs_space_info_used(space_info, false); + if (used + orig_bytes <= + writable_total_bytes(fs_info, space_info)) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; + } + } + + /* * If we couldn't make a reservation then setup our reservation ticket * and kick the async worker if it's not already running. * * If we are a priority flusher then we just need to add our ticket to * the list and we will do our own flushing further down. */ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + if (ret && can_ticket(flush)) { ticket.bytes = orig_bytes; ticket.error = 0; space_info->reclaim_size += ticket.bytes; @@ -1662,7 +1714,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, &space_info->priority_tickets); } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; /* * We will do the space reservation dance during log replay, * which means we won't have fs_info->fs_root set, so don't do * @@ -1678,15 +1729,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + if (!ret || !can_ticket(flush)) return ret; return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, orig_bytes, flush); } -/** - * Trye to reserve metadata bytes from the block_rsv's space +/* + * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for @@ -1720,8 +1771,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, return ret; } -/** - * Try to reserve data bytes for an allocation +/* + * Try to reserve data bytes for an allocation. * * @fs_info: the filesystem * @bytes: number of bytes we need @@ -1737,7 +1788,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || - flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || + flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); @@ -1749,3 +1801,51 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, } return ret; } + +/* Dump all the space infos when we abort a transaction due to ENOSPC. */ +__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + btrfs_info(fs_info, "dumping space info:"); + list_for_each_entry(space_info, &fs_info->space_info, list) { + spin_lock(&space_info->lock); + __btrfs_dump_space_info(fs_info, space_info); + spin_unlock(&space_info->lock); + } + dump_global_block_rsv(fs_info); +} + +/* + * Account the unused space of all the readonly block groups in the space_info. + * Takes mirrors into account. 
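+ * + * For example, a read-only RAID1 block group of length 1G with 256M + * used has a factor of 2, so it contributes (1G - 256M) * 2 = 1.5G.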
+ */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + struct btrfs_block_group *block_group; + u64 free_bytes = 0; + int factor; + + /* It's df, we don't care if it's racy */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + factor = btrfs_bg_type_to_factor(block_group->flags); + free_bytes += (block_group->length - + block_group->used) * factor; + + spin_unlock(&block_group->lock); + } + spin_unlock(&sinfo->lock); + + return free_bytes; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 12fd6147f92d..fc99ea2b0c34 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -5,6 +5,83 @@ #include "volumes.h" +/* + * Different levels for flushing space when doing space reservations. + * + * The higher the level, the more methods we try to reclaim space. + */ +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything. */ + BTRFS_RESERVE_NO_FLUSH, + + /* + * Flush space by: + * - Running delayed inode items + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_LIMIT, + + /* + * Flush space by: + * - Running delayed inode items + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_EVICT, + + /* + * Flush space by above mentioned methods and by: + * - Running delayed iputs + * - Committing transaction + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, + BTRFS_RESERVE_FLUSH_ALL, + + /* + * Pretty much the same as FLUSH_ALL, but can also steal space from + * global rsv. + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_ALL_STEAL, + + /* + * This is for btrfs_use_block_rsv only. We have exhausted our block + * rsv and our global block rsv. This can happen for things like + * delalloc where we are overwriting a lot of extents with a single + * extent and didn't reserve enough space. Alternatively it can happen + * with delalloc where we reserve one extent's worth for a large extent but + * fragmentation leads to multiple extents being created. This will + * give us the reservation in the case of + * + * if (num_bytes < (space_info->total_bytes - + * btrfs_space_info_used(space_info, false))) + * + * Which ignores bytes_may_use. This is potentially dangerous, but our + * reservation system is generally pessimistic so it is able to absorb this + * style of mistake. 
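+ * + * For example (illustrative numbers): with total_bytes of 10G and 9G + * used when bytes_may_use is not counted, an emergency reservation of + * 512M still succeeds even if outstanding reservations have already + * overcommitted the remainder.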
+ */ + BTRFS_RESERVE_FLUSH_EMERGENCY, +}; + +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + FLUSH_DELALLOC_FULL = 7, + ALLOC_CHUNK = 8, + ALLOC_CHUNK_FORCE = 9, + RUN_DELAYED_IPUTS = 10, + COMMIT_TRANS = 11, +}; + struct btrfs_space_info { spinlock_t lock; @@ -123,10 +200,8 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); -void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, u64 bytes_zone_unusable, - bool active, struct btrfs_space_info **space_info); +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group); void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -159,4 +234,8 @@ static inline void btrfs_space_info_free_bytes_may_use( } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); +void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 6fc2b77ae5c3..dd46b978ac2c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> +#include "messages.h" #include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" @@ -337,7 +338,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * * Even with 0 returned, the page still need extra check to make sure * it's really the correct page, as the caller is using - * find_get_pages_contig(), which can race with page invalidating. + * filemap_get_folios_contig(), which can race with page invalidating. */ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f89beac3c665..93f52ee85f6f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/btrfs.h> +#include "messages.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" @@ -34,7 +35,7 @@ #include "print-tree.h" #include "props.h" #include "xattr.h" -#include "volumes.h" +#include "bio.h" #include "export.h" #include "compression.h" #include "rcu-string.h" @@ -49,6 +50,14 @@ #include "discard.h" #include "qgroup.h" #include "raid56.h" +#include "fs.h" +#include "accessors.h" +#include "defrag.h" +#include "dir-item.h" +#include "ioctl.h" +#include "scrub.h" +#include "verity.h" +#include "super.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -67,326 +76,6 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -#ifdef CONFIG_PRINTK - -#define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) - -/* - * Characters to print to indicate error conditions or uncommon filesystem state. - * RO is not an error. 
- */ -static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', - [BTRFS_FS_STATE_REMOUNTING] = 'M', - [BTRFS_FS_STATE_RO] = 0, - [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', - [BTRFS_FS_STATE_DEV_REPLACING] = 'R', - [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, - [BTRFS_FS_STATE_NO_CSUMS] = 'C', - [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', -}; - -static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) -{ - unsigned int bit; - bool states_printed = false; - unsigned long fs_state = READ_ONCE(info->fs_state); - char *curr = buf; - - memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); - curr += sizeof(STATE_STRING_PREFACE) - 1; - - for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { - WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); - if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { - *curr++ = fs_state_chars[bit]; - states_printed = true; - } - } - - /* If no states were printed, reset the buffer */ - if (!states_printed) - curr = buf; - - *curr++ = 0; -} -#endif - -/* - * Generally the error codes correspond to their respective errors, but there - * are a few special cases. - * - * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for - * instance will return EUCLEAN if any of the blocks are corrupted in - * a way that is problematic. We want to reserve EUCLEAN for these - * sort of corruptions. - * - * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we - * need to use EROFS for this case. We will have no idea of the - * original failure, that will have been reported at the time we tripped - * over the error. Each subsequent error that doesn't have any context - * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. - */ -const char * __attribute_const__ btrfs_decode_error(int errno) -{ - char *errstr = "unknown"; - - switch (errno) { - case -ENOENT: /* -2 */ - errstr = "No such entry"; - break; - case -EIO: /* -5 */ - errstr = "IO failure"; - break; - case -ENOMEM: /* -12*/ - errstr = "Out of memory"; - break; - case -EEXIST: /* -17 */ - errstr = "Object already exists"; - break; - case -ENOSPC: /* -28 */ - errstr = "No space left"; - break; - case -EROFS: /* -30 */ - errstr = "Readonly filesystem"; - break; - case -EOPNOTSUPP: /* -95 */ - errstr = "Operation not supported"; - break; - case -EUCLEAN: /* -117 */ - errstr = "Filesystem corrupted"; - break; - case -EDQUOT: /* -122 */ - errstr = "Quota exceeded"; - break; - } - - return errstr; -} - -/* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. - */ -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; -#ifdef CONFIG_PRINTK - char statestr[STATE_STRING_BUF_LEN]; - const char *errstr; -#endif - - /* - * Special case: if the error is EROFS, and we're already - * under SB_RDONLY, then it is safe here. 
- */ - if (errno == -EROFS && sb_rdonly(sb)) - return; - -#ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); - btrfs_state_to_string(fs_info, statestr); - if (fmt) { - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); - va_end(args); - } else { - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); - } -#endif - - /* - * Today we only save the error info to memory. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); - - /* Don't go through full error handling during mount */ - if (!(sb->s_flags & SB_BORN)) - return; - - if (sb_rdonly(sb)) - return; - - btrfs_discard_stop(fs_info); - - /* btrfs handle error by forcing the filesystem readonly */ - btrfs_set_sb_rdonly(sb); - btrfs_info(fs_info, "forced readonly"); - /* - * Note that a running device replace operation is not canceled here - * although there is no way to update the progress. It would add the - * risk of a deadlock, therefore the canceling is omitted. The only - * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writable - * again, the device replace operation continues. - */ -} - -#ifdef CONFIG_PRINTK -static const char * const logtypes[] = { - "emergency", - "alert", - "critical", - "error", - "warning", - "notice", - "info", - "debug", -}; - - -/* - * Use one ratelimit state per log level so that a flood of less important - * messages doesn't cause more important ones to be dropped. - */ -static struct ratelimit_state printk_limits[] = { - RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), -}; - -void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
-{ - char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; - struct va_format vaf; - va_list args; - int kern_level; - const char *type = logtypes[4]; - struct ratelimit_state *ratelimit = &printk_limits[4]; - - va_start(args, fmt); - - while ((kern_level = printk_get_level(fmt)) != 0) { - size_t size = printk_skip_level(fmt) - fmt; - - if (kern_level >= '0' && kern_level <= '7') { - memcpy(lvl, fmt, size); - lvl[size] = '\0'; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } - fmt += size; - } - - vaf.fmt = fmt; - vaf.va = &args; - - if (__ratelimit(ratelimit)) { - if (fs_info) { - char statestr[STATE_STRING_BUF_LEN]; - - btrfs_state_to_string(fs_info, statestr); - _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, - fs_info->sb->s_id, statestr, &vaf); - } else { - _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); - } - } - - va_end(args); -} -#endif - -#if BITS_PER_LONG == 32 -void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { - btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); - btrfs_warn(fs_info, -"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_warn(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} - -void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { - btrfs_err(fs_info, "reached 32bit limit for logical addresses"); - btrfs_err(fs_info, -"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_err(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} -#endif - -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} -/* - * __btrfs_panic decodes unexpected, fatal errors from the caller, - * issues an alert, and either panics or BUGs, depending on mount options. - */ -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) 
-{ - char *s_id = "<unknown>"; - const char *errstr; - struct va_format vaf = { .fmt = fmt }; - va_list args; - - if (fs_info) - s_id = fs_info->sb->s_id; - - va_start(args, fmt); - vaf.va = &args; - - errstr = btrfs_decode_error(errno); - if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); - - btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); - va_end(args); - /* Caller calls BUG() */ -} - static void btrfs_put_super(struct super_block *sb) { close_ctree(btrfs_sb(sb)); @@ -626,6 +315,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, int saved_compress_level; bool saved_compress_force; int no_compress = 0; + const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state); if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); @@ -915,12 +605,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, ret = -EINVAL; goto out; } + btrfs_clear_opt(info->mount_opt, NODISCARD); break; case Opt_nodiscard: btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); btrfs_clear_and_info(info, DISCARD_ASYNC, "turning off async discard"); + btrfs_set_opt(info->mount_opt, NODISCARD); break; case Opt_space_cache: case Opt_space_cache_version: @@ -1137,10 +829,12 @@ out: } if (!ret) ret = btrfs_check_mountopts_zoned(info); - if (!ret && btrfs_test_opt(info, SPACE_CACHE)) - btrfs_info(info, "disk space caching is enabled"); - if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) - btrfs_info(info, "using free space tree"); + if (!ret && !remounting) { + if (btrfs_test_opt(info, SPACE_CACHE)) + btrfs_info(info, "disk space caching is enabled"); + if (btrfs_test_opt(info, FREE_SPACE_TREE)) + btrfs_info(info, "using free space tree"); + } return ret; } @@ -1389,6 +1083,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_key location; + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); @@ -1401,7 +1096,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * to mount. */ dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { btrfs_free_path(path); return PTR_ERR(di); @@ -1502,7 +1197,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * Exit unless we have some pending changes * that need to go through commit */ - if (fs_info->pending_changes == 0) + if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, + &fs_info->flags)) return 0; /* * A non-blocking test if the fs is frozen. We must not @@ -2009,14 +1705,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; - /* V1 cache is not supported for subpage mount. 
*/ - if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { - btrfs_warn(fs_info, - "v1 space cache is not supported for page size %lu with sectorsize %u", - PAGE_SIZE, fs_info->sectorsize); - ret = -EINVAL; + ret = btrfs_check_features(fs_info, sb); + if (ret < 0) goto restore; - } + btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); @@ -2550,11 +2242,87 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans); } +static int check_dev_super(struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_super_block *sb; + u16 csum_type; + int ret = 0; + + /* This should be called with fs still frozen. */ + ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags)); + + /* Missing dev, no need to check. */ + if (!dev->bdev) + return 0; + + /* Only need to check the primary super block. */ + sb = btrfs_read_dev_one_super(dev->bdev, 0, true); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + /* Verify the checksum. */ + csum_type = btrfs_super_csum_type(sb); + if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + btrfs_err(fs_info, "csum type changed, has %u expect %u", + csum_type, btrfs_super_csum_type(fs_info->super_copy)); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_check_super_csum(fs_info, sb)) { + btrfs_err(fs_info, "csum for on-disk super block no longer matches"); + ret = -EUCLEAN; + goto out; + } + + /* btrfs_validate_super() includes fsid check against super->fsid. */ + ret = btrfs_validate_super(fs_info, sb, 0); + if (ret < 0) + goto out; + + if (btrfs_super_generation(sb) != fs_info->last_trans_committed) { + btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", + btrfs_super_generation(sb), + fs_info->last_trans_committed); + ret = -EUCLEAN; + goto out; + } +out: + btrfs_release_disk_super(sb); + return ret; +} + static int btrfs_unfreeze(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + int ret = 0; + /* + * Make sure the fs is not changed by accident (like hibernation and + * then modification by another OS). + * If we find anything wrong, we put the fs into error state + * immediately. + * + * And since the fs is frozen, no one can modify the fs yet, thus + * we don't need to hold device_list_mutex. + */ + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + ret = check_dev_super(device); + if (ret < 0) { + btrfs_handle_fs_error(fs_info, ret, + "super block on devid %llu got modified unexpectedly", + device->devid); + break; + } + } clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); + + /* + * We still return 0, to allow the VFS layer to unfreeze the fs even if + * the above checks failed. Since the fs is either fine or read-only, + * we're safe to continue without causing further damage. + */ return 0; } @@ -2568,7 +2336,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * the end of RCU grace period. 
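 * + * btrfs_dev_name() below is expected to do the rcu_str_deref() of the + * device name and to cope with a device whose name is missing.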
*/ rcu_read_lock(); - seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\"); + seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\"); rcu_read_unlock(); return 0; @@ -2617,7 +2385,7 @@ static __cold void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void __init btrfs_print_mod_info(void) +static int __init btrfs_print_mod_info(void) { static const char options[] = "" #ifdef CONFIG_BTRFS_DEBUG @@ -2644,115 +2412,125 @@ static void __init btrfs_print_mod_info(void) #endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); + return 0; } -static int __init init_btrfs_fs(void) +static int register_btrfs(void) { - int err; - - btrfs_props_init(); - - err = btrfs_init_sysfs(); - if (err) - return err; - - btrfs_init_compress(); - - err = btrfs_init_cachep(); - if (err) - goto free_compress; - - err = extent_io_init(); - if (err) - goto free_cachep; - - err = extent_state_cache_init(); - if (err) - goto free_extent_io; - - err = extent_map_init(); - if (err) - goto free_extent_state_cache; - - err = ordered_data_init(); - if (err) - goto free_extent_map; - - err = btrfs_delayed_inode_init(); - if (err) - goto free_ordered_data; - - err = btrfs_auto_defrag_init(); - if (err) - goto free_delayed_inode; - - err = btrfs_delayed_ref_init(); - if (err) - goto free_auto_defrag; - - err = btrfs_prelim_ref_init(); - if (err) - goto free_delayed_ref; - - err = btrfs_interface_init(); - if (err) - goto free_prelim_ref; + return register_filesystem(&btrfs_fs_type); +} - btrfs_print_mod_info(); +static void unregister_btrfs(void) +{ + unregister_filesystem(&btrfs_fs_type); +} - err = btrfs_run_sanity_tests(); - if (err) - goto unregister_ioctl; +/* Helper structure for long init/exit functions. */ +struct init_sequence { + int (*init_func)(void); + /* Can be NULL if the init_func doesn't need cleanup. 
*/ + void (*exit_func)(void); +}; - err = register_filesystem(&btrfs_fs_type); - if (err) - goto unregister_ioctl; +static const struct init_sequence mod_init_seq[] = { + { + .init_func = btrfs_props_init, + .exit_func = NULL, + }, { + .init_func = btrfs_init_sysfs, + .exit_func = btrfs_exit_sysfs, + }, { + .init_func = btrfs_init_compress, + .exit_func = btrfs_exit_compress, + }, { + .init_func = btrfs_init_cachep, + .exit_func = btrfs_destroy_cachep, + }, { + .init_func = btrfs_transaction_init, + .exit_func = btrfs_transaction_exit, + }, { + .init_func = btrfs_ctree_init, + .exit_func = btrfs_ctree_exit, + }, { + .init_func = btrfs_free_space_init, + .exit_func = btrfs_free_space_exit, + }, { + .init_func = extent_state_init_cachep, + .exit_func = extent_state_free_cachep, + }, { + .init_func = extent_buffer_init_cachep, + .exit_func = extent_buffer_free_cachep, + }, { + .init_func = btrfs_bioset_init, + .exit_func = btrfs_bioset_exit, + }, { + .init_func = extent_map_init, + .exit_func = extent_map_exit, + }, { + .init_func = ordered_data_init, + .exit_func = ordered_data_exit, + }, { + .init_func = btrfs_delayed_inode_init, + .exit_func = btrfs_delayed_inode_exit, + }, { + .init_func = btrfs_auto_defrag_init, + .exit_func = btrfs_auto_defrag_exit, + }, { + .init_func = btrfs_delayed_ref_init, + .exit_func = btrfs_delayed_ref_exit, + }, { + .init_func = btrfs_prelim_ref_init, + .exit_func = btrfs_prelim_ref_exit, + }, { + .init_func = btrfs_interface_init, + .exit_func = btrfs_interface_exit, + }, { + .init_func = btrfs_print_mod_info, + .exit_func = NULL, + }, { + .init_func = btrfs_run_sanity_tests, + .exit_func = NULL, + }, { + .init_func = register_btrfs, + .exit_func = unregister_btrfs, + } +}; - return 0; +static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; -unregister_ioctl: - btrfs_interface_exit(); -free_prelim_ref: - btrfs_prelim_ref_exit(); -free_delayed_ref: - btrfs_delayed_ref_exit(); -free_auto_defrag: - btrfs_auto_defrag_exit(); -free_delayed_inode: - btrfs_delayed_inode_exit(); -free_ordered_data: - ordered_data_exit(); -free_extent_map: - extent_map_exit(); -free_extent_state_cache: - extent_state_cache_exit(); -free_extent_io: - extent_io_exit(); -free_cachep: - btrfs_destroy_cachep(); -free_compress: - btrfs_exit_compress(); - btrfs_exit_sysfs(); +static __always_inline void btrfs_exit_btrfs_fs(void) +{ + int i; - return err; + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } } static void __exit exit_btrfs_fs(void) { - btrfs_destroy_cachep(); - btrfs_delayed_ref_exit(); - btrfs_auto_defrag_exit(); - btrfs_delayed_inode_exit(); - btrfs_prelim_ref_exit(); - ordered_data_exit(); - extent_map_exit(); - extent_state_cache_exit(); - extent_io_exit(); - btrfs_interface_exit(); - unregister_filesystem(&btrfs_fs_type); - btrfs_exit_sysfs(); - btrfs_cleanup_fs_uuids(); - btrfs_exit_compress(); + btrfs_exit_btrfs_fs(); +} + +static int __init init_btrfs_fs(void) +{ + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { + ASSERT(!mod_init_result[i]); + ret = mod_init_seq[i].init_func(); + if (ret < 0) { + btrfs_exit_btrfs_fs(); + return ret; + } + mod_init_result[i] = true; + } + return 0; } late_initcall(init_btrfs_fs); diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h new file mode 100644 index 000000000000..8dbb909b364f --- /dev/null +++ b/fs/btrfs/super.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + 
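The mod_init_seq[] conversion above replaces a ladder of goto labels with a table that is walked forwards on init and backwards on teardown; this is also why helpers such as btrfs_print_mod_info() were changed to return int, so every entry has the same signature. A minimal standalone C sketch of the pattern, with stub subsystems standing in for the btrfs ones:

#include <stdbool.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static int init_a(void) { puts("init a"); return 0; }
static void exit_a(void) { puts("exit a"); }
static int init_b(void) { puts("init b"); return 0; }
static void exit_b(void) { puts("exit b"); }
static int init_c(void) { puts("init c"); return -1; } /* simulated failure */

struct init_sequence {
	int (*init_func)(void);
	void (*exit_func)(void);	/* NULL if no cleanup is needed */
};

static const struct init_sequence seq[] = {
	{ init_a, exit_a },
	{ init_b, exit_b },
	{ init_c, NULL },
};

static bool done[ARRAY_SIZE(seq)];

/* Tear down only what was set up, in reverse order. */
static void do_exit(void)
{
	for (int i = ARRAY_SIZE(seq) - 1; i >= 0; i--) {
		if (!done[i])
			continue;
		if (seq[i].exit_func)
			seq[i].exit_func();
		done[i] = false;
	}
}

static int do_init(void)
{
	for (size_t i = 0; i < ARRAY_SIZE(seq); i++) {
		int ret = seq[i].init_func();

		if (ret < 0) {
			do_exit();	/* unwind the successful steps */
			return ret;
		}
		done[i] = true;
	}
	return 0;
}

int main(void)
{
	return do_init() ? 1 : 0;
}

The failure of init_c unwinds exit_b and then exit_a, exactly the ordering the old goto ladder encoded by hand.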
+#ifndef BTRFS_SUPER_H +#define BTRFS_SUPER_H + +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, + unsigned long new_flags); +int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); + +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +#endif diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d5d0717fd09a..45615ce36498 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -10,7 +10,7 @@ #include <linux/completion.h> #include <linux/bug.h> #include <crypto/hash.h> - +#include "messages.h" #include "ctree.h" #include "discard.h" #include "disk-io.h" @@ -22,6 +22,8 @@ #include "block-group.h" #include "qgroup.h" #include "misc.h" +#include "fs.h" +#include "accessors.h" /* * Structure name Path @@ -35,12 +37,12 @@ * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid> * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type> * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile> + * discard_attrs /sys/fs/btrfs/<uuid>/discard * * When built with BTRFS_CONFIG_DEBUG: * * btrfs_debug_feature_attrs /sys/fs/btrfs/debug * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug - * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard */ struct btrfs_feature_attr { @@ -248,7 +250,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return count; @@ -286,6 +288,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); +BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); @@ -317,6 +320,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), + BTRFS_FEAT_ATTR_PTR(block_group_tree), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif @@ -429,12 +433,10 @@ static const struct attribute_group btrfs_static_feature_attr_group = { .attrs = btrfs_supported_static_feature_attrs, }; -#ifdef CONFIG_BTRFS_DEBUG - /* * Discard statistics and tunables */ -#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent) +#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj)) static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, struct kobj_attribute *a, @@ -583,11 +585,11 @@ BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, btrfs_discard_max_discard_size_store); /* - * Per-filesystem debugging of discard (when mounted with discard=async). + * Per-filesystem stats for discard (when mounted with discard=async). 
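With the discard directory promoted out of the debug hierarchy, those statistics become visible on production kernels whenever the filesystem is mounted with discard=async. A small userspace C sketch that reads one of them; the path layout follows the discard_attrs entry in the structure table above, and the UUID argument is whatever identifies the mounted filesystem under /sys/fs/btrfs (this is an illustrative reader, not part of the patch):

#include <stdio.h>

/* Print a single discard statistic for the filesystem with the given UUID.
 * Attribute names come from the discard_attrs[] table in sysfs.c. */
static int print_discard_stat(const char *uuid, const char *attr)
{
	char path[256];
	char buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/btrfs/%s/discard/%s", uuid, attr);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return -1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", attr, buf);	/* sysfs value ends in '\n' */
	fclose(f);
	return 0;
}

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <fs-uuid>\n", argv[0]);
		return 1;
	}
	return print_discard_stat(argv[1], "discardable_bytes") ? 1 : 0;
}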
* - * Path: /sys/fs/btrfs/<uuid>/debug/discard/ + * Path: /sys/fs/btrfs/<uuid>/discard/ */ -static const struct attribute *discard_debug_attrs[] = { +static const struct attribute *discard_attrs[] = { BTRFS_ATTR_PTR(discard, discardable_bytes), BTRFS_ATTR_PTR(discard, discardable_extents), BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), @@ -599,6 +601,8 @@ static const struct attribute *discard_debug_attrs[] = { NULL, }; +#ifdef CONFIG_BTRFS_DEBUG + /* * Per-filesystem runtime debugging exported via sysfs. * @@ -760,7 +764,7 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); /* Limit stripe size to 10% of available space. */ - val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val); /* Must be multiple of 256M. */ val &= ~((u64)SZ_256M - 1); @@ -837,11 +841,8 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); - ssize_t ret; - - ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); - return ret; + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, @@ -960,7 +961,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return len; @@ -1150,25 +1151,6 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, } BTRFS_ATTR(, generation, btrfs_generation_show); -/* - * Look for an exact string @string in @buffer with possible leading or - * trailing whitespace - */ -static bool strmatch(const char *buffer, const char *string) -{ - const size_t len = strlen(string); - - /* Skip leading whitespace */ - buffer = skip_spaces(buffer); - - /* Match entire string, check if the rest is whitespace or empty */ - if (strncmp(string, buffer, len) == 0 && - strlen(skip_spaces(buffer + len)) == 0) - return true; - - return false; -} - static const char * const btrfs_read_policy_name[] = { "pid" }; static ssize_t btrfs_read_policy_show(struct kobject *kobj, @@ -1180,16 +1162,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (fs_devices->read_policy == i) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]", + ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); else - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + ret += sysfs_emit_at(buf, ret, "%s%s", (ret == 0 ? 
"" : " "), btrfs_read_policy_name[i]); } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } @@ -1202,7 +1184,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (strmatch(buf, btrfs_read_policy_name[i])) { + if (sysfs_streq(buf, btrfs_read_policy_name[i])) { if (i != fs_devices->read_policy) { fs_devices->read_policy = i; btrfs_info(fs_devices->fs_info, @@ -1222,11 +1204,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - ssize_t ret; - ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); - - return ret; + return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); } static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, @@ -1427,13 +1406,12 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } -#ifdef CONFIG_BTRFS_DEBUG - if (fs_info->discard_debug_kobj) { - sysfs_remove_files(fs_info->discard_debug_kobj, - discard_debug_attrs); - kobject_del(fs_info->discard_debug_kobj); - kobject_put(fs_info->discard_debug_kobj); + if (fs_info->discard_kobj) { + sysfs_remove_files(fs_info->discard_kobj, discard_attrs); + kobject_del(fs_info->discard_kobj); + kobject_put(fs_info->discard_kobj); } +#ifdef CONFIG_BTRFS_DEBUG if (fs_info->debug_kobj) { sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); kobject_del(fs_info->debug_kobj); @@ -2001,20 +1979,18 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); if (error) goto failure; +#endif /* Discard directory */ - fs_info->discard_debug_kobj = kobject_create_and_add("discard", - fs_info->debug_kobj); - if (!fs_info->discard_debug_kobj) { + fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); + if (!fs_info->discard_kobj) { error = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->discard_debug_kobj, - discard_debug_attrs); + error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); if (error) goto failure; -#endif error = addrm_unknown_feature_attrs(fs_info, true); if (error) @@ -2041,6 +2017,98 @@ failure: return error; } +static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + bool enabled; + + spin_lock(&fs_info->qgroup_lock); + enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", enabled); +} +BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); + +static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + bool inconsistent; + + spin_lock(&fs_info->qgroup_lock); + inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT); + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", inconsistent); +} +BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show); + +static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + u8 result; + + spin_lock(&fs_info->qgroup_lock); + result = 
fs_info->qgroup_drop_subtree_thres; + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", result); +} + +static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + u8 new_thres; + int ret; + + ret = kstrtou8(buf, 10, &new_thres); + if (ret) + return -EINVAL; + + if (new_thres > BTRFS_MAX_LEVEL) + return -EINVAL; + + spin_lock(&fs_info->qgroup_lock); + fs_info->qgroup_drop_subtree_thres = new_thres; + spin_unlock(&fs_info->qgroup_lock); + + return len; +} +BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show, + qgroup_drop_subtree_thres_store); + +/* + * Qgroups global info + * + * Path: /sys/fs/btrfs/<uuid>/qgroups/ + */ +static struct attribute *qgroups_attrs[] = { + BTRFS_ATTR_PTR(qgroups, enabled), + BTRFS_ATTR_PTR(qgroups, inconsistent), + BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), + NULL +}; +ATTRIBUTE_GROUPS(qgroups); + +static void qgroups_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static struct kobj_type qgroups_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = qgroups_groups, + .release = qgroups_release, +}; + static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) { return to_fs_info(kobj->parent->parent); @@ -2166,11 +2234,15 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) if (fs_info->qgroups_kobj) return 0; - fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj); - if (!fs_info->qgroups_kobj) { - ret = -ENOMEM; + fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!fs_info->qgroups_kobj) + return -ENOMEM; + + ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype, + fsid_kobj, "qgroups"); + if (ret < 0) goto out; - } + rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) { ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); @@ -2251,8 +2323,11 @@ int __init btrfs_init_sysfs(void) #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); - if (ret) - goto out2; + if (ret) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + goto out_remove_group; + } #endif return 0; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index cc9377cf56a3..181469fc0bb3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -16,6 +16,7 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../block-group.h" +#include "../fs.h" static struct vfsmount *test_mnt = NULL; @@ -101,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(NULL, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); @@ -200,7 +201,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) void btrfs_free_dummy_root(struct btrfs_root *root) { - if (!root) + if (IS_ERR_OR_NULL(root)) return; /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) @@ -243,7 +244,7 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) { if (!cache) return; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); kfree(cache->free_space_ctl); kfree(cache); } diff --git 
a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index b7d181a08eab..5ef0b90e25c3 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -8,6 +8,7 @@ #include "../ctree.h" #include "../extent_io.h" #include "../disk-io.h" +#include "../accessors.h" static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index a232b15b8021..dfc5c7fa6038 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -4,6 +4,7 @@ */ #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/sizes.h> @@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) { int ret; - struct page *pages[16]; + struct folio_batch fbatch; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - unsigned long nr_pages = end_index - index + 1; int i; int count = 0; int loops = 0; - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, nr_pages, - ARRAY_SIZE(pages)), pages); + folio_batch_init(&fbatch); + + while (index <= end_index) { + ret = filemap_get_folios_contig(inode->i_mapping, &index, + end_index, &fbatch); for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (flags & PROCESS_TEST_LOCKED && - !PageLocked(pages[i])) + !folio_test_locked(folio)) count++; - if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) - unlock_page(pages[i]); - put_page(pages[i]); + if (flags & PROCESS_UNLOCK && folio_test_locked(folio)) + folio_unlock(folio); if (flags & PROCESS_RELEASE) - put_page(pages[i]); + folio_put(folio); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); cond_resched(); loops++; if (loops > 100000) { printk(KERN_ERR - "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n", - start, end, nr_pages, ret); + "stuck in a loop, start %llu, end %llu, ret %d\n", + start, end, ret); break; } } + return count; } @@ -80,7 +82,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) PRINT_ONE_FLAG(state, dest, cur, NODATASUM); PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV); PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT); - PRINT_ONE_FLAG(state, dest, cur, DAMAGED); PRINT_ONE_FLAG(state, dest, cur, NORESERVE); PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED); PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV); @@ -131,7 +132,7 @@ static int test_find_delalloc(u32 sectorsize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin @@ -172,7 +173,7 @@ static int test_find_delalloc(u32 sectorsize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); unlock_page(locked_page); put_page(locked_page); @@ -208,7 +209,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); /* locked_page was unlocked above */ put_page(locked_page); @@ -263,7 +264,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("pages in range were not all locked"); goto out_bits; } - 
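The process_page_range() rewrite above trades the old find_get_pages_contig() page array for a folio_batch. A short kernel-context sketch of the iteration idiom, assuming the filemap_get_folios_contig() signature used in that hunk; this is a sketch of the pattern, not the test code itself:

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

/* Count the pages backing [start, end], one folio batch at a time. */
static unsigned long count_range_pages(struct address_space *mapping,
				       loff_t start, loff_t end)
{
	pgoff_t index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	struct folio_batch fbatch;
	unsigned long count = 0;
	unsigned int i, got;

	folio_batch_init(&fbatch);
	while (index <= end_index) {
		/* Advances @index past the folios it returns. */
		got = filemap_get_folios_contig(mapping, &index, end_index,
						&fbatch);
		if (!got)
			break;	/* hole in the range, nothing contiguous left */
		for (i = 0; i < got; i++)
			count += folio_nr_pages(fbatch.folios[i]);
		/* Drops the references the lookup took. */
		folio_batch_release(&fbatch);
		cond_resched();
	}
	return count;
}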
unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); /* * Now to test where we run into a page that is no longer dirty in the @@ -488,7 +489,7 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 5930cdcae5cb..ebf68fcd2149 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -82,7 +82,7 @@ static int test_extents(struct btrfs_block_group *cache) } /* Cleanup */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -149,7 +149,7 @@ static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize) return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -230,7 +230,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* Now with the extent entry offset into the bitmap */ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); @@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, * [ bitmap ] * [ del ] */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); if (ret) { test_err("couldn't add bitmap %d", ret); @@ -291,7 +291,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* * This blew up before, we have part of the free space in a bitmap and @@ -317,7 +317,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return ret; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -629,7 +629,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, if (ret) return ret; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* * Now test a similar scenario, but where our extent entry is located @@ -819,7 +819,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, return ret; cache->free_space_ctl->op = orig_free_space_ops; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -868,7 +868,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } /* Now validate bitmaps do the correct thing. */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); for (i = 0; i < 2; i++) { offset = i * BITS_PER_BITMAP * sectorsize; bytes = (i + 1) * SZ_1M; @@ -891,7 +891,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } /* Now validate bitmaps with different ->max_extent_size. 
*/ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); orig_free_space_ops = cache->free_space_ctl->op; cache->free_space_ctl->op = &test_free_space_ops; @@ -998,7 +998,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } cache->free_space_ctl->op = orig_free_space_ops; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 13734ed43bfc..b61972046feb 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -10,6 +10,7 @@ #include "../free-space-tree.h" #include "../transaction.h" #include "../block-group.h" +#include "../accessors.h" struct free_space_extent { u64 start; @@ -470,7 +471,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, } cache->bitmap_low_thresh = 0; cache->bitmap_high_thresh = (u32)-1; - cache->needs_free_space = 1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); cache->fs_info = root->fs_info; btrfs_init_dummy_trans(&trans, root->fs_info); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index cac89c388131..05b03f5eab83 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -11,6 +11,7 @@ #include "../extent_io.h" #include "../volumes.h" #include "../compression.h" +#include "../accessors.h" static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, @@ -72,8 +73,8 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look; though this may not be possible, we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] - * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] + * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [inline][hole but no extent][ hole ][ regular ][regular1 split] * * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] @@ -90,19 +91,12 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) u64 disk_bytenr = SZ_1M; u64 offset = 0; - /* First we want a hole */ - insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, - slot); - slot++; - offset += 5; - /* - * Now we want an inline extent, I don't think this is possible but hey - * why not? Also keep in mind if we have an inline extent it counts as - * the whole first page. If we were to expand it we would have to cow - * and we wouldn't have an inline extent anymore. + * Tree-checker has strict limits on inline extents: they can only + * exist at file offset 0, thus we can have at most one inline file + * extent.
*/ - insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; offset = sectorsize; @@ -267,7 +261,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } free_extent_map(em); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* * All of the magic numbers are based on the mapping setup in @@ -281,37 +275,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); - goto out; - } - if (em->start != 0 || em->len != 5) { - test_err( - "unexpected extent wanted start 0 len 5, got start %llu len %llu", - em->start, em->len); - goto out; - } - if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); - goto out; - } - offset = em->start + em->len; - free_extent_map(em); - - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); - if (IS_ERR(em)) { - test_err("got an error when we shouldn't have"); - goto out; - } if (em->block_start != EXTENT_MAP_INLINE) { test_err("expected an inline, got %llu", em->block_start); goto out; } - if (em->start != offset || em->len != (sectorsize - 5)) { + /* + * For inline extent, we always round up the em to sectorsize, as + * they are either: + * + * a) a hidden hole + * The range will be zeroed at inline extent read time. + * + * b) a file extent with unaligned bytenr + * Tree checker will reject it. + */ + if (em->start != 0 || em->len != sectorsize) { test_err( - "unexpected extent wanted start %llu len 1, got start %llu len %llu", - offset, em->start, em->len); + "unexpected extent wanted start 0 len %u, got start %llu len %llu", + sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -975,7 +957,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1043,7 +1025,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1076,7 +1058,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1092,7 +1074,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index eee1e4459541..3fc8dc3fd980 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -10,6 +10,8 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../backref.h" +#include "../fs.h" +#include 
"../accessors.h" static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) @@ -203,6 +205,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, static int test_no_shared_qgroup(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -218,30 +221,38 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + /* * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -250,32 +261,38 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */ + old_roots = NULL; + new_roots = NULL; + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { test_err("qgroup counts didn't match expected values"); return -EINVAL; } - old_roots = NULL; - new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_item(root, nodesize, nodesize); - if (ret) + if (ret) { + ulist_free(old_roots); return -EINVAL; + } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -300,6 +317,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, static int test_multiple_refs(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -320,25 +338,33 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, 
BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -353,25 +379,29 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -392,25 +422,29 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0bec10740ad3..b8c52e89688c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -6,6 +6,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/blkdev.h> @@ -23,6 +24,18 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "relocation.h" +#include "scrub.h" + +static struct kmem_cache *btrfs_trans_handle_cachep; #define BTRFS_ROOT_TRANS_TAG 0 @@ -161,7 +174,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; - struct btrfs_caching_control *caching_ctl, *next; /* * At this point no one can be using this transaction to modify any tree @@ -196,46 +208,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) } spin_unlock(&cur_trans->dropped_roots_lock); - /* - * We have to update the last_byte_to_unpin under the commit_root_sem, - * at the 
same time we swap out the commit roots. - * - * This is because we must have a real view of the last spot the caching - * kthreads were while caching. Consider the following views of the - * extent tree for a block group - * - * commit root - * +----+----+----+----+----+----+----+ - * |\\\\| |\\\\|\\\\| |\\\\|\\\\| - * +----+----+----+----+----+----+----+ - * 0 1 2 3 4 5 6 7 - * - * new commit root - * +----+----+----+----+----+----+----+ - * | | | |\\\\| | |\\\\| - * +----+----+----+----+----+----+----+ - * 0 1 2 3 4 5 6 7 - * - * If the cache_ctl->progress was at 3, then we are only allowed to - * unpin [0,1) and [2,3], because the caching thread has already - * processed those extents. We are not allowed to unpin [5,6), because - * the caching thread will re-start it's search from 3, and thus find - * the hole from [4,6) to add to the free space cache. - */ - write_lock(&fs_info->block_group_cache_lock); - list_for_each_entry_safe(caching_ctl, next, - &fs_info->caching_block_groups, list) { - struct btrfs_block_group *cache = caching_ctl->block_group; - - if (btrfs_block_group_done(cache)) { - cache->last_byte_to_unpin = (u64)-1; - list_del_init(&caching_ctl->list); - btrfs_put_caching_control(caching_ctl); - } else { - cache->last_byte_to_unpin = caching_ctl->progress; - } - } - write_unlock(&fs_info->block_group_cache_lock); up_write(&fs_info->commit_root_sem); } @@ -313,6 +285,8 @@ loop: atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); return 0; } spin_unlock(&fs_info->trans_lock); @@ -334,16 +308,23 @@ loop: if (!cur_trans) return -ENOMEM; + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); + spin_lock(&fs_info->trans_lock); if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. 
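The btrfs_lockdep_acquire()/btrfs_lockdep_release() calls that start appearing in join_transaction() above annotate a fake lock so that lockdep can reason about wait_event() dependencies. A rough sketch of the underlying idiom using the generic lockdep primitives rather than the btrfs wrappers; assume this general shape, the real macros add per-fs maps and naming:

#include <linux/lockdep.h>

static struct lock_class_key demo_key;
static struct lockdep_map demo_writers_map =
	STATIC_LOCKDEP_MAP_INIT("demo_num_writers", &demo_key);

/* Each writer holds the map as a reader for its lifetime. */
static void demo_writer_start(void)
{
	lock_map_acquire_read(&demo_writers_map);
	/* ... atomic_inc(&num_writers) ... */
}

static void demo_writer_end(void)
{
	/* ... atomic_dec(&num_writers); wake_up(...) ... */
	lock_map_release(&demo_writers_map);
}

/* The committer "acquires" the map exclusively before its wait_event(),
 * so lockdep treats waiting-for-writers like taking a lock and can spot
 * inversions against anything a writer might itself block on. */
static void demo_wait_for_writers(void)
{
	lock_map_acquire(&demo_writers_map);
	lock_map_release(&demo_writers_map);
	/* ... wait_event(writer_wait, atomic_read(&num_writers) == 1) ... */
}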
Make sure * to redo the checks above */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); goto loop; } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); return -EROFS; } @@ -397,9 +378,9 @@ loop: spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); + IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS, NULL); + IO_TREE_FS_PINNED_EXTENTS); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -541,6 +522,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || TRANS_ABORTED(cur_trans)); @@ -625,7 +607,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, */ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); if (flush == BTRFS_RESERVE_FLUSH_ALL && - delayed_refs_rsv->full == 0) { + btrfs_block_rsv_full(delayed_refs_rsv) == 0) { delayed_refs_bytes = num_bytes; num_bytes <<= 1; } @@ -650,7 +632,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && - !delayed_refs_rsv->full) { + !btrfs_block_rsv_full(delayed_refs_rsv)) { /* * Some people call with btrfs_start_transaction(root, 0) * because they can be throttled, but have some other mechanism @@ -859,6 +841,15 @@ static noinline void wait_for_commit(struct btrfs_transaction *commit, u64 transid = commit->transid; bool put = false; + /* + * At the moment this function is called with min_state either being + * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED. + */ + if (min_state == TRANS_STATE_COMPLETED) + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + else + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + while (1) { wait_event(commit->commit_wait, commit->state >= min_state); if (put) @@ -958,7 +949,7 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans) if (btrfs_check_space_for_delayed_refs(fs_info)) return true; - return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); + return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50); } bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) @@ -1022,6 +1013,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, extwriter_counter_dec(cur_trans, trans->type); cond_wake_up(&cur_trans->writer_wait); + + btrfs_lockdep_release(info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(info, btrfs_trans_num_writers); + btrfs_put_transaction(cur_trans); if (current->journal_info == trans) @@ -1134,7 +1129,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * it's safe to do it (through extent_io_tree_release()). 
*/ err = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, 0, 0, &cached_state); + EXTENT_NEED_WAIT, &cached_state); if (err == -ENOMEM) err = 0; if (!err) @@ -1625,10 +1620,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode; + struct inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; struct timespec64 cur_time; @@ -1637,6 +1631,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 index = 0; u64 objectid; u64 root_flags; + unsigned int nofs_flags; + struct fscrypt_name fname; ASSERT(pending->path); path = pending->path; @@ -1644,9 +1640,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; + /* + * We're inside a transaction and must make sure that any potential + * allocations with GFP_KERNEL in fscrypt won't recurse back to + * the filesystem. + */ + nofs_flags = memalloc_nofs_save(); + pending->error = fscrypt_setup_filename(parent_inode, + &pending->dentry->d_name, 0, + &fname); + memalloc_nofs_restore(nofs_flags); + if (pending->error) + goto free_pending; + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) - goto no_free_objectid; + goto free_fname; /* * Make qgroup to skip current new snapshot's qgroupid, as it is @@ -1675,8 +1684,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - dentry = pending->dentry; - parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) @@ -1692,8 +1699,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - dentry->d_name.name, - dentry->d_name.len, 0); + &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1788,7 +1794,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - dentry->d_name.name, dentry->d_name.len); + &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1820,9 +1826,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, dentry->d_name.name, - dentry->d_name.len, BTRFS_I(parent_inode), - &key, BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, &fname.disk_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); /* We have checked the name at the beginning, so it is impossible.
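The memalloc_nofs_save()/memalloc_nofs_restore() pair above is the standard way to make a GFP_KERNEL-capable callee (here fscrypt_setup_filename()) safe inside a transaction: every allocation within the scope is implicitly degraded to GFP_NOFS. A generic kernel-context sketch of the scoping pattern; some_library_call() is a hypothetical stand-in:

#include <linux/sched/mm.h>

/* Hypothetical stand-in for a callee (like fscrypt_setup_filename())
 * that may allocate with GFP_KERNEL internally. */
static int some_library_call(void)
{
	return 0;
}

static int do_work_inside_transaction(void)
{
	unsigned int nofs_flags;
	int ret;

	/* Every allocation in this scope is implicitly degraded to
	 * GFP_NOFS, so direct reclaim cannot re-enter the filesystem
	 * while we hold the transaction. */
	nofs_flags = memalloc_nofs_save();
	ret = some_library_call();
	memalloc_nofs_restore(nofs_flags);

	return ret;
}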
*/ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1831,7 +1837,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + - dentry->d_name.len * 2); + fname.disk_name.len * 2); parent_inode->i_mtime = current_time(parent_inode); parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); @@ -1863,7 +1869,9 @@ dir_item_existed: trans->bytes_reserved = 0; clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); -no_free_objectid: +free_fname: + fscrypt_free_filename(&fname); +free_pending: kfree(new_root_item); pending->root_item = NULL; btrfs_free_path(path); @@ -1912,14 +1920,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; - - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - root_item = &fs_info->block_group_root->root_item; - - super->block_group_root = root_item->bytenr; - super->block_group_root_generation = root_item->generation; - super->block_group_root_level = root_item->level; - } } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) @@ -1967,6 +1967,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) * Wait for the current transaction commit to start and block * subsequent transaction joins */ + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); @@ -1994,6 +1995,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) if (cur_trans == fs_info->running_transaction) { cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); + + /* + * The thread has already released the lockdep map as reader + * already in btrfs_commit_transaction(). + */ + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); @@ -2118,12 +2125,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + + clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; - btrfs_end_transaction(trans); - return ret; + goto lockdep_trans_commit_start_release; } btrfs_trans_release_metadata(trans); @@ -2140,10 +2149,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * Any running threads may add more while we are here. 
*/ ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + if (ret) + goto lockdep_trans_commit_start_release; } btrfs_create_pending_block_groups(trans); @@ -2172,10 +2179,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (run_it) { ret = btrfs_start_dirty_block_groups(trans); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + if (ret) + goto lockdep_trans_commit_start_release; } } @@ -2190,6 +2195,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; + + btrfs_trans_state_lockdep_release(fs_info, + BTRFS_LOCKDEP_TRANS_COMMIT_START); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); @@ -2203,6 +2211,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; @@ -2222,7 +2231,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_put_transaction(prev_trans); if (ret) - goto cleanup_transaction; + goto lockdep_release; } else { spin_unlock(&fs_info->trans_lock); } @@ -2236,7 +2245,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ if (BTRFS_FS_ERROR(fs_info)) { ret = -EROFS; - goto cleanup_transaction; + goto lockdep_release; } } @@ -2250,19 +2259,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ret = btrfs_start_delalloc_flush(fs_info); if (ret) - goto cleanup_transaction; + goto lockdep_release; ret = btrfs_run_delayed_items(trans); if (ret) - goto cleanup_transaction; + goto lockdep_release; + /* + * The thread has started/joined the transaction thus it holds the + * lockdep map as a reader. It has to release it before acquiring the + * lockdep map as a writer. + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters); wait_event(cur_trans->writer_wait, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ ret = btrfs_run_delayed_items(trans); - if (ret) + if (ret) { + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; + } btrfs_wait_delalloc_flush(fs_info); @@ -2271,6 +2289,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * transaction. Otherwise if this transaction commits before the ordered * extents complete we lose logged data after a power failure. */ + btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered); wait_event(cur_trans->pending_wait, atomic_read(&cur_trans->pending_ordered) == 0); @@ -2284,10 +2303,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); + + /* + * The thread has started/joined the transaction thus it holds the + * lockdep map as a reader. It has to release it before acquiring the + * lockdep map as a writer. + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); /* + * Make lockdep happy by acquiring the state locks after + * btrfs_trans_num_writers is released. 
If we acquired the state locks + * before releasing the btrfs_trans_num_writers lock then lockdep would + * complain because we did not follow the reverse order unlocking rule. + */ + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + + /* * We've started the commit, clear the flag in case we were triggered to * do an async commit but somebody else started before the transaction * kthread could do the work. @@ -2296,6 +2333,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); goto scrub_continue; } /* @@ -2344,12 +2382,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - /* - * Since the transaction is done, we can apply the pending changes - * before the next transaction. - */ - btrfs_apply_pending_changes(fs_info); - /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -2430,6 +2462,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->reloc_mutex); wake_up(&fs_info->transaction_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); ret = btrfs_write_and_wait_transaction(trans); if (ret) { @@ -2461,6 +2494,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_SUPER_COMMITTED; wake_up(&cur_trans->commit_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_finish_extent_commit(trans); @@ -2474,6 +2508,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); @@ -2502,7 +2537,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) unlock_reloc: mutex_unlock(&fs_info->reloc_mutex); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); scrub_continue: + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); btrfs_scrub_continue(fs_info); cleanup_transaction: btrfs_trans_release_metadata(trans); @@ -2515,6 +2553,16 @@ cleanup_transaction: cleanup_transaction(trans, ret); return ret; + +lockdep_release: + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + goto cleanup_transaction; + +lockdep_trans_commit_start_release: + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_end_transaction(trans); + return ret; } /* @@ -2556,21 +2604,17 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 
0 : 1; } -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) +int __init btrfs_transaction_init(void) { - unsigned long prev; - unsigned long bit; - - prev = xchg(&fs_info->pending_changes, 0); - if (!prev) - return; - - bit = 1 << BTRFS_PENDING_COMMIT; - if (prev & bit) - btrfs_debug(fs_info, "pending commit done"); - prev &= ~bit; + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", + sizeof(struct btrfs_trans_handle), 0, + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + return -ENOMEM; + return 0; +} - if (prev) - btrfs_warn(fs_info, - "unknown pending changes left 0x%lx, ignoring", prev); +void __cold btrfs_transaction_exit(void) +{ + kmem_cache_destroy(btrfs_trans_handle_cachep); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 970ff316069d..97f6c39f59c8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -10,6 +10,7 @@ #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" +#include "misc.h" enum btrfs_trans_state { TRANS_STATE_RUNNING, @@ -98,14 +99,15 @@ struct btrfs_transaction { struct list_head releasing_ebs; }; -#define __TRANS_FREEZABLE (1U << 0) - -#define __TRANS_START (1U << 9) -#define __TRANS_ATTACH (1U << 10) -#define __TRANS_JOIN (1U << 11) -#define __TRANS_JOIN_NOLOCK (1U << 12) -#define __TRANS_DUMMY (1U << 13) -#define __TRANS_JOIN_NOSTART (1U << 14) +enum { + ENUM_BIT(__TRANS_FREEZABLE), + ENUM_BIT(__TRANS_START), + ENUM_BIT(__TRANS_ATTACH), + ENUM_BIT(__TRANS_JOIN), + ENUM_BIT(__TRANS_JOIN_NOLOCK), + ENUM_BIT(__TRANS_DUMMY), + ENUM_BIT(__TRANS_JOIN_NOSTART), +}; #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) @@ -231,9 +233,11 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +int __init btrfs_transaction_init(void); +void __cold btrfs_transaction_exit(void); + #endif diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43f905ab0a18..baad1ed7e111 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -18,6 +18,7 @@ #include <linux/types.h> #include <linux/stddef.h> #include <linux/error-injection.h> +#include "messages.h" #include "ctree.h" #include "tree-checker.h" #include "disk-io.h" @@ -25,6 +26,9 @@ #include "volumes.h" #include "misc.h" #include "btrfs_inode.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" /* * Error message should follow the following format: @@ -528,7 +532,7 @@ static int check_dir_item(struct extent_buffer *leaf, } /* dir type check */ - dir_type = btrfs_dir_type(leaf, di); + dir_type = btrfs_dir_ftype(leaf, di); if (unlikely(dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", @@ -1780,10 +1784,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* Also check if the item pointer overlaps with btrfs item. 
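The __TRANS_* flags in transaction.h above move from hand-numbered shifts to ENUM_BIT(), so the bit positions auto-increment and cannot collide. The macro itself lives in the btrfs misc.h header; a self-contained sketch of how such a macro can be written (the exact kernel definition may differ slightly):

#include <stdio.h>

/* Each use burns one enumerator for the bit position, defines the flag
 * as 1 << position, and leaves a _SEQ enumerator behind so the next use
 * continues from position + 1. Flags can be added, removed or reordered
 * without renumbering anything by hand. */
#define ENUM_BIT(name)					\
	__ ## name ## _BIT,				\
	name = (1U << __ ## name ## _BIT),		\
	__ ## name ## _SEQ = __ ## name ## _BIT

enum {
	ENUM_BIT(FLAG_FREEZABLE),	/* 1 << 0 */
	ENUM_BIT(FLAG_START),		/* 1 << 1 */
	ENUM_BIT(FLAG_ATTACH),		/* 1 << 2 */
};

int main(void)
{
	printf("%d %d %d\n", FLAG_FREEZABLE, FLAG_START, FLAG_ATTACH);
	return 0;
}

Compiled and run, this prints 1 2 4: each flag occupies its own bit without any hand-maintained shift counts.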
*/
 	if (unlikely(btrfs_item_ptr_offset(leaf, slot) <
-		     btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) {
+		     btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) {
 		generic_err(leaf, slot,
 		"slot overlaps with its data, item end %lu data start %lu",
-			   btrfs_item_nr_offset(slot) +
+			   btrfs_item_nr_offset(leaf, slot) +
 			   sizeof(struct btrfs_item),
 			   btrfs_item_ptr_offset(leaf, slot));
 		return -EUCLEAN;
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index ece497e26558..bfb5efa4e01f 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -6,8 +6,39 @@
 #ifndef BTRFS_TREE_CHECKER_H
 #define BTRFS_TREE_CHECKER_H
 
-#include "ctree.h"
-#include "extent_io.h"
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
+struct btrfs_chunk;
+
+/* All the extra info needed to verify the parentness of a tree block. */
+struct btrfs_tree_parent_check {
+	/*
+	 * The owner check against the tree block.
+	 *
+	 * Can be 0 to skip the owner check.
+	 */
+	u64 owner_root;
+
+	/*
+	 * Expected transid, can be 0 to skip the check, but such skip
+	 * should only be utilized for backref walk related code.
+	 */
+	u64 transid;
+
+	/*
+	 * The expected first key.
+	 *
+	 * This check can be skipped if @has_first_key is false, such skip
+	 * can happen for cases where we don't have the parent node key,
+	 * e.g. reading the tree root, doing backref walk.
+	 */
+	struct btrfs_key first_key;
+	bool has_first_key;
+
+	/* The expected level. Should always be set. */
+	u8 level;
+};
 
 /*
  * Comprehensive leaf checker.
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
deleted file mode 100644
index b6cf39f4e7e4..000000000000
--- a/fs/btrfs/tree-defrag.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Oracle.  All rights reserved.
- */
-
-#include <linux/sched.h>
-#include "ctree.h"
-#include "disk-io.h"
-#include "print-tree.h"
-#include "transaction.h"
-#include "locking.h"
-
-/*
- * Defrag all the leaves in a given btree.
- * Read all the leaves and try to get key order to
- * better reflect disk order
- */
-
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root)
-{
-	struct btrfs_path *path = NULL;
-	struct btrfs_key key;
-	int ret = 0;
-	int wret;
-	int level;
-	int next_key_ret = 0;
-	u64 last_ret = 0;
-
-	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
-		goto out;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	level = btrfs_header_level(root->node);
-
-	if (level == 0)
-		goto out;
-
-	if (root->defrag_progress.objectid == 0) {
-		struct extent_buffer *root_node;
-		u32 nritems;
-
-		root_node = btrfs_lock_root_node(root);
-		nritems = btrfs_header_nritems(root_node);
-		root->defrag_max.objectid = 0;
-		/* from above we know this is not a leaf */
-		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
-				      nritems - 1);
-		btrfs_tree_unlock(root_node);
-		free_extent_buffer(root_node);
-		memset(&key, 0, sizeof(key));
-	} else {
-		memcpy(&key, &root->defrag_progress, sizeof(key));
-	}
-
-	path->keep_locks = 1;
-
-	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
-	if (ret < 0)
-		goto out;
-	if (ret > 0) {
-		ret = 0;
-		goto out;
-	}
-	btrfs_release_path(path);
-	/*
-	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
-	 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
-	 * a deadlock (attempting to write lock an already write locked leaf).
- */ - path->lowest_level = 1; - wret = btrfs_search_slot(trans, root, &key, path, 0, 1); - - if (wret < 0) { - ret = wret; - goto out; - } - if (!path->nodes[1]) { - ret = 0; - goto out; - } - /* - * The node at level 1 must always be locked when our path has - * keep_locks set and lowest_level is 1, regardless of the value of - * path->slots[1]. - */ - BUG_ON(path->locks[1] == 0); - ret = btrfs_realloc_node(trans, root, - path->nodes[1], 0, - &last_ret, - &root->defrag_progress); - if (ret) { - WARN_ON(ret == -EAGAIN); - goto out; - } - /* - * Now that we reallocated the node we can find the next key. Note that - * btrfs_find_next_key() can release our path and do another search - * without COWing, this is because even with path->keep_locks = 1, - * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a - * node when path->slots[node_level - 1] does not point to the last - * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore - * we search for the next key after reallocating our node. - */ - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, - BTRFS_OLDEST_GENERATION); - if (next_key_ret == 0) { - memcpy(&root->defrag_progress, &key, sizeof(key)); - ret = -EAGAIN; - } -out: - btrfs_free_path(path); - if (ret == -EAGAIN) { - if (root->defrag_max.objectid > root->defrag_progress.objectid) - goto done; - if (root->defrag_max.type > root->defrag_progress.type) - goto done; - if (root->defrag_max.offset > root->defrag_progress.offset) - goto done; - ret = 0; - } -done: - if (ret != -EAGAIN) - memset(&root->defrag_progress, 0, - sizeof(root->defrag_progress)); - - return ret; -} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9205c4a5ca81..a3c43f0b1c95 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -21,6 +21,17 @@ #include "space-info.h" #include "zoned.h" #include "inode-item.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "dir-item.h" +#include "file-item.h" +#include "file.h" +#include "orphan.h" +#include "tree-checker.h" + +#define MAX_CONFLICT_INODES 10 /* magic values for the inode_only field in btrfs_log_inode: * @@ -31,8 +42,6 @@ enum { LOG_INODE_ALL, LOG_INODE_EXISTS, - LOG_OTHER_INODE, - LOG_OTHER_INODE_ALL, }; /* @@ -333,7 +342,12 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. */ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen + }; + + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; } @@ -351,11 +365,25 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -static int do_overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. 
+ * + * If the key isn't in the destination yet, a new item is inserted. + */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -363,31 +391,24 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; - int overwrite_root = 0; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - overwrite_root = 1; + /* + * This is only used during log replay, so the root is always from a + * fs/subvolume tree. In case we ever need to support a log root, then + * we'll have to clone the leaf in the path, release the path and use + * the leaf before writing into the log tree. See the comments at + * copy_items() for more details. + */ + ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* Our caller must have done a search for the key for us. */ - ASSERT(path->nodes[0] != NULL); - - /* - * And the slot must point to the exact key or the slot where the key - * should be at (the first item with a key greater than 'key') - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - struct btrfs_key found_key; - - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - ret = btrfs_comp_cpu_keys(&found_key, key); - ASSERT(ret >= 0); - } else { - ret = 1; - } + /* Look for the key in the destination tree. */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; if (ret == 0) { char *src_copy; @@ -532,8 +553,7 @@ insert: goto no_copy; } - if (overwrite_root && - S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(path->nodes[0], @@ -565,34 +585,19 @@ no_copy: return 0; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, + struct fscrypt_str *name) { - int ret; + char *buf; - /* Look for the key in the destination tree. 
*/ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; + buf = kmalloc(len, GFP_NOFS); + if (!buf) + return -ENOMEM; - return do_overwrite_item(trans, root, path, eb, slot, key); + read_extent_buffer(eb, buf, (unsigned long)start, len); + name->name = buf; + name->len = len; + return 0; } /* @@ -747,8 +752,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item), - GFP_NOFS); + btrfs_file_extent_disk_num_bytes(eb, item)); if (ret < 0) goto out; @@ -799,9 +803,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_file_extent_num_bytes(eb, item); } - ret = btrfs_lookup_csums_range(root->log_root, + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, - &ordered_sums, 0); + &ordered_sums, 0, false); if (ret) goto out; /* @@ -901,12 +905,11 @@ out: static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, - int name_len) + const struct fscrypt_str *name) { int ret; - ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = btrfs_unlink_inode(trans, dir, inode, name); if (ret) return ret; /* @@ -933,8 +936,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct inode *inode; - char *name; - int name_len; + struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; int ret; @@ -942,12 +944,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - name_len = btrfs_dir_name_len(leaf, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) + ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); + if (ret) return -ENOMEM; - read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(path); inode = read_one_inode(root, location.objectid); @@ -960,10 +960,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, - name_len); + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); out: - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -978,14 +977,14 @@ out: static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, - const char *name, int name_len) + struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key location; int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, - index, name, name_len, 0); + index, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -998,7 +997,7 @@ static noinline int inode_in_dir(struct btrfs_root *root, } btrfs_release_path(path); - di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -1025,7 +1024,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - const char *name, int namelen) + const struct fscrypt_str *name) { struct btrfs_path *path; int ret; @@ -1045,12 +1044,10 @@ static noinline int backref_in_log(struct btrfs_root *log, if (key->type == BTRFS_INODE_EXTREF_KEY) ret = 
!!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, namelen); + ref_objectid, name); else ret = !!btrfs_find_name_in_backref(path->nodes[0], - path->slots[0], - name, namelen); + path->slots[0], name); out: btrfs_free_path(path); return ret; @@ -1063,12 +1060,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, char *name, int namelen, - int *search_done) + u64 ref_index, struct fscrypt_str *name) { int ret; - char *victim_name; - int victim_name_len; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; @@ -1100,50 +1094,40 @@ again: ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { - victim_ref = (struct btrfs_inode_ref *)ptr; - victim_name_len = btrfs_inode_ref_name_len(leaf, - victim_ref); - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; + struct fscrypt_str victim_name; - read_extent_buffer(leaf, victim_name, - (unsigned long)(victim_ref + 1), - victim_name_len); + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, dir, inode, - victim_name, victim_name_len); - kfree(victim_name); + &victim_name); + kfree(victim_name.name); if (ret) return ret; - *search_done = 1; goto again; } - kfree(victim_name); + kfree(victim_name.name); - ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; } - - /* - * NOTE: we have searched root tree and checked the - * corresponding ref, it does not need to check again. 
- */ - *search_done = 1; } btrfs_release_path(path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + extref = btrfs_lookup_inode_extref(NULL, root, path, name, inode_objectid, parent_objectid, 0, 0); if (IS_ERR(extref)) { @@ -1160,29 +1144,28 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)(base + cur_offset); + struct fscrypt_str victim_name; - victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + extref = (struct btrfs_inode_extref *)(base + cur_offset); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; - read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, - victim_name_len); + ret = read_alloc_one_name(leaf, &extref->name, + btrfs_inode_extref_name_len(leaf, extref), + &victim_name); + if (ret) + return ret; search_key.objectid = inode_objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = btrfs_extref_hash(parent_objectid, - victim_name, - victim_name_len); + victim_name.name, + victim_name.len); ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { ret = -ENOENT; @@ -1194,28 +1177,24 @@ again: ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), - inode, - victim_name, - victim_name_len); + inode, &victim_name); } iput(victim_parent); - kfree(victim_name); + kfree(victim_name.name); if (ret) return ret; - *search_done = 1; goto again; } - kfree(victim_name); + kfree(victim_name.name); next: - cur_offset += victim_name_len + sizeof(*extref); + cur_offset += victim_name.len + sizeof(*extref); } - *search_done = 1; } btrfs_release_path(path); /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - ref_index, name, namelen, 0); + ref_index, name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1226,8 +1205,7 @@ next: btrfs_release_path(path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), - name, namelen, 0); + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1241,20 +1219,18 @@ next: } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, + struct fscrypt_str *name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; + int ret; extref = (struct btrfs_inode_extref *)ref_ptr; - *namelen = btrfs_inode_extref_name_len(eb, extref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)&extref->name, - *namelen); + ret = read_alloc_one_name(eb, &extref->name, + btrfs_inode_extref_name_len(eb, extref), name); + if (ret) + return ret; if (index) *index = btrfs_inode_extref_index(eb, extref); @@ -1265,18 +1241,17 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) + struct fscrypt_str *name, u64 *index) { struct btrfs_inode_ref *ref; + int ret; ref = (struct btrfs_inode_ref *)ref_ptr; - *namelen = 
btrfs_inode_ref_name_len(eb, ref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref), + name); + if (ret) + return ret; if (index) *index = btrfs_inode_ref_index(eb, ref); @@ -1318,28 +1293,24 @@ again: ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { - char *name = NULL; - int namelen; + struct fscrypt_str name; u64 parent_id; if (key->type == BTRFS_INODE_EXTREF_KEY) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); } else { parent_id = key->offset; - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - NULL); + ret = ref_get_fields(eb, ref_ptr, &name, NULL); } if (ret) goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen); + parent_id, &name); else - ret = !!btrfs_find_name_in_backref(log_eb, log_slot, - name, namelen); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { struct inode *dir; @@ -1348,20 +1319,20 @@ again: dir = read_one_inode(root, parent_id); if (!dir) { ret = -ENOENT; - kfree(name); + kfree(name.name); goto out; } ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, name, namelen); - kfree(name); + inode, &name); + kfree(name.name); iput(dir); if (ret) goto out; goto again; } - kfree(name); - ref_ptr += namelen; + kfree(name.name); + ref_ptr += name.len; if (key->type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else @@ -1373,103 +1344,6 @@ again: return ret; } -static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, - const u8 ref_type, const char *name, - const int namelen) -{ - struct btrfs_key key; - struct btrfs_path *path; - const u64 parent_id = btrfs_ino(BTRFS_I(dir)); - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = ref_type; - if (key.type == BTRFS_INODE_REF_KEY) - key.offset = parent_id; - else - key.offset = btrfs_extref_hash(parent_id, name, namelen); - - ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = 0; - goto out; - } - if (key.type == BTRFS_INODE_EXTREF_KEY) - ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], parent_id, name, namelen); - else - ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, namelen); - -out: - btrfs_free_path(path); - return ret; -} - -static int add_link(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, const char *name, - int namelen, u64 ref_index) -{ - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_dir_item *dir_item; - struct btrfs_key key; - struct btrfs_path *path; - struct inode *other_inode = NULL; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - dir_item = btrfs_lookup_dir_item(NULL, root, path, - btrfs_ino(BTRFS_I(dir)), - name, namelen, 0); - if (!dir_item) { - btrfs_release_path(path); - goto add_link; - } else if (IS_ERR(dir_item)) { - ret = PTR_ERR(dir_item); - goto out; - } - - /* - * Our inode's dentry collides with the dentry of another inode which is - * in the log but not yet processed since it has a higher inode number. - * So delete that other dentry. 
- */ - btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); - btrfs_release_path(path); - other_inode = read_one_inode(root, key.objectid); - if (!other_inode) { - ret = -ENOENT; - goto out; - } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode), - name, namelen); - if (ret) - goto out; - /* - * If we dropped the link count to 0, bump it so that later the iput() - * on the inode will not free it. We will fixup the link count later. - */ - if (other_inode->i_nlink == 0) - set_nlink(other_inode, 1); -add_link: - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - name, namelen, 0, ref_index); -out: - iput(other_inode); - btrfs_free_path(path); - - return ret; -} - /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. @@ -1487,10 +1361,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - char *name = NULL; - int namelen; + struct fscrypt_str name; int ret; - int search_done = 0; int log_ref_ver = 0; u64 parent_objectid; u64 inode_objectid; @@ -1533,7 +1405,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); /* * parent object can change from one array @@ -1546,15 +1418,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); } if (ret) goto out; ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen); + btrfs_ino(BTRFS_I(inode)), ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1565,51 +1435,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - - if (!search_done) { - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), - BTRFS_I(inode), - inode_objectid, - parent_objectid, - ref_index, name, namelen, - &search_done); - if (ret) { - if (ret == 1) - ret = 0; - goto out; - } - } - - /* - * If a reference item already exists for this inode - * with the same parent and name, but different index, - * drop it and the corresponding directory index entries - * from the parent before adding the new reference item - * and dir index entries, otherwise we would fail with - * -EEXIST returned from btrfs_add_link() below. - */ - ret = btrfs_inode_ref_exists(inode, dir, key->type, - name, namelen); - if (ret > 0) { - ret = unlink_inode_for_log_replay(trans, - BTRFS_I(dir), - BTRFS_I(inode), - name, namelen); - /* - * If we dropped the link count to 0, bump it so - * that later the iput() on the inode will not - * free it. We will fixup the link count later. 
- */ - if (!ret && inode->i_nlink == 0) - set_nlink(inode, 1); - } - if (ret < 0) + ret = __add_inode_ref(trans, root, path, log, + BTRFS_I(dir), BTRFS_I(inode), + inode_objectid, parent_objectid, + ref_index, &name); + if (ret) { + if (ret == 1) + ret = 0; goto out; + } /* insert our name */ - ret = add_link(trans, dir, inode, name, namelen, - ref_index); + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + &name, 0, ref_index); if (ret) goto out; @@ -1619,9 +1457,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* Else, ret == 1, we already have a perfect match, we're done. */ - ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; - kfree(name); - name = NULL; + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; + kfree(name.name); + name.name = NULL; if (log_ref_ver) { iput(dir); dir = NULL; @@ -1645,7 +1483,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, root, path, eb, slot, key); out: btrfs_release_path(path); - kfree(name); + kfree(name.name); iput(dir); iput(inode); return ret; @@ -1917,7 +1755,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, - char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_key *location) { struct inode *inode; @@ -1935,7 +1773,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 1, index); + 1, index); /* FIXME, put inode into FIXUP list */ @@ -1949,7 +1787,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, - u8 log_type, + u8 log_flags, bool exists) { struct btrfs_key found_key; @@ -1959,7 +1797,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) + btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1995,8 +1833,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - char *name; - int name_len; + struct fscrypt_str name; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2004,7 +1841,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct btrfs_key search_key; struct inode *dir; - u8 log_type; + u8 log_flags; bool exists; int ret; bool update_size = true; @@ -2014,17 +1851,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!dir) return -EIO; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } - - log_type = btrfs_dir_type(eb, di); - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); + log_flags = btrfs_dir_flags(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); @@ -2034,14 +1865,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = 0; dir_dst_di = 
btrfs_lookup_dir_item(trans, root, path, key->objectid,
-					 name, name_len, 1);
+					 &name, 1);
 	if (IS_ERR(dir_dst_di)) {
 		ret = PTR_ERR(dir_dst_di);
 		goto out;
 	} else if (dir_dst_di) {
 		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
-						   dir_dst_di, &log_key, log_type,
-						   exists);
+						   dir_dst_di, &log_key,
+						   log_flags, exists);
 		if (ret < 0)
 			goto out;
 		dir_dst_matches = (ret == 1);
 	}
@@ -2051,14 +1882,14 @@
 	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
 						   key->objectid, key->offset,
-						   name, name_len, 1);
+						   &name, 1);
 	if (IS_ERR(index_dst_di)) {
 		ret = PTR_ERR(index_dst_di);
 		goto out;
 	} else if (index_dst_di) {
 		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
 						   index_dst_di, &log_key,
-						   log_type, exists);
+						   log_flags, exists);
 		if (ret < 0)
 			goto out;
 		index_dst_matches = (ret == 1);
@@ -2079,7 +1910,7 @@
 	search_key.objectid = log_key.objectid;
 	search_key.type = BTRFS_INODE_REF_KEY;
 	search_key.offset = key->objectid;
-	ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
+	ret = backref_in_log(root->log_root, &search_key, 0, &name);
 	if (ret < 0) {
 		goto out;
 	} else if (ret) {
@@ -2092,8 +1923,7 @@
 	search_key.objectid = log_key.objectid;
 	search_key.type = BTRFS_INODE_EXTREF_KEY;
 	search_key.offset = key->objectid;
-	ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
-			     name_len);
+	ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
 	if (ret < 0) {
 		goto out;
 	} else if (ret) {
@@ -2104,7 +1934,7 @@
 	}
 	btrfs_release_path(path);
 	ret = insert_one_name(trans, root, key->objectid, key->offset,
-			      name, name_len, &log_key);
+			      &name, &log_key);
 	if (ret && ret != -ENOENT && ret != -EEXIST)
 		goto out;
 	if (!ret)
@@ -2114,10 +1944,10 @@ out:
 	if (!ret && update_size) {
-		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
+		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
 		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
 	}
-	kfree(name);
+	kfree(name.name);
 	iput(dir);
 	if (!ret && name_added)
 		ret = 1;
@@ -2168,7 +1998,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 	 * to ever delete the parent directory as it would result in stale
 	 * dentries that can never be deleted.
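 	 * (Such inodes are added to the fixup list via link_to_fixup_dir(), so
 	 * their link counts can be corrected once log replay finishes.)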
*/ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { struct btrfs_path *fixup_path; struct btrfs_key di_key; @@ -2283,8 +2113,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - int name_len; - char *name; + struct fscrypt_str name; struct inode *inode = NULL; struct btrfs_key location; @@ -2299,22 +2128,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, eb = path->nodes[0]; slot = path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } - - read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); if (log) { struct btrfs_dir_item *log_di; log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, - dir_key->offset, - name, name_len, 0); + dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); goto out; @@ -2340,7 +2163,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, inc_nlink(inode); ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - name, name_len); + &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2349,7 +2172,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_release_path(log_path); - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -2570,13 +2393,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; + struct btrfs_tree_parent_check check = { + .transid = gen, + .level = level + }; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; int i; int ret; - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; @@ -2756,7 +2583,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, int ret = 0; while (*level > 0) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { 0 }; cur = path->nodes[*level]; @@ -2768,7 +2595,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); + check.transid = ptr_gen; + check.level = *level - 1; + check.has_first_key = true; + btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, @@ -2787,8 +2617,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_extent_buffer(next, ptr_gen, - *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; @@ -2816,7 +2645,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { 
free_extent_buffer(next); return ret; @@ -3588,7 +3417,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, - const char *name, int name_len, + const struct fscrypt_str *name, u64 index) { struct btrfs_dir_item *di; @@ -3598,7 +3427,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * for dir item keys. */ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); + index, name, -1); if (IS_ERR(di)) return PTR_ERR(di); else if (!di) @@ -3635,7 +3464,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { struct btrfs_path *path; @@ -3662,7 +3491,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), - name, name_len, index); + name, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); @@ -3674,7 +3503,7 @@ out_unlock: /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; @@ -3695,7 +3524,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) @@ -3834,15 +3663,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; - struct extent_buffer *src = path->nodes[0]; - const int nritems = btrfs_header_nritems(src); + struct extent_buffer *src; + const int nritems = btrfs_header_nritems(path->nodes[0]); const u64 ino = btrfs_ino(inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - for (i = path->slots[0]; i < nritems; i++) { + /* + * We need to clone the leaf, release the read lock on it, and use the + * clone before modifying the log tree. See the comment at copy_items() + * about why we need to do this. + */ + src = btrfs_clone_extent_buffer(path->nodes[0]); + if (!src) + return -ENOMEM; + + i = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = src; + path->slots[0] = i; + + for (; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -3875,6 +3718,11 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, *last_old_dentry_offset = key.offset; continue; } + + /* If we logged this dir index item before, we can skip it. 
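+		 * (last_dir_index_offset is brought up to date by
+		 * update_last_dir_index_offset() below)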
*/
+		if (key.offset <= inode->last_dir_index_offset)
+			continue;
+
 		/*
 		 * We must make sure that when we log a directory entry, the
 		 * corresponding inode, after log replay, has a matching link
@@ -3905,51 +3753,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
 			ctx->log_new_dentries = true;
 		}
 
-		if (!ctx->logged_before)
-			goto add_to_batch;
-
-		/*
-		 * If we were logged before and have logged dir items, we can skip
-		 * checking if any item with a key offset larger than the last one
-		 * we logged is in the log tree, saving time and avoiding adding
-		 * contention on the log tree. We can only rely on the value of
-		 * last_dir_index_offset when we know for sure that the inode was
-		 * previously logged in the current transaction.
-		 */
-		if (key.offset > inode->last_dir_index_offset)
-			goto add_to_batch;
-		/*
-		 * Check if the key was already logged before. If not we can add
-		 * it to a batch for bulk insertion.
-		 */
-		ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
-		if (ret < 0) {
-			return ret;
-		} else if (ret > 0) {
-			btrfs_release_path(dst_path);
-			goto add_to_batch;
-		}
-
-		/*
-		 * Item exists in the log. Overwrite the item in the log if it
-		 * has different content or do nothing if it has exactly the same
-		 * content. And then flush the current batch if any - do it after
-		 * overwriting the current item, or we would deadlock otherwise,
-		 * since we are holding a path for the existing item.
-		 */
-		ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
-		if (ret < 0)
-			return ret;
-
-		if (batch_size > 0) {
-			ret = flush_dir_items_batch(trans, log, src, dst_path,
-						    batch_start, batch_size);
-			if (ret < 0)
-				return ret;
-			batch_size = 0;
-		}
-		continue;
-add_to_batch:
 		if (batch_size == 0)
 			batch_start = i;
 		batch_size++;
@@ -4136,6 +3939,71 @@ done:
 }
 
 /*
+ * If the inode was logged before and it was evicted, then its
+ * last_dir_index_offset is (u64)-1, so we don't know the value of the last
+ * index key offset. If that's the case, search for it and update the inode. This
+ * is to avoid lookups in the log tree every time we try to insert a dir index
+ * key from a leaf changed in the current transaction, and to allow us to always
+ * do batch insertions of dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+					struct btrfs_path *path,
+					const struct btrfs_log_ctx *ctx)
+{
+	const u64 ino = btrfs_ino(inode);
+	struct btrfs_key key;
+	int ret;
+
+	lockdep_assert_held(&inode->log_mutex);
+
+	if (inode->last_dir_index_offset != (u64)-1)
+		return 0;
+
+	if (!ctx->logged_before) {
+		inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+		return 0;
+	}
+
+	key.objectid = ino;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+	/*
+	 * An error happened or we actually have an index key with an offset
+	 * value of (u64)-1. Bail out, we're done.
+	 */
+	if (ret <= 0)
+		goto out;
+
+	ret = 0;
+	inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+	/*
+	 * No dir index items, bail out and leave last_dir_index_offset with
+	 * the value right before the first valid index value.
+	 */
+	if (path->slots[0] == 0)
+		goto out;
+
+	/*
+	 * btrfs_search_slot() left us at one slot beyond the slot with the last
+	 * index key, or beyond the last key of the directory that is not an
+	 * index key.
If we have an index key before, set last_dir_index_offset
+	 * to its offset value, otherwise leave it with a value right before the
+	 * first valid index value, as it means we have an empty directory.
+	 */
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+	if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+		inode->last_dir_index_offset = key.offset;
+
+out:
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+/*
  * logging directories is very similar to logging inodes. We find all the items
  * from the current transaction and write them to the log.
  *
@@ -4157,6 +4025,10 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 	u64 max_key;
 	int ret;
 
+	ret = update_last_dir_index_offset(inode, path, ctx);
+	if (ret)
+		return ret;
+
 	min_key = BTRFS_DIR_START_INDEX;
 	max_key = 0;
 	ctx->last_dir_item_offset = inode->last_dir_index_offset;
@@ -4382,8 +4254,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
 	 * file which happens to refer to the same extent as well. Such races
 	 * can leave checksum items in the log with overlapping ranges.
 	 */
-	ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
-			       lock_end, &cached_state);
+	ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+			  &cached_state);
 	if (ret)
 		return ret;
 	/*
@@ -4399,8 +4271,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
 	if (!ret)
 		ret = btrfs_csum_file_blocks(trans, log_root, sums);
-	unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
-			     &cached_state);
+	unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+		      &cached_state);
 
 	return ret;
 }
@@ -4414,7 +4286,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *log = inode->root->log_root;
 	struct btrfs_file_extent_item *extent;
-	struct extent_buffer *src = src_path->nodes[0];
+	struct extent_buffer *src;
 	int ret = 0;
 	struct btrfs_key *ins_keys;
 	u32 *ins_sizes;
@@ -4425,6 +4297,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
 	const u64 i_size = i_size_read(&inode->vfs_inode);
 
+	/*
+	 * To keep lockdep happy and avoid deadlocks, clone the source leaf and
+	 * use the clone. This is because otherwise we would be changing the log
+	 * tree, to insert items from the subvolume tree or insert csum items,
+	 * while holding a read lock on a leaf from the subvolume tree, which
+	 * creates a nasty lock dependency when COWing log tree nodes/leaves:
+	 *
+	 * 1) Modifying the log tree triggers an extent buffer allocation while
+	 *    holding a write lock on a parent extent buffer from the log tree.
+	 *    Allocating the pages for an extent buffer, or the extent buffer
+	 *    struct, can trigger inode eviction and finally the inode eviction
+	 *    will trigger a release/remove of a delayed node, which requires
+	 *    taking the delayed node's mutex;
+	 *
+	 * 2) Allocating a metadata extent for a log tree can trigger the async
+	 *    reclaim thread and make us wait for it to release enough space and
+	 *    unblock our reservation ticket. The reclaim thread can start
+	 *    flushing delayed items, and that in turn results in the need to
+	 *    lock delayed node mutexes and in the need to write lock extent
+	 *    buffers of a subvolume tree - all this while holding a write lock
+	 *    on the parent extent buffer in the log tree.
+ * + * So one task in scenario 1) running in parallel with another task in + * scenario 2) could lead to a deadlock, one wanting to lock a delayed + * node mutex while having a read lock on a leaf from the subvolume, + * while the other is holding the delayed node's mutex and wants to + * write lock the same subvolume leaf for flushing delayed items. + */ + src = btrfs_clone_extent_buffer(src_path->nodes[0]); + if (!src) + return -ENOMEM; + + i = src_path->slots[0]; + btrfs_release_path(src_path); + src_path->nodes[0] = src; + src_path->slots[0] = i; + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); if (!ins_data) @@ -4511,9 +4420,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); disk_bytenr += extent_offset; - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0, false); if (ret) goto out; @@ -4706,10 +4615,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, /* block start is already adjusted for the file extent offset. */ csum_root = btrfs_csum_root(trans->fs_info, em->block_start); - ret = btrfs_lookup_csums_range(csum_root, - em->block_start + csum_offset, - em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0); + ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0, false); if (ret) return ret; @@ -5221,10 +5129,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, * leafs from the log root. */ btrfs_release_path(path); - ret = btrfs_insert_file_extent(trans, root->log_root, - ino, prev_extent_end, 0, - 0, hole_len, 0, hole_len, - 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root->log_root, + ino, prev_extent_end, + hole_len); if (ret < 0) return ret; @@ -5253,10 +5160,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, btrfs_release_path(path); hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); - ret = btrfs_insert_file_extent(trans, root->log_root, - ino, prev_extent_end, 0, 0, - hole_len, 0, hole_len, - 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root->log_root, ino, + prev_extent_end, hole_len); if (ret < 0) return ret; } @@ -5332,6 +5237,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u32 this_len; unsigned long name_ptr; struct btrfs_dir_item *di; + struct fscrypt_str name_str; if (key->type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -5365,8 +5271,11 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } read_extent_buffer(eb, name, name_ptr, this_name_len); + + name_str.name = name; + name_str.len = this_name_len; di = btrfs_lookup_dir_item(NULL, inode->root, search_path, - parent, name, this_name_len, 0); + parent, &name_str, 0); if (di && !IS_ERR(di)) { struct btrfs_key di_key; @@ -5399,111 +5308,461 @@ out: return ret; } +/* + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). 
This is used instead of btrfs_inode_in_log() because the latter
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+			   const struct btrfs_inode *inode)
+{
+	/*
+	 * If a directory was not modified, no dentries added or removed, we can
+	 * and should avoid logging it.
+	 */
+	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+		return false;
+
+	/*
+	 * If this inode does not have new/updated/deleted xattrs since the last
+	 * time it was logged and is flagged as logged in the current transaction,
+	 * we can skip logging it. As for new/deleted names, those are updated in
+	 * the log by link/unlink/rename operations.
+	 * In case the inode was logged and then evicted and reloaded, its
+	 * logged_trans will be 0, in which case we have to fully log it since
+	 * logged_trans is a transient field, not persisted.
+	 */
+	if (inode->logged_trans == trans->transid &&
+	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+		return false;
+
+	return true;
+}
+
+struct btrfs_dir_list {
+	u64 ino;
+	struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ *           CPU0                                        CPU1
+ *           ----                                        ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ *                                            lock(sb_internal#2);
+ *                                            lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ *    that while logging the inode new references (names) are added or removed
+ *    from the inode, leaving the logged inode item with a link count that does
+ *    not match the number of logged inode reference items. This is fine because
+ *    at log replay time we compute the real number of links and correct the
+ *    link count in the inode item (see replay_one_buffer() and
+ *    link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ *    while logging the inode's items new index items (key type
+ *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ *    has a size that doesn't match the sum of the lengths of all the logged
+ *    names - this is ok, not a problem, because at log replay time we set the
+ *    directory's i_size to the correct value (see replay_one_name() and
+ *    overwrite_item()).
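+ *
+ * Note: despite being conceptually recursive, this is implemented iteratively:
+ * directories that still need their new dentries logged are queued on a local
+ * list of struct btrfs_dir_list entries and processed until the list is empty,
+ * which keeps kernel stack usage bounded.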
+ */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = start_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + u64 ino = btrfs_ino(start_inode); + int ret = 0; + + /* + * If we are logging a new name, as part of a link or rename operation, + * don't bother logging new dentries, as we just want to log the names + * of an inode and that any new parents exist. + */ + if (ctx->logging_new_name) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (true) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + bool continue_curr_inode = true; + int nritems; + int i; + + min_key.objectid = ino; + min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + goto next; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != ino || + min_key.type != BTRFS_DIR_INDEX_KEY) { + continue_curr_inode = false; + break; + } + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_ftype(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + btrfs_release_path(path); + di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto out; + } + + if (!need_log_inode(trans, BTRFS_I(di_inode))) { + btrfs_add_delayed_iput(BTRFS_I(di_inode)); + break; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), + log_mode, ctx); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (ret) + goto out; + if (ctx->log_new_dentries) { + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + ret = -ENOMEM; + goto out; + } + dir_elem->ino = di_key.objectid; + list_add_tail(&dir_elem->list, &dir_list); + } + break; + } + + if (continue_curr_inode && min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } + +next: + if (list_empty(&dir_list)) + break; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); + ino = dir_elem->ino; + list_del(&dir_elem->list); + kfree(dir_elem); + } +out: + btrfs_free_path(path); + if (ret) { + struct btrfs_dir_list *next; + + list_for_each_entry_safe(dir_elem, next, &dir_list, list) + kfree(dir_elem); + } + + return ret; +} + struct btrfs_ino_list { u64 ino; u64 parent; struct list_head list; }; -static int log_conflicting_inodes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - u64 ino, u64 parent) +static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ino_list *curr; + struct btrfs_ino_list *next; + + list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { + list_del(&curr->list); + kfree(curr); + } +} + +static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, + struct btrfs_path *path) +{ + 
struct btrfs_key key; + int ret; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (WARN_ON_ONCE(ret > 0)) { + /* + * We have previously found the inode through the commit root + * so this should not happen. If it does, just error out and + * fallback to a transaction commit. + */ + ret = -ENOENT; + } else if (ret == 0) { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) + ret = 1; + } + + btrfs_release_path(path); + path->search_commit_root = 0; + path->skip_locking = 0; + + return ret; +} + +static int add_conflicting_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 ino, u64 parent, + struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - LIST_HEAD(inode_list); - int ret = 0; + struct inode *inode; + + /* + * It's rare to have a lot of conflicting inodes, in practice it is not + * common to have more than 1 or 2. We don't want to collect too many, + * as we could end up logging too many inodes (even if only in + * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction + * commits. + */ + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) + return BTRFS_LOG_FORCE_COMMIT; + + inode = btrfs_iget(root->fs_info->sb, ino, root); + /* + * If the other inode that had a conflicting dir entry was deleted in + * the current transaction then we either: + * + * 1) Log the parent directory (later after adding it to the list) if + * the inode is a directory. This is because it may be a deleted + * subvolume/snapshot or it may be a regular directory that had + * deleted subvolumes/snapshots (or subdirectories that had them), + * and at the moment we can't deal with dropping subvolumes/snapshots + * during log replay. So we just log the parent, which will result in + * a fallback to a transaction commit if we are dealing with those + * cases (last_unlink_trans will match the current transaction); + * + * 2) Do nothing if it's not a directory. During log replay we simply + * unlink the conflicting dentry from the parent directory and then + * add the dentry for our inode. Like this we can avoid logging the + * parent directory (and maybe fallback to a transaction commit in + * case it has a last_unlink_trans == trans->transid, due to moving + * some inode from it to some other directory). + */ + if (IS_ERR(inode)) { + int ret = PTR_ERR(inode); + + if (ret != -ENOENT) + return ret; + + ret = conflicting_inode_is_dir(root, ino, path); + /* Not a directory or we got an error. */ + if (ret <= 0) + return ret; + + /* Conflicting inode is a directory, so we'll log its parent. */ + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) + return -ENOMEM; + ino_elem->ino = ino; + ino_elem->parent = parent; + list_add_tail(&ino_elem->list, &ctx->conflict_inodes); + ctx->num_conflict_inodes++; + + return 0; + } + + /* + * If the inode was already logged skip it - otherwise we can hit an + * infinite loop. 
Example: + * + * From the commit root (previous transaction) we have the following + * inodes: + * + * inode 257 a directory + * inode 258 with references "zz" and "zz_link" on inode 257 + * inode 259 with reference "a" on inode 257 + * + * And in the current (uncommitted) transaction we have: + * + * inode 257 a directory, unchanged + * inode 258 with references "a" and "a2" on inode 257 + * inode 259 with reference "zz_link" on inode 257 + * inode 261 with reference "zz" on inode 257 + * + * When logging inode 261 the following infinite loop could + * happen if we don't skip already logged inodes: + * + * - we detect inode 258 as a conflicting inode, with inode 261 + * on reference "zz", and log it; + * + * - we detect inode 259 as a conflicting inode, with inode 258 + * on reference "a", and log it; + * + * - we detect inode 258 as a conflicting inode, with inode 259 + * on reference "zz_link", and log it - again! After this we + * repeat the above steps forever. + * + * Here we can use need_log_inode() because we only need to log the + * inode in LOG_INODE_EXISTS mode and rename operations update the log, + * so that the log ends up with the new name and without the old name. + */ + if (!need_log_inode(trans, BTRFS_I(inode))) { + btrfs_add_delayed_iput(BTRFS_I(inode)); + return 0; + } + + btrfs_add_delayed_iput(BTRFS_I(inode)); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) return -ENOMEM; ino_elem->ino = ino; ino_elem->parent = parent; - list_add_tail(&ino_elem->list, &inode_list); + list_add_tail(&ino_elem->list, &ctx->conflict_inodes); + ctx->num_conflict_inodes++; - while (!list_empty(&inode_list)) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_key key; - struct inode *inode; + return 0; +} - ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, - list); - ino = ino_elem->ino; - parent = ino_elem->parent; - list_del(&ino_elem->list); - kfree(ino_elem); - if (ret) - continue; +static int log_conflicting_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; - btrfs_release_path(path); + /* + * Conflicting inodes are logged by the first call to btrfs_log_inode(), + * otherwise we could have unbounded recursion of btrfs_log_inode() + * calls. This check guarantees we can have only 1 level of recursion. + */ + if (ctx->logging_conflict_inodes) + return 0; + + ctx->logging_conflict_inodes = true; + + /* + * New conflicting inodes may be found and added to the list while we + * are logging a conflicting inode, so keep iterating while the list is + * not empty. + */ + while (!list_empty(&ctx->conflict_inodes)) { + struct btrfs_ino_list *curr; + struct inode *inode; + u64 ino; + u64 parent; + + curr = list_first_entry(&ctx->conflict_inodes, + struct btrfs_ino_list, list); + ino = curr->ino; + parent = curr->parent; + list_del(&curr->list); + kfree(curr); inode = btrfs_iget(fs_info->sb, ino, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent - * directory. + * directory. See the comment at add_conflicting_inode(). 
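[Editorial aside: add_conflicting_inode() and log_conflicting_inodes() above act together as a bounded work queue. Conflicts found while copying inode items are only recorded, up to MAX_CONFLICT_INODES, and one drain loop logs them afterwards, with a flag stopping the drain from re-entering itself. A condensed sketch of that shape follows; the list itself is left as a comment and all names are hypothetical.]

#include <stdbool.h>

struct log_ctx {
	bool logging_conflict_inodes;
	int num_conflict_inodes;
	/* a list head of queued { ino, parent } entries would live here */
};

static int queue_conflict(struct log_ctx *ctx, unsigned long long ino)
{
	if (ctx->num_conflict_inodes >= 10)	/* MAX_CONFLICT_INODES-style cap */
		return -1;			/* caller falls back to a transaction commit */
	ctx->num_conflict_inodes++;
	/* append { ino, parent } to the context's list */
	return 0;
}

static void drain_conflicts(struct log_ctx *ctx)
{
	if (ctx->logging_conflict_inodes)
		return;	/* nested call: the outer drain will pick new entries up */

	ctx->logging_conflict_inodes = true;
	/* while the list is not empty: pop one entry and log it. Logging may
	 * call queue_conflict() again, which is why the real code loops on
	 * list emptiness rather than taking a snapshot of the list. */
	ctx->logging_conflict_inodes = false;
}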
*/ if (IS_ERR(inode)) { ret = PTR_ERR(inode); - if (ret == -ENOENT) { - inode = btrfs_iget(fs_info->sb, parent, root); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - } else { - ret = btrfs_log_inode(trans, - BTRFS_I(inode), - LOG_OTHER_INODE_ALL, - ctx); - btrfs_add_delayed_iput(inode); - } + if (ret != -ENOENT) + break; + + inode = btrfs_iget(fs_info->sb, parent, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + break; } + + /* + * Always log the directory, we cannot make this + * conditional on need_log_inode() because the directory + * might have been logged in LOG_INODE_EXISTS mode or + * the dir index of the conflicting inode is not in a + * dir index key range logged for the directory. So we + * must make sure the deletion is recorded. + */ + ret = btrfs_log_inode(trans, BTRFS_I(inode), + LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(BTRFS_I(inode)); + if (ret) + break; continue; } + /* - * If the inode was already logged skip it - otherwise we can - * hit an infinite loop. Example: - * - * From the commit root (previous transaction) we have the - * following inodes: - * - * inode 257 a directory - * inode 258 with references "zz" and "zz_link" on inode 257 - * inode 259 with reference "a" on inode 257 - * - * And in the current (uncommitted) transaction we have: - * - * inode 257 a directory, unchanged - * inode 258 with references "a" and "a2" on inode 257 - * inode 259 with reference "zz_link" on inode 257 - * inode 261 with reference "zz" on inode 257 - * - * When logging inode 261 the following infinite loop could - * happen if we don't skip already logged inodes: - * - * - we detect inode 258 as a conflicting inode, with inode 261 - * on reference "zz", and log it; + * Here we can use need_log_inode() because we only need to log + * the inode in LOG_INODE_EXISTS mode and rename operations + * update the log, so that the log ends up with the new name and + * without the old name. * - * - we detect inode 259 as a conflicting inode, with inode 258 - * on reference "a", and log it; - * - * - we detect inode 258 as a conflicting inode, with inode 259 - * on reference "zz_link", and log it - again! After this we - * repeat the above steps forever. - */ - spin_lock(&BTRFS_I(inode)->lock); - /* - * Check the inode's logged_trans only instead of - * btrfs_inode_in_log(). This is because the last_log_commit of - * the inode is not updated when we only log that it exists (see - * btrfs_log_inode()). + * We did this check at add_conflicting_inode(), but here we do + * it again because if some other task logged the inode after + * that, we can avoid doing it again. */ - if (BTRFS_I(inode)->logged_trans == trans->transid) { - spin_unlock(&BTRFS_I(inode)->lock); - btrfs_add_delayed_iput(inode); + if (!need_log_inode(trans, BTRFS_I(inode))) { + btrfs_add_delayed_iput(BTRFS_I(inode)); continue; } - spin_unlock(&BTRFS_I(inode)->lock); + /* * We are safe logging the other inode without acquiring its * lock as long as we log with the LOG_INODE_EXISTS mode. We @@ -5511,67 +5770,16 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. 
*/ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); - if (ret) { - btrfs_add_delayed_iput(inode); - continue; - } - - key.objectid = ino; - key.type = BTRFS_INODE_REF_KEY; - key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - btrfs_add_delayed_iput(inode); - continue; - } - - while (true) { - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - u64 other_ino = 0; - u64 other_parent = 0; - - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - break; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != ino || - (key.type != BTRFS_INODE_REF_KEY && - key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = 0; - break; - } - - ret = btrfs_check_ref_name_override(leaf, slot, &key, - BTRFS_I(inode), &other_ino, - &other_parent); - if (ret < 0) - break; - if (ret > 0) { - ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); - if (!ino_elem) { - ret = -ENOMEM; - break; - } - ino_elem->ino = other_ino; - ino_elem->parent = other_parent; - list_add_tail(&ino_elem->list, &inode_list); - ret = 0; - } - path->slots[0]++; - } - btrfs_add_delayed_iput(inode); + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(BTRFS_I(inode)); + if (ret) + break; } + ctx->logging_conflict_inodes = false; + if (ret) + free_conflicting_inodes(ctx); + return ret; } @@ -5582,7 +5790,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_path *dst_path, const u64 logged_isize, - const bool recursive_logging, const int inode_only, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) @@ -5621,8 +5828,8 @@ again: break; } else if ((min_key->type == BTRFS_INODE_REF_KEY || min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + (inode->generation == trans->transid || + ctx->logging_conflict_inodes)) { u64 other_ino = 0; u64 other_parent = 0; @@ -5646,11 +5853,12 @@ again: return ret; ins_nr = 0; - ret = log_conflicting_inodes(trans, root, path, - ctx, other_ino, other_parent); + btrfs_release_path(path); + ret = add_conflicting_inode(trans, root, path, + other_ino, + other_parent, ctx); if (ret) return ret; - btrfs_release_path(path); goto next_key; } } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { @@ -5733,6 +5941,371 @@ next_key: return ret; } +static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + const struct btrfs_item_batch *batch, + const struct btrfs_delayed_item *first_item) +{ + const struct btrfs_delayed_item *curr = first_item; + int ret; + + ret = btrfs_insert_empty_items(trans, log, path, batch); + if (ret) + return ret; + + for (int i = 0; i < batch->nr; i++) { + char *data_ptr; + + data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); + write_extent_buffer(path->nodes[0], &curr->data, + (unsigned long)data_ptr, curr->data_len); + curr = list_next_entry(curr, log_list); + path->slots[0]++; + } + + btrfs_release_path(path); + + return 0; +} + +static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_ins_list, + struct btrfs_log_ctx *ctx) +{ + /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. 
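[Editorial aside: the 195 above is worth unpacking. Each batched item costs one key plus one u32 size entry in the temporary ins_data buffer, and a packed btrfs key is 17 bytes (u64 objectid, u8 type, u64 offset), so 195 * (17 + 4) = 4095 bytes, just under one 4K page. A compile-time check of that arithmetic, assuming the packed key layout noted above:]

#include <assert.h>
#include <stdint.h>

#define BATCH_KEY_SIZE		17		/* packed key: 8 + 1 + 8 bytes */
#define BATCH_SIZE_ENTRY	sizeof(uint32_t)

static_assert(195 * (BATCH_KEY_SIZE + BATCH_SIZE_ENTRY) == 4095,
	      "195 keys plus sizes fit in a single 4K page");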
*/ + const int max_batch_size = 195; + const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); + const u64 ino = btrfs_ino(inode); + struct btrfs_root *log = inode->root->log_root; + struct btrfs_item_batch batch = { + .nr = 0, + .total_data_size = 0, + }; + const struct btrfs_delayed_item *first = NULL; + const struct btrfs_delayed_item *curr; + char *ins_data; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + u64 curr_batch_size = 0; + int batch_idx = 0; + int ret; + + /* We are adding dir index items to the log tree. */ + lockdep_assert_held(&inode->log_mutex); + + /* + * We collect delayed items before copying index keys from the subvolume + * to the log tree. However just after we collected them, they may have + * been flushed (all of them or just some of them), and therefore we + * could have copied them from the subvolume tree to the log tree. + * So find the first delayed item that was not yet logged (they are + * sorted by index number). + */ + list_for_each_entry(curr, delayed_ins_list, log_list) { + if (curr->index > inode->last_dir_index_offset) { + first = curr; + break; + } + } + + /* Empty list or all delayed items were already logged. */ + if (!first) + return 0; + + ins_data = kmalloc(max_batch_size * sizeof(u32) + + max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + ins_sizes = (u32 *)ins_data; + batch.data_sizes = ins_sizes; + ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); + batch.keys = ins_keys; + + curr = first; + while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { + const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); + + if (curr_batch_size + curr_size > leaf_data_size || + batch.nr == max_batch_size) { + ret = insert_delayed_items_batch(trans, log, path, + &batch, first); + if (ret) + goto out; + batch_idx = 0; + batch.nr = 0; + batch.total_data_size = 0; + curr_batch_size = 0; + first = curr; + } + + ins_sizes[batch_idx] = curr->data_len; + ins_keys[batch_idx].objectid = ino; + ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; + ins_keys[batch_idx].offset = curr->index; + curr_batch_size += curr_size; + batch.total_data_size += curr->data_len; + batch.nr++; + batch_idx++; + curr = list_next_entry(curr, log_list); + } + + ASSERT(batch.nr >= 1); + ret = insert_delayed_items_batch(trans, log, path, &batch, first); + + curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, + log_list); + inode->last_dir_index_offset = curr->index; +out: + kfree(ins_data); + + return ret; +} + +static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + const u64 ino = btrfs_ino(inode); + const struct btrfs_delayed_item *curr; + + curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, + log_list); + + while (!list_entry_is_head(curr, delayed_del_list, log_list)) { + u64 first_dir_index = curr->index; + u64 last_dir_index; + const struct btrfs_delayed_item *next; + int ret; + + /* + * Find a range of consecutive dir index items to delete. Like + * this we log a single dir range item spanning several contiguous + * dir items instead of logging one range item per dir index item. 
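[Editorial aside: the run-coalescing described in the comment above is plain run-length grouping over sorted index numbers. A runnable userspace model, with made-up index values:]

#include <stdio.h>

int main(void)
{
	unsigned long long idx[] = { 10, 11, 12, 40, 41, 77 };
	int n = sizeof(idx) / sizeof(idx[0]);

	for (int i = 0; i < n; ) {
		unsigned long long first = idx[i];

		/* extend the run while indexes stay consecutive */
		while (i + 1 < n && idx[i + 1] == idx[i] + 1)
			i++;
		printf("range item [%llu, %llu]\n", first, idx[i]);
		i++;
	}
	return 0;	/* prints [10,12], [40,41], [77,77] */
}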
+ */ + next = list_next_entry(curr, log_list); + while (!list_entry_is_head(next, delayed_del_list, log_list)) { + if (next->index != curr->index + 1) + break; + curr = next; + next = list_next_entry(next, log_list); + } + + last_dir_index = curr->index; + ASSERT(last_dir_index >= first_dir_index); + + ret = insert_dir_log_key(trans, inode->root->log_root, path, + ino, first_dir_index, last_dir_index); + if (ret) + return ret; + curr = list_next_entry(curr, log_list); + } + + return 0; +} + +static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx, + const struct list_head *delayed_del_list, + const struct btrfs_delayed_item *first, + const struct btrfs_delayed_item **last_ret) +{ + const struct btrfs_delayed_item *next; + struct extent_buffer *leaf = path->nodes[0]; + const int last_slot = btrfs_header_nritems(leaf) - 1; + int slot = path->slots[0] + 1; + const u64 ino = btrfs_ino(inode); + + next = list_next_entry(first, log_list); + + while (slot < last_slot && + !list_entry_is_head(next, delayed_del_list, log_list)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + key.type != BTRFS_DIR_INDEX_KEY || + key.offset != next->index) + break; + + slot++; + *last_ret = next; + next = list_next_entry(next, log_list); + } + + return btrfs_del_items(trans, inode->root->log_root, path, + path->slots[0], slot - path->slots[0]); +} + +static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = inode->root->log_root; + const struct btrfs_delayed_item *curr; + u64 last_range_start; + u64 last_range_end = 0; + struct btrfs_key key; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, + log_list); + + while (!list_entry_is_head(curr, delayed_del_list, log_list)) { + const struct btrfs_delayed_item *last = curr; + u64 first_dir_index = curr->index; + u64 last_dir_index; + bool deleted_items = false; + int ret; + + key.offset = curr->index; + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + if (ret < 0) { + return ret; + } else if (ret == 0) { + ret = batch_delete_dir_index_items(trans, inode, path, ctx, + delayed_del_list, curr, + &last); + if (ret) + return ret; + deleted_items = true; + } + + btrfs_release_path(path); + + /* + * If we deleted items from the leaf, it means we have a range + * item logging their range, so no need to add one or update an + * existing one. Otherwise we have to log a dir range item. + */ + if (deleted_items) + goto next_batch; + + last_dir_index = last->index; + ASSERT(last_dir_index >= first_dir_index); + /* + * If this range starts right after where the previous one ends, + * then we want to reuse the previous range item and change its + * end offset to the end of this range. This is just to minimize + * leaf space usage, by avoiding adding a new range item. 
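[Editorial aside: the reuse rule described above is simple interval bookkeeping: remember the last logged range, and when a new batch of deletions begins exactly one index past its end, widen the previous range instead of emitting a second item. In the real code this happens by re-inserting the range key at the previous start offset; the sketch below, with hypothetical names, models only that bookkeeping.]

#include <stdbool.h>

struct logged_range {
	unsigned long long start;
	unsigned long long end;
	bool valid;
};

/* Returns the start offset the caller should log the range item at. */
static unsigned long long merge_or_start(struct logged_range *prev,
					 unsigned long long first,
					 unsigned long long last)
{
	if (prev->valid && first == prev->end + 1)
		first = prev->start;	/* extend the previous range item */

	prev->start = first;
	prev->end = last;
	prev->valid = true;
	return first;
}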
+ */ + if (last_range_end != 0 && first_dir_index == last_range_end + 1) + first_dir_index = last_range_start; + + ret = insert_dir_log_key(trans, log, path, key.objectid, + first_dir_index, last_dir_index); + if (ret) + return ret; + + last_range_start = first_dir_index; + last_range_end = last_dir_index; +next_batch: + curr = list_next_entry(last, log_list); + } + + return 0; +} + +static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + /* + * We are deleting dir index items from the log tree or adding range + * items to it. + */ + lockdep_assert_held(&inode->log_mutex); + + if (list_empty(delayed_del_list)) + return 0; + + if (ctx->logged_before) + return log_delayed_deletions_incremental(trans, inode, path, + delayed_del_list, ctx); + + return log_delayed_deletions_full(trans, inode, path, delayed_del_list, + ctx); +} + +/* + * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed + * items instead of the subvolume tree. + */ +static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + const struct list_head *delayed_ins_list, + struct btrfs_log_ctx *ctx) +{ + const bool orig_log_new_dentries = ctx->log_new_dentries; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_item *item; + int ret = 0; + + /* + * No need for the log mutex, plus to avoid potential deadlocks or + * lockdep annotations due to nesting of delayed inode mutexes and log + * mutexes. + */ + lockdep_assert_not_held(&inode->log_mutex); + + ASSERT(!ctx->logging_new_delayed_dentries); + ctx->logging_new_delayed_dentries = true; + + list_for_each_entry(item, delayed_ins_list, log_list) { + struct btrfs_dir_item *dir_item; + struct inode *di_inode; + struct btrfs_key key; + int log_mode = LOG_INODE_EXISTS; + + dir_item = (struct btrfs_dir_item *)item->data; + btrfs_disk_key_to_cpu(&key, &dir_item->location); + + if (key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + break; + } + + if (!need_log_inode(trans, BTRFS_I(di_inode))) { + btrfs_add_delayed_iput(BTRFS_I(di_inode)); + continue; + } + + if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + + ctx->log_new_dentries = false; + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + + if (!ret && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + + btrfs_add_delayed_iput(BTRFS_I(di_inode)); + + if (ret) + break; + } + + ctx->log_new_dentries = orig_log_new_dentries; + ctx->logging_new_delayed_dentries = false; + + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. 
@@ -5764,9 +6337,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 logged_isize = 0; bool need_log_inode_item = true; bool xattrs_logged = false; - bool recursive_logging = false; bool inode_item_dropped = true; - const bool orig_logged_before = ctx->logged_before; + bool full_dir_logging = false; + LIST_HEAD(delayed_ins_list); + LIST_HEAD(delayed_del_list); path = btrfs_alloc_path(); if (!path) @@ -5794,27 +6368,46 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) + full_dir_logging = true; + /* - * Only run delayed items if we are a directory. We want to make sure - * all directory indexes hit the fs/subvolume tree so we can find them - * and figure out which index ranges have to be logged. + * If we are logging a directory while we are logging dentries of the + * delayed items of some other inode, then we need to flush the delayed + * items of this directory and not log the delayed items directly. This + * is to prevent more than one level of recursion into btrfs_log_inode() + * by having something like this: + * + * $ mkdir -p a/b/c/d/e/f/g/h/... + * $ xfs_io -c "fsync" a + * + * Where all directories in the path did not exist before and are + * created in the current transaction. + * So in such a case we directly log the delayed items of the main + * directory ("a") without flushing them first, while for each of its + * subdirectories we flush their delayed items before logging them. + * This prevents a potential unbounded recursion like this: + * + * btrfs_log_inode() + * log_new_delayed_dentries() + * btrfs_log_inode() + * log_new_delayed_dentries() + * btrfs_log_inode() + * log_new_delayed_dentries() + * (...) + * + * We have thresholds for the maximum number of delayed items to have in + * memory, and once they are hit, the items are flushed asynchronously. + * However the limit is quite high, so let's prevent deep levels of + * recursion by limiting the maximum depth to 1. */ - if (S_ISDIR(inode->vfs_inode.i_mode)) { + if (full_dir_logging && ctx->logging_new_delayed_dentries) { ret = btrfs_commit_inode_delayed_items(trans, inode); if (ret) goto out; } - if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { - recursive_logging = true; - if (inode_only == LOG_OTHER_INODE) - inode_only = LOG_INODE_EXISTS; - else - inode_only = LOG_INODE_ALL; - mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); - } else { - mutex_lock(&inode->log_mutex); - } + mutex_lock(&inode->log_mutex); /* * For symlinks, we must always log their content, which is stored in an @@ -5846,9 +6439,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * to know the file was moved from A to B, so logging just A would * result in losing the file after a log replay. */ - if (S_ISDIR(inode->vfs_inode.i_mode) && - inode_only == LOG_INODE_ALL && - inode->last_unlink_trans >= trans->transid) { + if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; @@ -5859,14 +6450,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * copies of everything. 
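[Editorial aside: the depth-1 rule in the comment above (log delayed items directly only at the top level, flush them for any directory reached through delayed dentries) can be modeled as a flag on the shared context. A runnable toy version, illustrative names only:]

#include <stdbool.h>
#include <stdio.h>

struct ctx { bool logging_new_delayed_dentries; };

static void log_dir(struct ctx *ctx, int depth)
{
	if (ctx->logging_new_delayed_dentries) {
		/* nested: flush delayed items, log from the subvolume tree,
		 * and do not walk delayed dentries any deeper */
		printf("depth %d: flush delayed items first\n", depth);
		return;
	}

	printf("depth %d: log delayed items directly\n", depth);
	ctx->logging_new_delayed_dentries = true;
	log_dir(ctx, depth + 1);	/* children found via delayed dentries */
	ctx->logging_new_delayed_dentries = false;
}

int main(void)
{
	struct ctx ctx = { .logging_new_delayed_dentries = false };

	log_dir(&ctx, 0);	/* never prints more than two levels */
	return 0;
}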
*/ if (S_ISDIR(inode->vfs_inode.i_mode)) { - int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - if (inode_only == LOG_INODE_EXISTS) - max_key_type = BTRFS_XATTR_ITEM_KEY; if (ctx->logged_before) ret = drop_inode_items(trans, log, path, inode, - max_key_type); + BTRFS_XATTR_ITEM_KEY); } else { if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { /* @@ -5922,9 +6509,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (ret) goto out_unlock; + /* + * If we are logging a directory in full mode, collect the delayed items + * before iterating the subvolume tree, so that we don't miss any new + * dir index items in case they get flushed while or right after we are + * iterating the subvolume tree. + */ + if (full_dir_logging && !ctx->logging_new_delayed_dentries) + btrfs_log_get_delayed_items(inode, &delayed_ins_list, + &delayed_del_list); + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, - recursive_logging, inode_only, ctx, + inode_only, ctx, &need_log_inode_item); if (ret) goto out_unlock; @@ -5977,10 +6574,18 @@ log_extents: write_unlock(&em_tree->lock); } - if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { + if (full_dir_logging) { ret = log_directory_changes(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; + ret = log_delayed_insertion_items(trans, inode, path, + &delayed_ins_list, ctx); + if (ret) + goto out_unlock; + ret = log_delayed_deletion_items(trans, inode, path, + &delayed_del_list, ctx); + if (ret) + goto out_unlock; } spin_lock(&inode->lock); @@ -6033,208 +6638,20 @@ out: btrfs_free_path(path); btrfs_free_path(dst_path); - if (recursive_logging) - ctx->logged_before = orig_logged_before; - - return ret; -} - -/* - * Check if we need to log an inode. This is used in contexts where while - * logging an inode we need to log another inode (either that it exists or in - * full mode). This is used instead of btrfs_inode_in_log() because the later - * requires the inode to be in the log and have the log transaction committed, - * while here we do not care if the log transaction was already committed - our - * caller will commit the log later - and we want to avoid logging an inode - * multiple times when multiple tasks have joined the same log transaction. - */ -static bool need_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) -{ - /* - * If a directory was not modified, no dentries added or removed, we can - * and should avoid logging it. - */ - if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) - return false; - - /* - * If this inode does not have new/updated/deleted xattrs since the last - * time it was logged and is flagged as logged in the current transaction, - * we can skip logging it. As for new/deleted names, those are updated in - * the log by link/unlink/rename operations. - * In case the inode was logged and then evicted and reloaded, its - * logged_trans will be 0, in which case we have to fully log it since - * logged_trans is a transient field, not persisted. - */ - if (inode->logged_trans == trans->transid && - !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) - return false; - - return true; -} - -struct btrfs_dir_list { - u64 ino; - struct list_head list; -}; - -/* - * Log the inodes of the new dentries of a directory. See log_dir_items() for - * details about the why it is needed. 
- * This is a recursive operation - if an existing dentry corresponds to a - * directory, that directory's new entries are logged too (same behaviour as - * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes - * the dentries point to we do not lock their i_mutex, otherwise lockdep - * complains about the following circular lock dependency / possible deadlock: - * - * CPU0 CPU1 - * ---- ---- - * lock(&type->i_mutex_dir_key#3/2); - * lock(sb_internal#2); - * lock(&type->i_mutex_dir_key#3/2); - * lock(&sb->s_type->i_mutex_key#14); - * - * Where sb_internal is the lock (a counter that works as a lock) acquired by - * sb_start_intwrite() in btrfs_start_transaction(). - * Not locking i_mutex of the inodes is still safe because: - * - * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible - * that while logging the inode new references (names) are added or removed - * from the inode, leaving the logged inode item with a link count that does - * not match the number of logged inode reference items. This is fine because - * at log replay time we compute the real number of links and correct the - * link count in the inode item (see replay_one_buffer() and - * link_to_fixup_dir()); - * - * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that - * while logging the inode's items new index items (key type - * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item - * has a size that doesn't match the sum of the lengths of all the logged - * names - this is ok, not a problem, because at log replay time we set the - * directory's i_size to the correct value (see replay_one_name() and - * do_overwrite_item()). - */ -static int log_new_dir_dentries(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *start_inode, - struct btrfs_log_ctx *ctx) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - LIST_HEAD(dir_list); - struct btrfs_dir_list *dir_elem; - int ret = 0; - - /* - * If we are logging a new name, as part of a link or rename operation, - * don't bother logging new dentries, as we just want to log the names - * of an inode and that any new parents exist. 
- */ - if (ctx->logging_new_name) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); - if (!dir_elem) { - btrfs_free_path(path); - return -ENOMEM; - } - dir_elem->ino = btrfs_ino(start_inode); - list_add_tail(&dir_elem->list, &dir_list); - - while (!list_empty(&dir_list)) { - struct extent_buffer *leaf; - struct btrfs_key min_key; - int nritems; - int i; - - dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, - list); - if (ret) - goto next_dir_inode; - - min_key.objectid = dir_elem->ino; - min_key.type = BTRFS_DIR_INDEX_KEY; - min_key.offset = 0; -again: - btrfs_release_path(path); - ret = btrfs_search_forward(root, &min_key, path, trans->transid); - if (ret < 0) { - goto next_dir_inode; - } else if (ret > 0) { - ret = 0; - goto next_dir_inode; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - for (i = path->slots[0]; i < nritems; i++) { - struct btrfs_dir_item *di; - struct btrfs_key di_key; - struct inode *di_inode; - struct btrfs_dir_list *new_dir_elem; - int log_mode = LOG_INODE_EXISTS; - int type; - - btrfs_item_key_to_cpu(leaf, &min_key, i); - if (min_key.objectid != dir_elem->ino || - min_key.type != BTRFS_DIR_INDEX_KEY) - goto next_dir_inode; - - di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); - type = btrfs_dir_type(leaf, di); - if (btrfs_dir_transid(leaf, di) < trans->transid) - continue; - btrfs_dir_item_key_to_cpu(leaf, di, &di_key); - if (di_key.type == BTRFS_ROOT_ITEM_KEY) - continue; - - btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); - if (IS_ERR(di_inode)) { - ret = PTR_ERR(di_inode); - goto next_dir_inode; - } + if (ret) + free_conflicting_inodes(ctx); + else + ret = log_conflicting_inodes(trans, inode->root, ctx); - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); - break; - } + if (full_dir_logging && !ctx->logging_new_delayed_dentries) { + if (!ret) + ret = log_new_delayed_dentries(trans, inode, + &delayed_ins_list, ctx); - ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR) - log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(di_inode); - if (ret) - goto next_dir_inode; - if (ctx->log_new_dentries) { - new_dir_elem = kmalloc(sizeof(*new_dir_elem), - GFP_NOFS); - if (!new_dir_elem) { - ret = -ENOMEM; - goto next_dir_inode; - } - new_dir_elem->ino = di_key.objectid; - list_add_tail(&new_dir_elem->list, &dir_list); - } - break; - } - if (min_key.offset < (u64)-1) { - min_key.offset++; - goto again; - } -next_dir_inode: - list_del(&dir_elem->list); - kfree(dir_elem); + btrfs_log_put_delayed_items(inode, &delayed_ins_list, + &delayed_del_list); } - btrfs_free_path(path); return ret; } @@ -6338,7 +6755,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); continue; } @@ -6346,9 +6763,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, root, + ret = log_new_dir_dentries(trans, BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); if (ret) goto out; } @@ -6393,7 +6810,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, 
need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) return ret; @@ -6661,7 +7078,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; if (log_dentries) - ret = log_new_dir_dentries(trans, root, inode, ctx); + ret = log_new_dir_dentries(trans, inode, ctx); else ret = 0; end_trans: @@ -6955,7 +7372,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/** +/* * Update the log after adding a new name for an inode. * * @trans: Transaction handle. @@ -7022,9 +7439,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, if (old_dir && old_dir->logged_trans == trans->transid) { struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; + struct fscrypt_name fname; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + ret = fscrypt_setup_filename(&old_dir->vfs_inode, + &old_dentry->d_name, 0, &fname); + if (ret) + goto out; /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7044,6 +7466,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + fscrypt_free_filename(&fname); goto out; } @@ -7059,8 +7482,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - old_dentry->d_name.name, - old_dentry->d_name.len, old_dir_index); + &fname.disk_name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its @@ -7074,6 +7496,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, mutex_unlock(&old_dir->log_mutex); btrfs_free_path(path); + fscrypt_free_filename(&fname); if (ret < 0) goto out; } @@ -7088,6 +7511,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + ASSERT(list_empty(&ctx.conflict_inodes)); out: /* * If an error happened mark the log for a full commit because it's not diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 57ab5f3b8dc7..85b43075ac58 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,6 +6,7 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include "messages.h" #include "ctree.h" #include "transaction.h" @@ -20,6 +21,7 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; /* Tracks the last logged dir item/index key offset. */ @@ -28,6 +30,9 @@ struct btrfs_log_ctx { struct list_head list; /* Only used for fast fsyncs. 
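[Editorial aside: the btrfs_log_new_name() change above is a common fscrypt conversion: build a struct fscrypt_name once from the dentry, pass its disk_name down, and free it on every exit path, including allocation failures. A condensed kernel-style sketch of that pairing, not the literal btrfs code; del_logged_dentry() stands in for the real call chain and the surrounding locking is omitted:]

	struct fscrypt_name fname;
	int ret;

	ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
	if (ret)
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		fscrypt_free_filename(&fname);	/* free before bailing out */
		goto out;
	}

	ret = del_logged_dentry(trans, log, path, dir_ino,
				&fname.disk_name, dir_index);

	btrfs_free_path(path);
	fscrypt_free_filename(&fname);
out:
	return ret;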
*/ struct list_head ordered_extents; + struct list_head conflict_inodes; + int num_conflict_inodes; + bool logging_conflict_inodes; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -37,10 +42,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_transid = 0; ctx->log_new_dentries = false; ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; ctx->logged_before = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; } static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) @@ -78,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 8a3a14686d3e..a555baa0143a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -1,7 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "tree-mod-log.h" #include "disk-io.h" +#include "fs.h" +#include "accessors.h" +#include "tree-checker.h" struct tree_mod_root { u64 logical; @@ -197,12 +201,11 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, - gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) return NULL; @@ -220,7 +223,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, } int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; int ret; @@ -228,7 +231,7 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, if (!tree_mod_need_log(eb->fs_info, eb)) return 0; - tm = alloc_tree_mod_elem(eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op); if (!tm) return -ENOMEM; @@ -276,7 +279,7 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -364,7 +367,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -502,14 +505,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); if 
(!tm_list_rem[i]) { ret = -ENOMEM; goto free_tms; } tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); if (!tm_list_add[i]) { ret = -ENOMEM; goto free_tms; @@ -564,7 +567,7 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -694,8 +697,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, n--; break; case BTRFS_MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); + o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); break; @@ -819,10 +822,15 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) tm = tree_mod_log_search(fs_info, logical, time_seq); if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + struct btrfs_tree_parent_check check = { 0 }; + btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, root->root_key.objectid, - 0, level, NULL); + + check.level = level; + check.owner_root = root->root_key.objectid; + + old = read_tree_block(fs_info, logical, &check); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 12605d19621b..94f10afeee97 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H @@ -32,7 +32,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags); + enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 3374c9e9be67..33606025513d 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,6 +5,7 @@ */ #include <linux/slab.h> +#include "messages.h" #include "ulist.h" #include "ctree.h" @@ -37,8 +38,9 @@ * loop would be similar to the above. */ -/** - * ulist_init - freshly initialize a ulist +/* + * Freshly initialize a ulist. + * * @ulist: the ulist to initialize * * Note: don't use this function to init an already used ulist, use @@ -51,8 +53,9 @@ void ulist_init(struct ulist *ulist) ulist->nnodes = 0; } -/** - * ulist_release - free up additionally allocated memory for the ulist +/* + * Free up additionally allocated memory for the ulist. + * * @ulist: the ulist from which to free the additional memory * * This is useful in cases where the base 'struct ulist' has been statically @@ -70,8 +73,9 @@ void ulist_release(struct ulist *ulist) INIT_LIST_HEAD(&ulist->nodes); } -/** - * ulist_reinit - prepare a ulist for reuse +/* + * Prepare a ulist for reuse. 
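[Editorial aside: the read_tree_block() call in btrfs_get_old_root() above now takes a single check structure instead of a list of loose validation arguments, zero-initialized and filled only with what the caller knows. A self-contained model of that refactor; the struct and function names are illustrative, not the btrfs definitions:]

#include <stdbool.h>
#include <stdint.h>

struct parent_check {
	uint64_t owner_root;
	uint64_t transid;
	uint8_t level;
	bool has_first_key;
};

static int read_block(uint64_t logical, const struct parent_check *check)
{
	/* read the block, then verify level/owner/transid against *check,
	 * skipping any field the caller left at zero */
	(void)logical;
	(void)check;
	return 0;
}

static int get_old_block(uint64_t logical, uint8_t level, uint64_t root_id)
{
	struct parent_check check = { 0 };

	check.level = level;		/* only the known fields are set */
	check.owner_root = root_id;
	return read_block(logical, &check);
}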
+ * + * @ulist: ulist to be reused * * Free up all additional memory allocated for the list elements and reinit @@ -83,8 +87,9 @@ void ulist_reinit(struct ulist *ulist) ulist_init(ulist); } -/** - * ulist_alloc - dynamically allocate a ulist +/* + * Dynamically allocate a ulist. + * * @gfp_mask: allocation flags for base allocation * * The allocated ulist will be returned in an initialized state. @@ -101,8 +106,9 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -/** - * ulist_free - free dynamically allocated ulist +/* + * Free dynamically allocated ulist. + * * @ulist: ulist to free * * It is not necessary to call ulist_release before. @@ -163,8 +169,9 @@ static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) return 0; } -/** - * ulist_add - add an element to the ulist +/* + * Add an element to the ulist. + * * @ulist: ulist to add the element to * @val: value to add to ulist * @aux: auxiliary value to store along with val @@ -242,8 +249,9 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) return 0; } -/** - * ulist_next - iterate ulist +/* + * Iterate ulist. + * 
Implementation of the interface defined in struct fsverity_operations. @@ -659,8 +665,7 @@ rollback: * * Returns the size on success or a negative error code on failure. */ -static int btrfs_get_verity_descriptor(struct inode *inode, void *buf, - size_t buf_size) +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) { u64 true_size; int ret = 0; diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h new file mode 100644 index 000000000000..91c10f7d0a46 --- /dev/null +++ b/fs/btrfs/verity.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_VERITY_H +#define BTRFS_VERITY_H + +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; + +int btrfs_drop_verity_items(struct btrfs_inode *inode); +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + return 0; +} + +static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + return -EPERM; +} + +#endif + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f63ff91e2883..aa25fa335d3e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5,12 +5,9 @@ #include <linux/sched.h> #include <linux/sched/mm.h> -#include <linux/bio.h> #include <linux/slab.h> -#include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/kthread.h> -#include <linux/raid/pq.h> #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> @@ -23,8 +20,6 @@ #include "print-tree.h" #include "volumes.h" #include "raid56.h" -#include "async-thread.h" -#include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" @@ -33,6 +28,13 @@ #include "block-group.h" #include "discard.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "relocation.h" +#include "scrub.h" +#include "super.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ @@ -246,11 +248,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - int mirror_num, int need_raid_map); /* * Device locking @@ -527,14 +524,14 @@ error: return ret; } -/** - * Search and remove all stale devices (which are not mounted). - * When both inputs are NULL, it will search and release all stale devices. +/* + * Search and remove all stale devices (which are not mounted). When both + * inputs are NULL, it will search and release all stale devices. * - * @devt: Optional. When provided will it release all unmounted devices - * matching this devt only. + * @devt: Optional. When provided will it release all unmounted devices + * matching this devt only. * @skip_device: Optional. Will skip this device when searching for the stale - * devices. + * devices. * * Return: 0 for success or if @devt is 0. * -EBUSY if @devt is a mounted device. 
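[Editorial aside: the new verity.h above follows the standard kernel pattern for optional features: real declarations when the config option is set, static inline stubs otherwise, so call sites compile unchanged either way. The same shape with a hypothetical feature and function name:]

#ifdef CONFIG_MY_FEATURE

int my_feature_enable(struct inode *inode);

#else

static inline int my_feature_enable(struct inode *inode)
{
	return -EPERM;	/* feature compiled out; mirror the real error code */
}

#endif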
@@ -639,6 +636,9 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!bdev_nonrot(bdev)) fs_devices->rotating = true; + if (bdev_max_discard_sectors(bdev)) + fs_devices->discardable = true; + device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; @@ -833,26 +833,23 @@ static noinline struct btrfs_device *device_list_add(const char *path, } if (!device) { + unsigned int nofs_flag; + if (fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, - disk_super->dev_item.uuid); + disk_super->dev_item.uuid, path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) { mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } - name = rcu_string_strdup(path, GFP_NOFS); - if (!name) { - btrfs_free_device(device); - mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(-ENOMEM); - } - rcu_assign_pointer(device->name, name); device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); @@ -932,7 +929,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", - devid, rcu_str_deref(device->name), + devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } @@ -985,28 +982,32 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { - struct rcu_string *name; + const char *dev_path = NULL; + + /* + * This is ok to do without RCU read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + if (orig_dev->name) + dev_path = orig_dev->name->str; device = btrfs_alloc_device(NULL, &orig_dev->devid, - orig_dev->uuid); + orig_dev->uuid, dev_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - /* - * This is ok to do without rcu read locked because we hold the - * uuid mutex so nothing we touch in here is going to disappear. - */ - if (orig_dev->name) { - name = rcu_string_strdup(orig_dev->name->str, - GFP_KERNEL); - if (!name) { + if (orig_dev->zone_info) { + struct btrfs_zoned_device_info *zone_info; + + zone_info = btrfs_clone_dev_zone_info(orig_dev); + if (!zone_info) { btrfs_free_device(device); ret = -ENOMEM; goto error; } - rcu_assign_pointer(device->name, name); + device->zone_info = zone_info; } list_add(&device->dev_list, &fs_devices->devices); @@ -1459,8 +1460,9 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, return changed; } -/** - * dev_extent_hole_check - check if specified hole is suitable for allocation +/* + * Check if specified hole is suitable for allocation. + * * @device: the device which we have the hole * @hole_start: starting position of the hole * @hole_size: the size of the hole @@ -1514,7 +1516,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } /* - * find_free_dev_extent_start - find free space in the specified device + * Find free space in the specified device. 
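[Editorial aside: the memalloc_nofs_save()/memalloc_nofs_restore() pair added to device_list_add() above scopes NOFS behaviour to a region rather than to individual allocation calls: anything allocated inside the window is treated as GFP_NOFS even if a callee asks for GFP_KERNEL, which is what lets btrfs_alloc_device() take a plain path argument and duplicate it internally. The general shape, as a kernel-style sketch; setup_helper_that_allocates() is a hypothetical stand-in:]

	unsigned int nofs_flag;

	nofs_flag = memalloc_nofs_save();
	/* helpers called here may use GFP_KERNEL freely; the allocator
	 * still will not recurse into filesystem reclaim */
	ret = setup_helper_that_allocates();
	memalloc_nofs_restore(nofs_flag);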
+ * * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @search_start: the position from which to begin the search @@ -1522,9 +1525,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * @len: the size of the free space. that we find, or the size * of the max free space if we don't find suitable free space * - * this uses a pretty simple search, the expectation is that it is - * called very infrequently and that a given device has a small number - * of extents + * This does a pretty simple search, the expectation is that it is called very + * infrequently and that a given device has a small number of extents. * * @start is used to store the start of the free space if we find. But if we * don't find suitable free space, it will be used to store the start position @@ -2017,7 +2019,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct page *page; int ret; - disk_super = btrfs_read_dev_one_super(bdev, copy_num); + disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); if (IS_ERR(disk_super)) continue; @@ -2087,7 +2089,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", - rcu_str_deref(device->name), device->devid); + btrfs_dev_name(device), device->devid); return -ETXTBSY; } @@ -2303,8 +2305,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_free_device(tgtdev); } -/** - * Populate args from device at path +/* + * Populate args from device at path. * * @fs_info: the filesystem * @args: the args to populate @@ -2579,7 +2581,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_device *device; struct block_device *bdev; struct super_block *sb = fs_info->sb; - struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices; u64 orig_super_total_bytes; @@ -2620,20 +2621,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } rcu_read_unlock(); - device = btrfs_alloc_device(fs_info, NULL, NULL); + device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - ret = -ENOMEM; - goto error_free_device; - } - rcu_assign_pointer(device->name, name); - device->fs_info = fs_info; device->bdev = bdev; ret = lookup_bdev(device_path, &device->devt); @@ -3589,16 +3583,14 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off if (bargs->usage_min == 0) user_thresh_min = 0; else - user_thresh_min = div_factor_fine(cache->length, - bargs->usage_min); + user_thresh_min = mult_perc(cache->length, bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) user_thresh_max = cache->length; else - user_thresh_max = div_factor_fine(cache->length, - bargs->usage_max); + user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; @@ -3622,7 +3614,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, else if (bargs->usage > 100) user_thresh = cache->length; else - user_thresh = div_factor_fine(cache->length, bargs->usage); + user_thresh = mult_perc(cache->length, 
bargs->usage); if (chunk_used < user_thresh) ret = 0; @@ -4012,10 +4004,11 @@ error: return ret; } -/** - * alloc_profile_is_valid - see if a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile +/* + * See if a given profile is valid and reduced. + * + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile */ static int alloc_profile_is_valid(u64 flags, int extended) { @@ -5087,7 +5080,7 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ - ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } @@ -5118,7 +5111,7 @@ static void init_alloc_chunk_ctl_policy_zoned( } /* We don't want a chunk larger than 10% of writable space */ - limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), zone_size), min_chunk_size); ctl->max_chunk_size = min(limit, ctl->max_chunk_size); @@ -5595,7 +5588,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, if (ret) goto out; - bg->chunk_item_inserted = 1; + set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); @@ -5894,9 +5887,11 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ * and the stripes. */ sizeof(u64) * (total_stripes), - GFP_NOFS|__GFP_NOFAIL); + GFP_NOFS); + + if (!bioc) + return NULL; - atomic_set(&bioc->error, 0); refcount_set(&bioc->refs, 1); bioc->fs_info = fs_info; @@ -6092,7 +6087,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, int ret = 0; ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bioc, 0, 0); + logical, &length, &bioc, NULL, NULL, 0); if (ret) { ASSERT(bioc == NULL); return ret; @@ -6153,9 +6148,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) cache = btrfs_lookup_block_group(fs_info, logical); - spin_lock(&cache->lock); - ret = cache->to_copy; - spin_unlock(&cache->lock); + ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; @@ -6351,11 +6344,19 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, return 0; } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - int mirror_num, int need_raid_map) +static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, + u32 stripe_index, u64 stripe_offset, u64 stripe_nr) +{ + dst->dev = map->stripes[stripe_index].dev; + dst->physical = map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; +} + +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6366,6 +6367,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int data_stripes; int i; int ret = 0; + int mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); int num_stripes; int max_errors = 0; int tgtdev_indexes = 0; @@ -6526,6 +6528,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, tgtdev_indexes = num_stripes; } + /* + * If this I/O maps to a single device, try to return the device and + * physical block information on the stack instead of allocating an + * I/O context structure. + */ + if (smap && num_alloc_stripes == 1 && + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && + (!need_full_stripe(op) || !dev_replace_is_ongoing || + !dev_replace->tgtdev)) { + if (patch_the_first_stripe_for_dev_replace) { + smap->dev = dev_replace->tgtdev; + smap->physical = physical_to_patch_in_first_stripe; + *mirror_num_ret = map->num_stripes + 1; + } else { + set_io_stripe(smap, map, stripe_index, stripe_offset, + stripe_nr); + *mirror_num_ret = mirror_num; + } + *bioc_ret = NULL; + ret = 0; + goto out; + } + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); if (!bioc) { ret = -ENOMEM; @@ -6533,9 +6558,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } for (i = 0; i < num_stripes; i++) { - bioc->stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bioc->stripes[i].dev = map->stripes[stripe_index].dev; + set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, + stripe_nr); stripe_index++; } @@ -6603,7 +6627,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_io_context **bioc_ret, int mirror_num) { return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, - mirror_num, 0); + NULL, &mirror_num, 0); } /* For Scrub/replace */ @@ -6611,185 +6635,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); -} - -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc) -{ - if (bioc->orig_bio->bi_opf & REQ_META) - return bioc->fs_info->endio_meta_workers; - return bioc->fs_info->endio_workers; -} - -static void btrfs_end_bio_work(struct work_struct *work) -{ - struct btrfs_bio *bbio = - container_of(work, struct btrfs_bio, end_io_work); - - bio_endio(&bbio->bio); -} - -static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async) -{ - struct bio *orig_bio = bioc->orig_bio; - struct btrfs_bio *bbio = btrfs_bio(orig_bio); - - bbio->mirror_num = bioc->mirror_num; - orig_bio->bi_private = bioc->private; - orig_bio->bi_end_io = bioc->end_io; - - /* - * Only send an error to the higher layers if it is beyond the tolerance - * threshold. 
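The single-device fast path added above fills a caller-provided btrfs_io_stripe and hands back no btrfs_io_context at all, skipping the allocation on the most common I/O shape. A compilable userspace sketch of the same pattern — every name here is illustrative, not the kernel's:

#include <stdint.h>
#include <stdlib.h>

struct io_stripe {
	int dev;
	uint64_t physical;
};

struct io_context {
	int num_stripes;
	struct io_stripe stripes[];	/* flexible array, like bioc->stripes */
};

/*
 * Map a logical range. When the I/O resolves to a single stripe and the
 * caller passed @smap, answer on the stack: fill @smap, return no
 * context, and skip the heap allocation entirely.
 */
static int map_block(uint64_t logical, int nr_targets,
		     struct io_context **ctx_ret, struct io_stripe *smap)
{
	if (smap && nr_targets == 1) {
		smap->dev = 0;
		smap->physical = logical;	/* identity mapping for the sketch */
		*ctx_ret = NULL;
		return 0;
	}

	*ctx_ret = malloc(sizeof(**ctx_ret) +
			  (size_t)nr_targets * sizeof(struct io_stripe));
	if (!*ctx_ret)
		return -1;
	(*ctx_ret)->num_stripes = nr_targets;
	return 0;
}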
- */ - if (atomic_read(&bioc->error) > bioc->max_errors) - orig_bio->bi_status = BLK_STS_IOERR; - else - orig_bio->bi_status = BLK_STS_OK; - - if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work); - } else { - bio_endio(orig_bio); - } - - btrfs_put_bioc(bioc); -} - -static void btrfs_end_bio(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - struct btrfs_io_context *bioc = stripe->bioc; - - if (bio->bi_status) { - atomic_inc(&bioc->error); - if (bio->bi_status == BLK_STS_IOERR || - bio->bi_status == BLK_STS_TARGET) { - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(stripe->dev, - BTRFS_DEV_STAT_WRITE_ERRS); - else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(stripe->dev, - BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(stripe->dev, - BTRFS_DEV_STAT_FLUSH_ERRS); - } - } - - if (bio != bioc->orig_bio) - bio_put(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - if (atomic_dec_and_test(&bioc->stripes_pending)) - btrfs_end_bioc(bioc, true); -} - -static void submit_stripe_bio(struct btrfs_io_context *bioc, - struct bio *orig_bio, int dev_nr, bool clone) -{ - struct btrfs_fs_info *fs_info = bioc->fs_info; - struct btrfs_device *dev = bioc->stripes[dev_nr].dev; - u64 physical = bioc->stripes[dev_nr].physical; - struct bio *bio; - - if (!dev || !dev->bdev || - test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (btrfs_op(orig_bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - atomic_inc(&bioc->error); - if (atomic_dec_and_test(&bioc->stripes_pending)) - btrfs_end_bioc(bioc, false); - return; - } - - if (clone) { - bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set); - } else { - bio = orig_bio; - bio_set_dev(bio, dev->bdev); - btrfs_bio(bio)->device = dev; - } - - bioc->stripes[dev_nr].bioc = bioc; - bio->bi_private = &bioc->stripes[dev_nr]; - bio->bi_end_io = btrfs_end_bio; - bio->bi_iter.bi_sector = physical >> 9; - /* - * For zone append writing, bi_sector must point the beginning of the - * zone - */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } - } - btrfs_debug_in_rcu(fs_info, - "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, - (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), - dev->devid, bio->bi_iter.bi_size); - - btrfs_bio_counter_inc_noblocked(fs_info); - - btrfsic_check_bio(bio); - submit_bio(bio); -} - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - int ret; - int dev_nr; - int total_devs; - struct btrfs_io_context *bioc = NULL; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, - &map_length, &bioc, mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - return; - } - - total_devs = bioc->num_stripes; - bioc->orig_bio = bio; - bioc->private = bio->bi_private; - bioc->end_io = bio->bi_end_io; - 
atomic_set(&bioc->stripes_pending, total_devs); - - if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - raid56_parity_write(bio, bioc); - else - raid56_parity_recover(bio, bioc, mirror_num, true); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } - - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - const bool should_clone = (dev_nr < total_devs - 1); - - submit_stripe_bio(bioc, bio, dev_nr, should_clone); - } - btrfs_bio_counter_dec(fs_info); + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, + NULL, NULL, 1); } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, @@ -6805,18 +6652,18 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, const struct btrfs_device *device) { - ASSERT((args->devid != (u64)-1) || args->missing); + if (args->missing) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && + !device->bdev) + return true; + return false; + } - if ((args->devid != (u64)-1) && device->devid != args->devid) + if (device->devid != args->devid) return false; if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) return false; - if (!args->missing) - return true; - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && - !device->bdev) - return true; - return false; + return true; } /* @@ -6863,8 +6710,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * always do NOFS because we use it in a lot of other GFP_KERNEL safe * places. */ + nofs_flag = memalloc_nofs_save(); - device = btrfs_alloc_device(NULL, &devid, dev_uuid); + device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return device; @@ -6879,22 +6727,24 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, return device; } -/** - * btrfs_alloc_device - allocate struct btrfs_device +/* + * Allocate new device struct, set up devid and UUID. + * * @fs_info: used only for generating a new devid, can be NULL if * devid is provided (i.e. @devid != NULL). * @devid: a pointer to devid for this device. If NULL a new devid * is generated. * @uuid: a pointer to UUID for this device. If NULL a new UUID * is generated. + * @path: a pointer to device path if available, NULL otherwise. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be * destroyed with btrfs_free_device. 
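The machinery deleted above tracked each in-flight stripe bio in bioc->stripes_pending and failed the parent bio only once the error count exceeded bioc->max_errors (the code's destination is not shown in this hunk, only its removal from volumes.c). The counting idiom itself, as a standalone C11 analogue:

#include <stdatomic.h>
#include <stdio.h>

struct io_ctx {
	atomic_int pending;	/* in-flight sub-I/Os */
	atomic_int errors;	/* failed sub-I/Os */
	int max_errors;		/* tolerated failures for this profile */
};

/* Runs once per sub-I/O completion, possibly from several threads. */
static void sub_io_done(struct io_ctx *ctx, int failed)
{
	if (failed)
		atomic_fetch_add(&ctx->errors, 1);

	/* Whoever drops the counter to zero completes the parent I/O. */
	if (atomic_fetch_sub(&ctx->pending, 1) == 1) {
		int errs = atomic_load(&ctx->errors);

		printf("parent I/O: %s (%d error(s))\n",
		       errs > ctx->max_errors ? "EIO" : "ok", errs);
	}
}

int main(void)
{
	struct io_ctx ctx = { .pending = 3, .max_errors = 1 };

	sub_io_done(&ctx, 0);
	sub_io_done(&ctx, 1);	/* one failure is within tolerance */
	sub_io_done(&ctx, 0);	/* last completion reports "ok" */
	return 0;
}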
*/ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid) + const u64 *devid, const u8 *uuid, + const char *path) { struct btrfs_device *dev; u64 tmp; @@ -6912,8 +6762,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, - IO_TREE_DEVICE_ALLOC_STATE, NULL); + extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; @@ -6933,6 +6782,17 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); + if (path) { + struct rcu_string *name; + + name = rcu_string_strdup(path, GFP_KERNEL); + if (!name) { + btrfs_free_device(dev); + return ERR_PTR(-ENOMEM); + } + rcu_assign_pointer(dev->name, name); + } + return dev; } @@ -7029,6 +6889,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, u64 devid; u64 type; u8 uuid[BTRFS_UUID_SIZE]; + int index; int num_stripes; int ret; int i; @@ -7036,6 +6897,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); + index = btrfs_bg_flags_to_raid_index(type); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); #if BITS_PER_LONG == 32 @@ -7089,7 +6951,15 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->io_align = btrfs_chunk_io_align(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = type; - map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + /* + * We can't use the sub_stripes value, as for profiles other than + * RAID10, they may have 0 as sub_stripes for filesystems created by + * older mkfs (<v5.4). + * In that case, it can cause divide-by-zero errors later. + * Since currently sub_stripes is fixed for each profile, let's + * use the trusted value instead. 
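As the comment above explains (the assignment it guards continues just below), read_one_chunk now takes sub_stripes from the in-kernel btrfs_raid_array rather than from disk, because pre-5.4 mkfs could write 0 there and trigger a divide-by-zero later. The validate-against-a-trusted-table idea in miniature — the profile list and values here are illustrative:

#include <stdio.h>

enum raid_index { RAID_SINGLE, RAID_RAID1, RAID_RAID10, RAID_NR };

/* Trusted, compile-time per-profile properties (sketch values). */
static const struct {
	const char *name;
	int sub_stripes;
} raid_table[RAID_NR] = {
	[RAID_SINGLE] = { "single", 1 },
	[RAID_RAID1]  = { "raid1",  1 },
	[RAID_RAID10] = { "raid10", 2 },
};

/* The on-disk value is untrusted input: old tools may have written 0. */
static int effective_sub_stripes(enum raid_index idx, int ondisk)
{
	int trusted = raid_table[idx].sub_stripes;

	if (ondisk != trusted)
		fprintf(stderr, "ignoring on-disk sub_stripes=%d for %s\n",
			ondisk, raid_table[idx].name);
	return trusted;		/* never 0, so later divisions are safe */
}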
+ */ + map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { @@ -7106,8 +6976,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->stripes[i].dev = handle_missing_device(fs_info, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { + ret = PTR_ERR(map->stripes[i].dev); free_extent_map(em); - return PTR_ERR(map->stripes[i].dev); + return ret; } } @@ -7621,10 +7492,11 @@ error: return ret; } -void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; + int ret = 0; fs_devices->fs_info = fs_info; @@ -7633,12 +7505,18 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) device->fs_info = fs_info; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - list_for_each_entry(device, &seed_devs->devices, dev_list) + list_for_each_entry(device, &seed_devs->devices, dev_list) { device->fs_info = fs_info; + ret = btrfs_get_dev_zone_info(device, false); + if (ret) + break; + } seed_devs->fs_info = fs_info; } mutex_unlock(&fs_devices->device_list_mutex); + + return ret; } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, @@ -7762,7 +7640,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "error %d while searching for dev_stats item for device %s", - ret, rcu_str_deref(device->name)); + ret, btrfs_dev_name(device)); goto out; } @@ -7773,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret != 0) { btrfs_warn_in_rcu(fs_info, "delete too small dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } ret = 1; @@ -7787,7 +7665,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "insert dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } } @@ -7852,7 +7730,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) return; btrfs_err_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -7872,7 +7750,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) btrfs_info_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -8244,7 +8122,7 @@ static int relocating_repair_kthread(void *data) if (!cache) goto out; - if (!cache->relocating_repair) + if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) goto out; ret = btrfs_may_alloc_data_chunk(fs_info, target); @@ -8281,14 +8159,10 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) if (!cache) return true; - spin_lock(&cache->lock); - if (cache->relocating_repair) { - spin_unlock(&cache->lock); + if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { 
btrfs_put_block_group(cache); return true; } - cache->relocating_repair = 1; - spin_unlock(&cache->lock); kthread_run(relocating_repair_kthread, cache, "btrfs-relocating-repair"); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5639961b3626..6b7a05f6cf82 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,10 +6,12 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H -#include <linux/bio.h> #include <linux/sort.h> #include <linux/btrfs.h> #include "async-thread.h" +#include "messages.h" +#include "tree-checker.h" +#include "rcu-string.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -181,6 +183,31 @@ struct btrfs_device { }; /* + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree. + */ +struct btrfs_swapfile_pin { + struct rb_node node; + void *ptr; + struct inode *inode; + /* + * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr + * points to a struct btrfs_device. + */ + bool is_block_group; + /* + * Only used when 'is_block_group' is true and it is the number of + * extents used by a swapfile for this block group ('ptr' field). + */ + int bg_extent_count; +}; + +/* * If we read those variants at the context of their own lock, we needn't * use the following helpers, reading them directly is safe. */ @@ -329,6 +356,8 @@ struct btrfs_fs_devices { * nonrot flag set */ bool rotating; + /* Devices support TRIM/discard commands */ + bool discardable; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -343,8 +372,6 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; }; -#define BTRFS_BIO_INLINE_CSUM_SIZE 64 - #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -354,69 +381,6 @@ struct btrfs_fs_devices { - 2 * sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) -/* - * Maximum number of sectors for a single bio to limit the size of the - * checksum array. This matches the number of bio_vecs per bio and thus the - * I/O size for buffered I/O. - */ -#define BTRFS_MAX_BIO_SECTORS (256) - -/* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. - */ -struct btrfs_bio { - unsigned int mirror_num; - - /* for direct I/O */ - u64 file_offset; - - /* @device is for stripe IO submission. */ - struct btrfs_device *device; - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - struct bvec_iter iter; - - /* For read end I/O handling */ - struct work_struct end_io_work; - - /* - * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire btrfs_bio but relies on bio being last. - */ - struct bio bio; -}; - -static inline struct btrfs_bio *btrfs_bio(struct bio *bio) -{ - return container_of(bio, struct btrfs_bio, bio); -} - -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} - -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. 
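This range converts several spinlock-protected block-group booleans (to_copy earlier, relocating_repair here, the zone flags further down) into bits in a shared runtime_flags word, where test_and_set_bit gives lock-free "first claimant wins" semantics. A C11 analogue of that conversion:

#include <stdatomic.h>
#include <stdbool.h>

enum { FLAG_TO_COPY, FLAG_RELOCATING_REPAIR, FLAG_ZONE_IS_ACTIVE };

static bool test_and_set_flag(atomic_ulong *word, int bit)
{
	unsigned long mask = 1UL << bit;

	/* Returns the bit's previous value while setting it atomically. */
	return atomic_fetch_or(word, mask) & mask;
}

static bool test_flag(atomic_ulong *word, int bit)
{
	return atomic_load(word) & (1UL << bit);
}

/*
 * The first caller to claim the repair wins and everyone else backs
 * off -- no spin_lock()/spin_unlock() bracket around a plain boolean.
 */
static bool try_claim_repair(atomic_ulong *runtime_flags)
{
	return !test_and_set_flag(runtime_flags, FLAG_RELOCATING_REPAIR);
}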
- * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - struct btrfs_io_stripe { struct btrfs_device *dev; union { @@ -451,12 +415,9 @@ struct btrfs_discard_stripe { */ struct btrfs_io_context { refcount_t refs; - atomic_t stripes_pending; struct btrfs_fs_info *fs_info; u64 map_type; /* get from map_lookup->type */ - bio_end_io_t *end_io; struct bio *orig_bio; - void *private; atomic_t error; int max_errors; int num_stripes; @@ -561,6 +522,13 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) } } +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + ASSERT(num_stripes); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -569,6 +537,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); @@ -580,7 +553,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, @@ -597,8 +569,8 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, const char *path); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid); + const u64 *devid, const u8 *uuid, + const char *path); void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, @@ -629,7 +601,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); -void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); +int btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); @@ -699,6 +671,14 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } +static inline const char *btrfs_dev_name(const struct btrfs_device 
*device) +{ + if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + return "<missing disk>"; + else + return rcu_str_deref(device->name); +} + void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); @@ -714,4 +694,6 @@ const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5bb8d8c86311..0ed4b119a7ca 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -13,12 +13,16 @@ #include <linux/iversion.h> #include <linux/sched/mm.h> #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" #include "disk-io.h" #include "props.h" #include "locking.h" +#include "accessors.h" +#include "dir-item.h" int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b4f44662cda7..01a13de11832 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -155,8 +155,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap_local_page(in_page); - memcpy(workspace->buf + i * PAGE_SIZE, - data_in, PAGE_SIZE); + copy_page(workspace->buf + i * PAGE_SIZE, + data_in); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; @@ -355,7 +355,7 @@ done: return ret; } -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 73c6929f7be6..a759668477bb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -15,6 +15,8 @@ #include "transaction.h" #include "dev-replace.h" #include "space-info.h" +#include "fs.h" +#include "accessors.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -134,7 +136,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, super[i] = page_address(page[i]); } - if (super[0]->generation > super[1]->generation) + if (btrfs_super_generation(super[0]) > + btrfs_super_generation(super[1])) sector = zones[1].start; else sector = zones[0].start; @@ -391,8 +394,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_sectors = bdev_zone_sectors(bdev); } - /* Check if it's power of 2 (see is_power_of_2) */ - ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); + ASSERT(is_power_of_two_u64(zone_sectors)); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; /* We reject devices with a zone size larger than 8GB */ @@ -466,7 +468,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } - zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; @@ -585,7 +587,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } - kfree(zones); + kvfree(zones); switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: @@ -617,7 +619,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool 
populate_cache) return 0; out: - kfree(zones); + kvfree(zones); out_free_zone_info: btrfs_destroy_dev_zone_info(device); @@ -639,6 +641,46 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) device->zone_info = NULL; } +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev) +{ + struct btrfs_zoned_device_info *zone_info; + + zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return NULL; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) + goto out; + + bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones, + zone_info->nr_zones); + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) + goto out; + + bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones, + zone_info->nr_zones); + + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) + goto out; + + bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones, + zone_info->nr_zones); + zone_info->zone_cache = NULL; + + return zone_info; + +out: + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + bitmap_free(zone_info->active_zones); + kfree(zone_info); + return NULL; +} + int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) { @@ -652,80 +694,55 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } +static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + if (device->bdev && + bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + btrfs_err(fs_info, + "zoned: mode not enabled but zoned device found: %pg", + device->bdev); + return -EINVAL; + } + } + + return 0; +} + int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - u64 zoned_devices = 0; - u64 nr_devices = 0; u64 zone_size = 0; u64 max_zone_append_size = 0; - const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); - int ret = 0; + int ret; - /* Count zoned devices */ - list_for_each_entry(device, &fs_devices->devices, dev_list) { - enum blk_zoned_model model; + /* + * Host-Managed devices can't be used without the ZONED flag. With the + * ZONED all devices can be used, using zone emulation if required. + */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return btrfs_check_for_zoned_device(fs_info); + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zone_info = device->zone_info; if (!device->bdev) continue; - model = bdev_zoned_model(device->bdev); - /* - * A Host-Managed zoned device must be used as a zoned device. - * A Host-Aware zoned device and a non-zoned devices can be - * treated as a zoned device, if ZONED flag is enabled in the - * superblock. 
- */ - if (model == BLK_ZONED_HM || - (model == BLK_ZONED_HA && incompat_zoned) || - (model == BLK_ZONED_NONE && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info; - - zone_info = device->zone_info; - zoned_devices++; - if (!zone_size) { - zone_size = zone_info->zone_size; - } else if (zone_info->zone_size != zone_size) { - btrfs_err(fs_info, + if (!zone_size) { + zone_size = zone_info->zone_size; + } else if (zone_info->zone_size != zone_size) { + btrfs_err(fs_info, "zoned: unequal block device zone sizes: have %llu found %llu", - device->zone_info->zone_size, - zone_size); - ret = -EINVAL; - goto out; - } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = - zone_info->max_zone_append_size; + zone_info->zone_size, zone_size); + return -EINVAL; } - nr_devices++; - } - - if (!zoned_devices && !incompat_zoned) - goto out; - - if (!zoned_devices && incompat_zoned) { - /* No zoned block device found on ZONED filesystem */ - btrfs_err(fs_info, - "zoned: no zoned devices found on a zoned filesystem"); - ret = -EINVAL; - goto out; - } - - if (zoned_devices && !incompat_zoned) { - btrfs_err(fs_info, - "zoned: mode not enabled but zoned device found"); - ret = -EINVAL; - goto out; - } - - if (zoned_devices != nr_devices) { - btrfs_err(fs_info, - "zoned: cannot mix zoned and regular devices"); - ret = -EINVAL; - goto out; + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = zone_info->max_zone_append_size; } /* @@ -737,14 +754,12 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "zoned: zone size %llu not aligned to stripe %u", zone_size, BTRFS_STRIPE_LEN); - ret = -EINVAL; - goto out; + return -EINVAL; } if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { btrfs_err(fs_info, "zoned: mixed block groups not supported"); - ret = -EINVAL; - goto out; + return -EINVAL; } fs_info->zone_size = zone_size; @@ -760,11 +775,10 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) */ ret = btrfs_check_mountopts_zoned(fs_info); if (ret) - goto out; + return ret; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); -out: - return ret; + return 0; } int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) @@ -1005,8 +1019,8 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } -/** - * btrfs_find_allocatable_zones - find allocatable zones within a given region +/* + * Find allocatable zones within a given region. 
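btrfs_clone_dev_zone_info in the hunk above duplicates three zone bitmaps and, on any allocation failure, unwinds through a single out label, relying on bitmap_free(NULL) being a no-op. The same all-or-nothing clone shape in plain C, with userspace stand-ins for the bitmap helpers:

#include <stdlib.h>
#include <string.h>

struct zone_info {
	size_t nr_zones;
	unsigned char *seq_zones;
	unsigned char *empty_zones;
	unsigned char *active_zones;
};

static struct zone_info *clone_zone_info(const struct zone_info *orig)
{
	struct zone_info *zi = malloc(sizeof(*zi));

	if (!zi)
		return NULL;
	zi->nr_zones = orig->nr_zones;
	zi->seq_zones = zi->empty_zones = zi->active_zones = NULL;

	zi->seq_zones = malloc(orig->nr_zones);
	if (!zi->seq_zones)
		goto out;
	memcpy(zi->seq_zones, orig->seq_zones, orig->nr_zones);

	zi->empty_zones = malloc(orig->nr_zones);
	if (!zi->empty_zones)
		goto out;
	memcpy(zi->empty_zones, orig->empty_zones, orig->nr_zones);

	zi->active_zones = malloc(orig->nr_zones);
	if (!zi->active_zones)
		goto out;
	memcpy(zi->active_zones, orig->active_zones, orig->nr_zones);

	return zi;

out:
	/* free(NULL) is a no-op, so a partial clone unwinds cleanly. */
	free(zi->seq_zones);
	free(zi->empty_zones);
	free(zi->active_zones);
	free(zi);
	return NULL;
}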
* * @device: the device to allocate a region on * @hole_start: the position of the hole to allocate the region @@ -1423,7 +1437,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (num_sequential > 0) - cache->seq_zone = true; + set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { /* Zone capacity is always zone size in emulation */ @@ -1436,7 +1450,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } else if (map->num_stripes == num_conventional) { cache->alloc_offset = last_alloc; - cache->zone_is_active = 1; + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } } @@ -1452,7 +1466,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } cache->alloc_offset = alloc_offsets[0]; cache->zone_capacity = caps[0]; - cache->zone_is_active = test_bit(0, active); + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); break; case BTRFS_BLOCK_GROUP_DUP: if (map->type & BTRFS_BLOCK_GROUP_DATA) { @@ -1486,7 +1501,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } } else { - cache->zone_is_active = test_bit(0, active); + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &cache->runtime_flags); } cache->alloc_offset = alloc_offsets[0]; cache->zone_capacity = min(caps[0], caps[1]); @@ -1530,7 +1547,7 @@ out: if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; - if (cache->zone_is_active) { + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { btrfs_get_block_group(cache); spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&cache->active_bg_list, @@ -1563,7 +1580,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; @@ -1633,7 +1649,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!cache) return false; - ret = cache->seq_zone; + ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; @@ -1847,7 +1863,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, return device; } -/** +/* * Activate block group and underlying device zones * * @block_group: the block group to activate @@ -1871,7 +1887,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->zone_is_active) { + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; goto out_unlock; } @@ -1897,7 +1913,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) } /* Successfully activated all the zones */ - block_group->zone_is_active = 1; + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); space_info->active_total_bytes += block_group->length; spin_unlock(&block_group->lock); btrfs_try_granting_tickets(fs_info, space_info); @@ -1960,7 +1976,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ int i; spin_lock(&block_group->lock); - if (!block_group->zone_is_active) { + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { 
spin_unlock(&block_group->lock); return 0; } @@ -2001,7 +2017,8 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ * Bail out if someone already deactivated the block group, or * allocated space is left in the block group. */ - if (!block_group->zone_is_active) { + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)) { spin_unlock(&block_group->lock); btrfs_dec_block_group_ro(block_group); return 0; @@ -2014,7 +2031,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ } } - block_group->zone_is_active = 0; + clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); @@ -2137,7 +2154,8 @@ static void btrfs_zone_finish_endio_workfn(struct work_struct *work) void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { - if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) || + eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) return; if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { @@ -2222,13 +2240,14 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); spin_lock(&block_group->lock); - if (!block_group->zoned_data_reloc_ongoing) + if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) goto out; /* All relocation extents are written. */ if (block_group->start + block_group->alloc_offset == logical + length) { /* Now, release this block group for further allocations. */ - block_group->zoned_data_reloc_ongoing = 0; + clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags); } out: @@ -2300,7 +2319,9 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, list) { if (!spin_trylock(&bg->lock)) continue; - if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) { + if (btrfs_zoned_bg_is_full(bg) || + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &bg->runtime_flags)) { spin_unlock(&bg->lock); continue; } diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index e17462db3a84..f43990985d80 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/blkdev.h> +#include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" @@ -36,6 +37,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, @@ -103,6 +105,16 @@ static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } +/* + * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call + * into btrfs_clone_dev_zone_info() so it's safe to return NULL here. 
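The zoned.h stub whose comment appears above exists only so that !CONFIG_BLK_DEV_ZONED builds compile; it can safely return NULL because no caller reaches it without a zoned device. The pattern, reduced to a self-contained example (CONFIG_ZONED is a stand-in macro, not the real config symbol):

#include <stdlib.h>
#include <string.h>

struct dev_zone_info { int nr_zones; };

#ifdef CONFIG_ZONED
struct dev_zone_info *clone_dev_zone_info(const struct dev_zone_info *orig)
{
	struct dev_zone_info *zi = malloc(sizeof(*zi));

	if (zi)
		memcpy(zi, orig, sizeof(*zi));
	return zi;
}
#else
/*
 * With the feature compiled out no zoned device can exist, so this is
 * never reached at runtime; it exists only so callers compile unchanged.
 */
static inline struct dev_zone_info *
clone_dev_zone_info(const struct dev_zone_info *orig)
{
	(void)orig;
	return NULL;
}
#endif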
+ */ +static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info( + struct btrfs_device *orig_dev) +{ + return NULL; +} + static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) { if (!btrfs_is_zoned(fs_info)) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 35a0224d4eb7..e34f1ab99d56 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -94,7 +94,7 @@ static inline struct workspace *list_to_workspace(struct list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/** +/* * Timer callback to free unused workspaces. * * @t: timer @@ -616,7 +616,7 @@ done: return ret; } -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/buffer.c b/fs/buffer.c index 55e762a58eb6..d9c6d1fbb6dd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -52,8 +52,8 @@ #include "internal.h" static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc); +static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -152,7 +152,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) /* * Default synchronous end-of-IO handler.. Just mark it up-to-date and - * unlock the buffer. This is what ll_rw_block uses too. + * unlock the buffer. */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { @@ -491,8 +491,8 @@ int inode_has_buffers(struct inode *inode) * all already-submitted IO to complete, but does not queue any new * writes to the disk. * - * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as - * you dirty the buffers, and then use osync_inode_buffers to wait for + * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer + * as you dirty the buffers, and then use osync_inode_buffers to wait for * completion. Any other dirty buffers which are not yet queued for * write will not be flushed to disk by the osync. 
*/ @@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev, struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) - ll_rw_block(REQ_OP_WRITE, 1, &bh); + write_dirty_buffer(bh, 0); put_bh(bh); } } @@ -1342,23 +1342,12 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); + bh_readahead(bh, REQ_RAHEAD); brelse(bh); } } EXPORT_SYMBOL(__breadahead); -void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size, - gfp_t gfp) -{ - struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); - if (likely(bh)) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); - brelse(bh); - } -} -EXPORT_SYMBOL(__breadahead_gfp); - /** * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from @@ -1464,19 +1453,15 @@ EXPORT_SYMBOL(set_bh_page); static void discard_buffer(struct buffer_head * bh) { - unsigned long b_state, b_state_old; + unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - b_state = bh->b_state; - for (;;) { - b_state_old = cmpxchg(&bh->b_state, b_state, - (b_state & ~BUFFER_FLAGS_DISCARD)); - if (b_state_old == b_state) - break; - b_state = b_state_old; - } + b_state = READ_ONCE(bh->b_state); + do { + } while (!try_cmpxchg(&bh->b_state, &b_state, + b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } @@ -1817,7 +1802,7 @@ done: /* * The page was marked dirty, but the buffers were * clean. Someone wrote them back by hand with - * ll_rw_block/submit_bh. A rare case. + * write_dirty_buffer/submit_bh. A rare case. */ end_page_writeback(page); @@ -2033,7 +2018,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); *wait_bh++=bh; } } @@ -2352,7 +2337,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct page *page; - void *fsdata; + void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); @@ -2378,7 +2363,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct page *page; - void *fsdata; + void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; @@ -2593,11 +2578,9 @@ int block_truncate_page(struct address_space *mapping, set_buffer_uptodate(bh); if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { - err = -EIO; - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); + err = bh_read(bh, 0); /* Uhhuh. Read error. Complain and punt. 
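discard_buffer above swaps an open-coded cmpxchg retry loop for try_cmpxchg, whose failure case rewrites the expected value in place so the loop body needs no explicit reload. The matching C11 idiom, clearing a set of flag bits atomically:

#include <stdatomic.h>

#define FLAGS_DISCARD	((1UL << 1) | (1UL << 2))	/* bits to clear */

static void clear_discard_flags(atomic_ulong *state)
{
	unsigned long old = atomic_load_explicit(state, memory_order_relaxed);

	/*
	 * On failure, compare_exchange_weak stores the value it found back
	 * into 'old', so no explicit reload is needed -- the same property
	 * try_cmpxchg() provides in the kernel version.
	 */
	while (!atomic_compare_exchange_weak(state, &old,
					     old & ~FLAGS_DISCARD))
		;
}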
*/ - if (!buffer_uptodate(bh)) + if (err < 0) goto unlock; } @@ -2673,8 +2656,8 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } -static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc) +static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; @@ -2717,70 +2700,14 @@ static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, } submit_bio(bio); - return 0; } -int submit_bh(blk_opf_t opf, struct buffer_head *bh) +void submit_bh(blk_opf_t opf, struct buffer_head *bh) { - return submit_bh_wbc(opf, bh, NULL); + submit_bh_wbc(opf, bh, NULL); } EXPORT_SYMBOL(submit_bh); -/** - * ll_rw_block: low-level access to block devices (DEPRECATED) - * @opf: block layer request operation and flags. - * @nr: number of &struct buffer_heads in the array - * @bhs: array of pointers to &struct buffer_head - * - * ll_rw_block() takes an array of pointers to &struct buffer_heads, and - * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. - * @opf contains flags modifying the detailed I/O behavior, most notably - * %REQ_RAHEAD. - * - * This function drops any buffer that it cannot get a lock on (with the - * BH_Lock state bit), any buffer that appears to be clean when doing a write - * request, and any buffer that appears to be up-to-date when doing read - * request. Further it marks as clean buffers that are processed for - * writing (the buffer cache won't assume that they are actually clean - * until the buffer gets unlocked). - * - * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if appropriate), unlocks the buffer and wakes - * any waiters. - * - * All of the buffers must be for the same device, and must also be a - * multiple of the current approved size for the device. 
- */ -void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[]) -{ - const enum req_op op = opf & REQ_OP_MASK; - int i; - - for (i = 0; i < nr; i++) { - struct buffer_head *bh = bhs[i]; - - if (!trylock_buffer(bh)) - continue; - if (op == REQ_OP_WRITE) { - if (test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - submit_bh(opf, bh); - continue; - } - } else { - if (!buffer_uptodate(bh)) { - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(opf, bh); - continue; - } - } - unlock_buffer(bh); - } -} -EXPORT_SYMBOL(ll_rw_block); - void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); @@ -2801,8 +2728,6 @@ EXPORT_SYMBOL(write_dirty_buffer); */ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { - int ret = 0; - WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { @@ -2817,14 +2742,14 @@ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE | op_flags, bh); + submit_bh(REQ_OP_WRITE | op_flags, bh); wait_on_buffer(bh); - if (!ret && !buffer_uptodate(bh)) - ret = -EIO; + if (!buffer_uptodate(bh)) + return -EIO; } else { unlock_buffer(bh); } - return ret; + return 0; } EXPORT_SYMBOL(__sync_dirty_buffer); @@ -3029,29 +2954,69 @@ int bh_uptodate_or_lock(struct buffer_head *bh) EXPORT_SYMBOL(bh_uptodate_or_lock); /** - * bh_submit_read - Submit a locked buffer for reading + * __bh_read - Submit read for a locked buffer * @bh: struct buffer_head + * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ + * @wait: wait until reading finish * - * Returns zero on success and -EIO on error. + * Returns zero on success or don't wait, and -EIO on error. */ -int bh_submit_read(struct buffer_head *bh) +int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { - BUG_ON(!buffer_locked(bh)); + int ret = 0; - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - return 0; - } + BUG_ON(!buffer_locked(bh)); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return 0; - return -EIO; + submit_bh(REQ_OP_READ | op_flags, bh); + if (wait) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } + return ret; +} +EXPORT_SYMBOL(__bh_read); + +/** + * __bh_read_batch - Submit read for a batch of unlocked buffers + * @nr: entry number of the buffer batch + * @bhs: a batch of struct buffer_head + * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ + * @force_lock: force to get a lock on the buffer if set, otherwise drops any + * buffer that cannot lock. + * + * Returns zero on success or don't wait, and -EIO on error. 
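__bh_read_batch, whose body continues below, keeps the removed ll_rw_block()'s behavior of silently skipping buffers it cannot lock unless force_lock is set. A pthread-based analogue of that trylock-or-skip choice (the types and the submit stand-in are invented for this sketch):

#include <pthread.h>
#include <stdbool.h>

struct item {
	pthread_mutex_t lock;
	bool ready;
};

/* Stand-in for async submission; a real completion would unlock later. */
static void submit_read(struct item *it)
{
	it->ready = true;
	pthread_mutex_unlock(&it->lock);
}

static void read_batch(struct item **items, int nr, bool force_lock)
{
	for (int i = 0; i < nr; i++) {
		struct item *it = items[i];

		if (it->ready)
			continue;

		if (force_lock)
			pthread_mutex_lock(&it->lock);
		else if (pthread_mutex_trylock(&it->lock))
			continue;	/* best effort: skip contended items */

		if (it->ready) {	/* re-check under the lock */
			pthread_mutex_unlock(&it->lock);
			continue;
		}
		submit_read(it);
	}
}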
+ */ +void __bh_read_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags, bool force_lock) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (buffer_uptodate(bh)) + continue; + + if (force_lock) + lock_buffer(bh); + else + if (!trylock_buffer(bh)) + continue; + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ | op_flags, bh); + } } -EXPORT_SYMBOL(bh_submit_read); +EXPORT_SYMBOL(__bh_read_batch); void __init buffer_init(void) { diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 000a28f46e59..175a25fcade8 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres, term_func, term_func_priv); } -/* - * Prepare a read operation, shortening it to a cached/uncached - * boundary as appropriate. - */ -static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, - loff_t i_size) +static inline enum netfs_io_source +cachefiles_do_prepare_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t netfs_ino) { enum cachefiles_prepare_read_trace why; - struct netfs_io_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct cachefiles_object *object; + struct cachefiles_object *object = NULL; struct cachefiles_cache *cache; struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; struct file *file = cachefiles_cres_file(cres); enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; + size_t len = *_len; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; int rc; - _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); + _enter("%zx @%llx/%llx", len, start, i_size); - if (subreq->start >= i_size) { + if (start >= i_size) { ret = NETFS_FILL_WITH_ZEROES; why = cachefiles_trace_read_after_eof; goto out_no_object; } if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); why = cachefiles_trace_read_no_data; - if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) + if (!test_bit(NETFS_SREQ_ONDEMAND, _flags)) goto out_no_object; } @@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * retry: off = cachefiles_inject_read_error(); if (off == 0) - off = vfs_llseek(file, subreq->start, SEEK_DATA); + off = vfs_llseek(file, start, SEEK_DATA); if (off < 0 && off >= (loff_t)-MAX_ERRNO) { if (off == (loff_t)-ENXIO) { why = cachefiles_trace_read_seek_nxio; @@ -449,21 +446,22 @@ retry: goto out; } - if (off >= subreq->start + subreq->len) { + if (off >= start + len) { why = cachefiles_trace_read_found_hole; goto download_and_store; } - if (off > subreq->start) { + if (off > start) { off = round_up(off, cache->bsize); - subreq->len = off - subreq->start; + len = off - start; + *_len = len; why = cachefiles_trace_read_found_part; goto download_and_store; } to = cachefiles_inject_read_error(); if (to == 0) - to = vfs_llseek(file, subreq->start, SEEK_HOLE); + to = vfs_llseek(file, start, SEEK_HOLE); if (to < 0 && to >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), to, cachefiles_trace_seek_error); @@ -471,12 +469,13 @@ retry: goto out; } - if (to < subreq->start + subreq->len) { - if (subreq->start + subreq->len >= i_size) + if (to < start 
+ len) { + if (start + len >= i_size) to = round_up(to, cache->bsize); else to = round_down(to, cache->bsize); - subreq->len = to - subreq->start; + len = to - start; + *_len = len; } why = cachefiles_trace_read_have_data; @@ -484,12 +483,11 @@ retry: goto out; download_and_store: - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) { - rc = cachefiles_ondemand_read(object, subreq->start, - subreq->len); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); + if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) { + rc = cachefiles_ondemand_read(object, start, len); if (!rc) { - __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags); + __clear_bit(NETFS_SREQ_ONDEMAND, _flags); goto retry; } ret = NETFS_INVALID_READ; @@ -497,11 +495,35 @@ download_and_store: out: cachefiles_end_secure(cache, saved_cred); out_no_object: - trace_cachefiles_prep_read(subreq, ret, why, ino); + trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino); return ret; } /* + * Prepare a read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + return cachefiles_do_prepare_read(&subreq->rreq->cache_resources, + subreq->start, &subreq->len, i_size, + &subreq->flags, subreq->rreq->inode->i_ino); +} + +/* + * Prepare an on-demand read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source +cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t ino) +{ + return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino); +} + +/* * Prepare for a write to occur. */ int __cachefiles_prepare_write(struct cachefiles_object *object, @@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .prepare_ondemand_read = cachefiles_prepare_ondemand_read, .query_occupancy = cachefiles_query_occupancy, }; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index facf2ebe464b..03ca8f2f657a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -15,9 +15,8 @@ * file or directory. The caller must hold the inode lock. */ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); bool can_use = false; if (!(inode->i_flags & S_KERNEL_FILE)) { @@ -26,21 +25,18 @@ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, can_use = true; } else { trace_cachefiles_mark_failed(object, inode); - pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", - dentry, inode->i_ino); } return can_use; } static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); bool can_use; inode_lock(inode); - can_use = __cachefiles_mark_inode_in_use(object, dentry); + can_use = __cachefiles_mark_inode_in_use(object, inode); inode_unlock(inode); return can_use; } @@ -49,21 +45,17 @@ static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, * Unmark a backing inode. The caller must hold the inode lock. 
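The cachefiles hunks above extract the body of cachefiles_prepare_read() into cachefiles_do_prepare_read(), which takes plain start/len/flags parameters so the new on-demand entry point can share it without a netfs_io_subrequest. The shape of that refactor, sketched with simplified types and logic rather than the real cachefiles state machine:

#include <stddef.h>
#include <stdint.h>

enum source { FILL_WITH_ZEROES, DOWNLOAD_FROM_SERVER, READ_FROM_CACHE };

struct subrequest {
	uint64_t start;
	size_t len;
	unsigned long flags;
};

/* The core takes scalars and out-parameters, not the subrequest type. */
static enum source do_prepare_read(uint64_t start, size_t *len,
				   uint64_t i_size, unsigned long *flags)
{
	(void)flags;
	if (start >= i_size)
		return FILL_WITH_ZEROES;
	if (start + *len > i_size)
		*len = i_size - start;	/* shorten to the boundary */
	return READ_FROM_CACHE;
}

/* Original entry point: unwrap the subrequest and delegate. */
static enum source prepare_read(struct subrequest *sub, uint64_t i_size)
{
	return do_prepare_read(sub->start, &sub->len, i_size, &sub->flags);
}

/* New entry point for callers that never had a subrequest at all. */
static enum source prepare_ondemand_read(uint64_t start, size_t *len,
					 uint64_t i_size, unsigned long *flags)
{
	return do_prepare_read(start, len, i_size, flags);
}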
*/ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); - inode->i_flags &= ~S_KERNEL_FILE; trace_cachefiles_mark_inactive(object, inode); } static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); - inode_lock(inode); - __cachefiles_unmark_inode_in_use(object, dentry); + __cachefiles_unmark_inode_in_use(object, inode); inode_unlock(inode); } @@ -77,14 +69,12 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct cachefiles_cache *cache = object->volume->cache; struct inode *inode = file_inode(file); - if (inode) { - cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry); + cachefiles_do_unmark_inode_in_use(object, inode); - if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { - atomic_long_add(inode->i_blocks, &cache->b_released); - if (atomic_inc_return(&cache->f_released)) - cachefiles_state_changed(cache); - } + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + atomic_long_add(inode->i_blocks, &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); } } @@ -164,8 +154,11 @@ retry: inode_lock(d_inode(subdir)); inode_unlock(d_inode(dir)); - if (!__cachefiles_mark_inode_in_use(NULL, subdir)) + if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + subdir, d_inode(subdir)->i_ino); goto mark_error; + } inode_unlock(d_inode(subdir)); @@ -224,9 +217,7 @@ nomem_d_alloc: void cachefiles_put_directory(struct dentry *dir) { if (dir) { - inode_lock(dir->d_inode); - __cachefiles_unmark_inode_in_use(NULL, dir); - inode_unlock(dir->d_inode); + cachefiles_do_unmark_inode_in_use(NULL, d_inode(dir)); dput(dir); } } @@ -410,7 +401,7 @@ try_again: "Rename failed with error %d", ret); } - __cachefiles_unmark_inode_in_use(object, rep); + __cachefiles_unmark_inode_in_use(object, d_inode(rep)); unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -451,84 +442,72 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) const struct cred *saved_cred; struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; struct file *file; - struct path path; + const struct path parentpath = { .mnt = cache->mnt, .dentry = fan }; uint64_t ni_size; long ret; cachefiles_begin_secure(cache, &saved_cred); - path.mnt = cache->mnt; ret = cachefiles_inject_write_error(); - if (ret == 0) - path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR); - else - path.dentry = ERR_PTR(ret); - if (IS_ERR(path.dentry)) { - trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry), + if (ret == 0) { + file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG, + O_RDWR | O_LARGEFILE | O_DIRECT, + cache->cache_cred); + ret = PTR_ERR_OR_ZERO(file); + } + if (ret) { + trace_cachefiles_vfs_error(object, d_inode(fan), ret, cachefiles_trace_tmpfile_error); - if (PTR_ERR(path.dentry) == -EIO) + if (ret == -EIO) cachefiles_io_error_obj(object, "Failed to create tmpfile"); - file = ERR_CAST(path.dentry); - goto out; + goto err; } - trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry)); + trace_cachefiles_tmpfile(object, file_inode(file)); - if (!cachefiles_mark_inode_in_use(object, path.dentry)) { - file = ERR_PTR(-EBUSY); - goto out_dput; - } + /* This is a 
newly created file with no other possible user */ + if (!cachefiles_mark_inode_in_use(object, file_inode(file))) + WARN_ON(1); ret = cachefiles_ondemand_init_object(object); - if (ret < 0) { - file = ERR_PTR(ret); - goto out_unuse; - } + if (ret < 0) + goto err_unuse; ni_size = object->cookie->object_size; ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); if (ni_size > 0) { - trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size, + trace_cachefiles_trunc(object, file_inode(file), 0, ni_size, cachefiles_trunc_expand_tmpfile); ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_truncate(&path, ni_size); + ret = vfs_truncate(&file->f_path, ni_size); if (ret < 0) { trace_cachefiles_vfs_error( - object, d_backing_inode(path.dentry), ret, + object, file_inode(file), ret, cachefiles_trace_trunc_error); - file = ERR_PTR(ret); - goto out_unuse; + goto err_unuse; } } - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(path.dentry), cache->cache_cred); - if (IS_ERR(file)) { - trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), - PTR_ERR(file), - cachefiles_trace_open_error); - goto out_unuse; - } + ret = -EINVAL; if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); - file = ERR_PTR(-EINVAL); - goto out_unuse; + goto err_unuse; } - - goto out_dput; - -out_unuse: - cachefiles_do_unmark_inode_in_use(object, path.dentry); -out_dput: - dput(path.dentry); out: cachefiles_end_secure(cache, saved_cred); return file; + +err_unuse: + cachefiles_do_unmark_inode_in_use(object, file_inode(file)); + fput(file); +err: + file = ERR_PTR(ret); + goto out; } /* @@ -569,8 +548,11 @@ static bool cachefiles_open_file(struct cachefiles_object *object, _enter("%pd", dentry); - if (!cachefiles_mark_inode_in_use(object, dentry)) + if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + dentry, d_inode(dentry)->i_ino); return false; + } /* We need to open a file interface onto a data file now as we can't do * it on demand because writeback called from do_exit() sees @@ -624,7 +606,7 @@ check_failed: error_fput: fput(file); error: - cachefiles_do_unmark_inode_in_use(object, dentry); + cachefiles_do_unmark_inode_in_use(object, d_inode(dentry)); dput(dentry); return false; } diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index f4fc8e0b847c..c7e8dd5b58d4 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -85,13 +85,14 @@ retry: return acl; } -int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int ret = 0, size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; + struct inode *inode = d_inode(dentry); struct timespec64 old_ctime = inode->i_ctime; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dcf701b05cc1..8c74871e37c9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -288,7 +288,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); if (err == 0) err 
= -EFAULT; @@ -327,7 +327,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); if (err < 0) { dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); @@ -1367,7 +1367,7 @@ out: folio_put(folio); if (check_cap) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY); return copied; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 53cfe026b3ea..4b159f97fe7b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode, cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; + wake_up_all(&ci->i_cap_wq); } /* @@ -1897,8 +1898,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. */ -void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session) +void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); @@ -1912,15 +1912,14 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, bool queue_invalidate = false; bool tried_invalidate = false; bool queue_writeback = false; - - if (session) - ceph_get_mds_session(session); + struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; + /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); - ceph_put_mds_session(session); return; } @@ -2247,7 +2246,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; - unsigned int max_sessions; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); @@ -2266,27 +2264,23 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) spin_unlock(&ci->i_unsafe_lock); /* - * The mdsc->max_sessions is unlikely to be changed - * mostly, here we will retry it by reallocating the - * sessions array memory to get rid of the mdsc->mutex - * lock. - */ -retry: - max_sessions = mdsc->max_sessions; - - /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. 
*/ - if ((req1 || req2) && likely(max_sessions)) { - struct ceph_mds_session **sessions = NULL; - struct ceph_mds_session *s; + if (req1 || req2) { struct ceph_mds_request *req; + struct ceph_mds_session **sessions; + struct ceph_mds_session *s; + unsigned int max_sessions; int i; - sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL); + mutex_lock(&mdsc->mutex); + max_sessions = mdsc->max_sessions; + + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { + mutex_unlock(&mdsc->mutex); err = -ENOMEM; goto out; } @@ -2298,16 +2292,6 @@ retry: s = req->r_session; if (!s) continue; - if (unlikely(s->s_mds >= max_sessions)) { - spin_unlock(&ci->i_unsafe_lock); - for (i = 0; i < max_sessions; i++) { - s = sessions[i]; - if (s) - ceph_put_mds_session(s); - } - kfree(sessions); - goto retry; - } if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; @@ -2320,16 +2304,6 @@ retry: s = req->r_session; if (!s) continue; - if (unlikely(s->s_mds >= max_sessions)) { - spin_unlock(&ci->i_unsafe_lock); - for (i = 0; i < max_sessions; i++) { - s = sessions[i]; - if (s) - ceph_put_mds_session(s); - } - kfree(sessions); - goto retry; - } if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; @@ -2341,11 +2315,12 @@ retry: /* the auth MDS */ spin_lock(&ci->i_ceph_lock); if (ci->i_auth_cap) { - s = ci->i_auth_cap->session; - if (!sessions[s->s_mds]) - sessions[s->s_mds] = ceph_get_mds_session(s); + s = ci->i_auth_cap->session; + if (!sessions[s->s_mds]) + sessions[s->s_mds] = ceph_get_mds_session(s); } spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&mdsc->mutex); /* send flush mdlog request to MDSes */ for (i = 0; i < max_sessions; i++) { @@ -2759,13 +2734,17 @@ again: * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. + * + * For RDCACHE|RD -> RD, there is no need to wait and we can + * just exclude the revoking caps and force a sync read.
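[Editor's note: a worked example of the exclude logic that follows; the values are illustrative and use the standard cap mask macros, not code from this patch:]

/* Worked example: the MDS is revoking Fc (RDCACHE|RD -> RD) while a
 * reader needs Fr and would like Fr|Fc. */
static void example_exclude(void)
{
	int need = CEPH_CAP_FILE_RD;
	int want = CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
	int have = CEPH_CAP_FILE_RD;	/* Fc already gone from issued */
	int implemented = CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;

	int not = want & ~(have & need);	/* Fc */
	int revoking = implemented & ~have;	/* Fc */
	int exclude = revoking & not;		/* Fc */

	/* The old code waited because (revoking & not) != 0.  The new code
	 * proceeds because exclude contains no Fb; and since
	 * (have & want) != want, *got = need, i.e. Fr only, so the read is
	 * performed synchronously without the page cache cap. */
	WARN_ON(exclude & CEPH_CAP_FILE_BUFFER);
}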
*/ int not = want & ~(have & need); int revoking = implemented & ~have; + int exclude = revoking & not; dout("get_cap_refs %p have %s but not %s (revoking %s)\n", inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); - if ((revoking & not) == 0) { + if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { @@ -2787,7 +2766,7 @@ again: snap_rwsem_locked = true; } if ((have & want) == want) - *got = need | want; + *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); @@ -2870,7 +2849,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) check = 1; spin_unlock(&ci->i_ceph_lock); if (check) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } static inline int get_used_fmode(int caps) @@ -3159,7 +3138,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, switch (mode) { case PUT_CAP_REFS_SYNC: if (last) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); else if (flushsnaps) ceph_flush_snaps(ci, NULL); break; @@ -3274,7 +3253,7 @@ unlock: spin_unlock(&ci->i_ceph_lock); if (last) { - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } @@ -3550,6 +3529,9 @@ static void handle_cap_grant(struct inode *inode, check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ + /* If there are new caps, try to wake up the waiters */ + if (~cap->issued & newcaps) + wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { @@ -3620,10 +3602,9 @@ static void handle_cap_grant(struct inode *inode, mutex_unlock(&session->s_mutex); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, - session); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); + ceph_check_caps(ci, CHECK_CAPS_NOINVAL); } /* @@ -4349,7 +4330,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) if (inode) { spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } @@ -4378,7 +4359,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); - ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e7e2ebac330d..6c7026cc8988 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2033,7 +2033,7 @@ const struct inode_operations ceph_dir_iops = { .getattr = ceph_getattr, .setattr = ceph_setattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e0fa66ac8b9f..f780e4e0d062 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct inode *inode = __lookup_inode(sb, ino); + struct ceph_inode_info *ci = ceph_inode(inode); int err; if (IS_ERR(inode)) @@ -192,7 +193,7 @@ static struct
dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_PTR(err); } /* -ESTALE if inode as been unlinked and no file is open */ - if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { + if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) { iput(inode); return ERR_PTR(-ESTALE); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 04fd34557de8..764598e1efd9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -313,7 +313,7 @@ int ceph_renew_caps(struct inode *inode, int fmode) spin_unlock(&ci->i_ceph_lock); dout("renew caps %p want %s issued %s updating mds_wanted\n", inode, ceph_cap_string(wanted), ceph_cap_string(issued)); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return 0; } spin_unlock(&ci->i_ceph_lock); @@ -408,7 +408,7 @@ int ceph_open(struct inode *inode, struct file *file) if ((issued & wanted) != wanted && (mds_wanted & wanted) != wanted && ceph_snap(inode) != CEPH_SNAPDIR) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return ceph_init_file(inode, file, fmode); } else if (ceph_snap(inode) != CEPH_NOSNAP && @@ -534,14 +534,23 @@ static void wake_async_create_waiters(struct inode *inode, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + bool check_cap = false; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + + if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS; + check_cap = true; + } } ceph_kick_flushing_inode_caps(session, ci); spin_unlock(&ci->i_ceph_lock); + + if (check_cap) + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } static void ceph_async_create_cb(struct ceph_mds_client *mdsc, @@ -1092,7 +1101,7 @@ static void ceph_aio_complete(struct inode *inode, loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; if (endoff > i_size_read(inode)) { if (ceph_inode_set_size(inode, endoff)) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } spin_lock(&ci->i_ceph_lock); @@ -1161,7 +1170,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, + iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs, osd_data->num_bvecs, len); iov_iter_advance(&i, rc); iov_iter_zero(zlen, &i); @@ -1400,7 +1409,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int zlen = min_t(size_t, len - ret, size - pos - ret); - iov_iter_bvec(&i, READ, bvecs, num_pages, len); + iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len); iov_iter_advance(&i, ret); iov_iter_zero(zlen, &i); ret += zlen; @@ -1421,8 +1430,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write && pos > size) { if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); + CHECK_CAPS_AUTHONLY); } } @@ -1577,8 +1585,7 @@ out: check_caps = ceph_inode_set_size(inode, pos); if (check_caps) ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); + CHECK_CAPS_AUTHONLY); } } @@ -1906,7 +1913,7 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -2521,8 +2528,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t 
src_off, /* Let the MDS know about dst file size change */ if (ceph_inode_set_size(dst_inode, dst_off) || ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, - NULL); + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 42351d7a0dd6..23d05ec87fcc 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -126,7 +126,7 @@ const struct inode_operations ceph_file_iops = { .setattr = ceph_setattr, .getattr = ceph_getattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, }; @@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32() % nsplits; + i = get_random_u32_below(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; @@ -1909,7 +1909,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_unlock(&ci->i_truncate_mutex); out: if (check) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } /* @@ -1969,7 +1969,7 @@ retry: mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); wake_up_all(&ci->i_cap_wq); } @@ -1991,7 +1991,7 @@ static void ceph_inode_work(struct work_struct *work) __ceph_do_pending_vmtruncate(inode); if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) ceph_flush_snaps(ci, NULL); @@ -2192,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); inode->i_ctime = attr->ia_ctime; + inode_inc_iversion_raw(inode); } release &= issued; @@ -2254,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); + err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode); return err; } @@ -2356,6 +2357,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, goto out; } + req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { err = -ENOMEM; @@ -2447,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct super_block *sb = inode->i_sb; struct ceph_inode_info *ci = ceph_inode(inode); u32 valid_mask = STATX_BASIC_STATS; int err = 0; @@ -2476,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, } if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; + stat->dev = sb->s_dev; else stat->dev = ci->i_snapid_map ? 
ci->i_snapid_map->dev : 0; if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) { stat->size = ci->i_rbytes; - else + } else if (ceph_snap(inode) == CEPH_SNAPDIR) { + struct ceph_inode_info *pci; + struct ceph_snap_realm *realm; + struct inode *parent; + + parent = ceph_lookup_inode(sb, ceph_ino(inode)); + if (IS_ERR(parent)) + return PTR_ERR(parent); + + pci = ceph_inode(parent); + spin_lock(&pci->i_ceph_lock); + realm = pci->i_snap_realm; + if (realm) + stat->size = realm->num_snaps; + else + stat->size = 0; + spin_unlock(&pci->i_ceph_lock); + iput(parent); + } else { stat->size = ci->i_files + ci->i_subdirs; + } stat->blocks = 0; stat->blksize = 65536; /* diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 6e061bf62ad4..deac817647eb 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -253,7 +253,7 @@ static long ceph_ioctl_lazyio(struct file *file) spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } else { dout("ioctl_layzio: file %p already lazy\n", file); } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 3e2843e86e27..f3b461c708a8 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -364,7 +364,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *fcntl_count = 0; *flock_count = 0; - ctx = inode->i_flctx; + ctx = locks_inode_context(inode); if (ctx) { spin_lock(&ctx->flc_lock); list_for_each_entry(lock, &ctx->flc_posix, fl_list) @@ -418,7 +418,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); int err = 0; int seen_fcntl = 0; int seen_flock = 0; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 80f8b9ec1a31..26a0a8b9975e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2318,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; + req->r_feature_needed = -1; kref_init(&req->r_kref); RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); @@ -2916,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc, dout("do_request mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); + + /* + * Old MDSes will crash when they see unknown OPs + */ + if (req->r_feature_needed > 0 && + !test_bit(req->r_feature_needed, &session->s_features)) { + err = -EOPNOTSUPP; + goto out_session; + } + if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 256e3eada6c1..0598faa50e2e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -31,8 +31,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_METRIC_COLLECT, CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_OP_GETVXATTR, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -44,6 +45,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ + CEPHFS_FEATURE_OP_GETVXATTR, \ } /* @@ -336,6 +338,8 @@ struct
ceph_mds_request { long long r_dir_ordered_cnt; int r_readdir_cache_idx; + int r_feature_needed; + struct ceph_cap_reservation r_caps_reservation; }; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 8d0a6d2c2da4..7dac21ee6ce7 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) return -1; /* pick */ - n = prandom_u32() % n; + n = get_random_u32_below(n); for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 864cdaa0d2bd..e4151852184e 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -763,7 +763,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ - struct ceph_snap_realm *realm = NULL; + struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; int rebuild_snapcs; @@ -774,6 +774,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, dout("%s deletion=%d\n", __func__, deletion); more: + realm = NULL; rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); ri = p; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 40630e6f691c..30bdb391a0dc 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -593,6 +593,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) #define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ +#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async + creating finishes */ /* * Masks of ceph inode work. 
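[Editor's note: the r_feature_needed field above and the __do_request() check shown earlier form a feature gate. A usage sketch mirroring the ceph_do_getvxattr() hunk; the helper name is hypothetical and mdsc is assumed in scope:]

/* Hypothetical helper showing the feature-gate pattern. */
static int example_send_getvxattr(struct ceph_mds_client *mdsc)
{
	struct ceph_mds_request *req;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR,
				       USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	/* __do_request() now fails the call with -EOPNOTSUPP instead of
	 * sending an op that an old MDS would crash on. */
	req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
	/* ... set r_path2 etc., then ceph_mdsc_do_request() ... */
	ceph_mdsc_put_request(req);
	return 0;
}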
@@ -1117,7 +1119,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); struct posix_acl *ceph_get_acl(struct inode *, int, bool); int ceph_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, int type); + struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_init_inode_acls(struct inode *inode, @@ -1200,8 +1202,7 @@ extern void ceph_remove_capsnap(struct inode *inode, extern void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession); extern bool __ceph_should_report_size(struct ceph_inode_info *ci); -extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags); extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern int ceph_drop_caps_for_unlink(struct inode *inode); diff --git a/fs/char_dev.c b/fs/char_dev.c index ba0ded7842a7..13deb45f1ec6 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -483,17 +483,24 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count) p->dev = dev; p->count = count; - if (WARN_ON(dev == WHITEOUT_DEV)) - return -EBUSY; + if (WARN_ON(dev == WHITEOUT_DEV)) { + error = -EBUSY; + goto err; + } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) - return error; + goto err; kobject_get(p->kobj.parent); return 0; + +err: + kfree_const(p->kobj.name); + p->kobj.name = NULL; + return error; } /** @@ -547,7 +554,7 @@ int cdev_device_add(struct cdev *cdev, struct device *dev) } rc = device_add(dev); - if (rc) + if (rc && dev->devt) cdev_del(cdev); return rc; diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index b401339f6e73..60399081046a 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -5,12 +5,99 @@ * Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com> */ +#include <linux/namei.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" #include "smb2proto.h" #include "cached_dir.h" +static struct cached_fid *init_cached_dir(const char *path); +static void free_cached_dir(struct cached_fid *cfid); + +static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, + const char *path, + bool lookup_only) +{ + struct cached_fid *cfid; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (!strcmp(cfid->path, path)) { + /* + * If it doesn't have a lease it is either not yet + * fully cached or it may be in the process of + * being deleted due to a lease break. 
+ */ + if (!cfid->has_lease) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + kref_get(&cfid->refcount); + spin_unlock(&cfids->cfid_list_lock); + return cfid; + } + } + if (lookup_only) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + if (cfids->num_entries >= MAX_CACHED_FIDS) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + cfid = init_cached_dir(path); + if (cfid == NULL) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + cfid->cfids = cfids; + cfids->num_entries++; + list_add(&cfid->entry, &cfids->entries); + cfid->on_list = true; + kref_get(&cfid->refcount); + spin_unlock(&cfids->cfid_list_lock); + return cfid; +} + +static struct dentry * +path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path) +{ + struct dentry *dentry; + const char *s, *p; + char sep; + + sep = CIFS_DIR_SEP(cifs_sb); + dentry = dget(cifs_sb->root); + s = path; + + do { + struct inode *dir = d_inode(dentry); + struct dentry *child; + + if (!S_ISDIR(dir->i_mode)) { + dput(dentry); + dentry = ERR_PTR(-ENOTDIR); + break; + } + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + child = lookup_positive_unlocked(p, dentry, s - p); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); + return dentry; +} + /* * Open and cache a directory handle. * If error then *cfid is not initialized. @@ -31,54 +118,57 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct kvec qi_iov[1]; int rc, flags = 0; - __le16 utf16_path = 0; /* Null - since an open of top of share */ + __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_II; struct cifs_fid *pfid; - struct dentry *dentry; + struct dentry *dentry = NULL; struct cached_fid *cfid; + struct cached_fids *cfids; - if (tcon == NULL || tcon->nohandlecache || + if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || is_smb1_server(tcon->ses->server)) return -EOPNOTSUPP; ses = tcon->ses; server = ses->server; + cfids = tcon->cfids; - if (cifs_sb->root == NULL) - return -ENOENT; + if (!server->ops->new_lease_key) + return -EIO; - if (strlen(path)) + if (cifs_sb->root == NULL) return -ENOENT; - dentry = cifs_sb->root; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; - cfid = tcon->cfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->is_valid) { - cifs_dbg(FYI, "found a cached root file handle\n"); + cfid = find_or_create_cached_dir(cfids, path, lookup_only); + if (cfid == NULL) { + kfree(utf16_path); + return -ENOENT; + } + /* + * At this point we either have a lease already and we can just + * return it. If not, we are guaranteed to be the only thread accessing + * this cfid. + */ + if (cfid->has_lease) { *ret_cfid = cfid; - kref_get(&cfid->refcount); - mutex_unlock(&cfid->fid_mutex); + kfree(utf16_path); return 0; } /* * We do not hold the lock for the open because in case - * SMB2_open needs to reconnect, it will end up calling - * cifs_mark_open_files_invalid() which takes the lock again - * thus causing a deadlock + * SMB2_open needs to reconnect. + * This is safe because no other thread will be able to get a ref + * to the cfid until we have finished opening the file and (possibly) + * acquired a lease.
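[Editor's note: for orientation, the consumer pattern for the rewritten cache looks roughly like the sketch below; the helper name is hypothetical, and with this series the path argument may now be any cached directory, not just the share root:]

/* Hypothetical caller of the per-path handle cache. */
static int example_use_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
				  struct cifs_sb_info *cifs_sb)
{
	struct cached_fid *cfid = NULL;
	int rc;

	/* lookup_only=false: create and open the handle if not cached */
	rc = open_cached_dir(xid, tcon, "", cifs_sb, false, &cfid);
	if (rc)
		return rc;
	/* ... issue compounded SMB2 requests against cfid->fid ... */
	close_cached_dir(cfid);	/* put our kref; the lease holds its own */
	return 0;
}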
*/ - mutex_unlock(&cfid->fid_mutex); - - if (lookup_only) - return -ENOENT; - if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - if (!server->ops->new_lease_key) - return -EIO; - pfid = &cfid->fid; server->ops->new_lease_key(pfid); @@ -99,7 +189,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.reconnect = false; rc = SMB2_open_init(tcon, server, - &rqst[0], &oplock, &oparms, &utf16_path); + &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto oshr_free; smb2_set_next_command(tcon, &rqst[0]); @@ -122,47 +212,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); - mutex_lock(&cfid->fid_mutex); - - /* - * Now we need to check again as the cached root might have - * been successfully re-opened from a concurrent process - */ - - if (cfid->is_valid) { - /* work was already done */ - - /* stash fids for close() later */ - struct cifs_fid fid = { - .persistent_fid = pfid->persistent_fid, - .volatile_fid = pfid->volatile_fid, - }; - - /* - * caller expects this func to set the fid in cfid to valid - * cached root, so increment the refcount. - */ - kref_get(&cfid->refcount); - - mutex_unlock(&cfid->fid_mutex); - - if (rc == 0) { - /* close extra handle outside of crit sec */ - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); - } - rc = 0; - goto oshr_free; - } - - /* Cached root is still invalid, continue normaly */ - if (rc) { if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); } - goto oshr_exit; + goto oshr_free; } atomic_inc(&tcon->num_remote_opens); @@ -174,30 +230,18 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ - cfid->tcon = tcon; - cfid->is_valid = true; - cfid->dentry = dentry; - dget(dentry); - kref_init(&cfid->refcount); + if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) + goto oshr_free; - /* BB TBD check to see if oplock level check can be removed below */ - if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { - /* - * See commit 2f94a3125b87. 
Increment the refcount when we - * get a lease for root, release it if lease break occurs - */ - kref_get(&cfid->refcount); - cfid->has_lease = true; - smb2_parse_contexts(server, o_rsp, - &oparms.fid->epoch, - oparms.fid->lease_key, &oplock, - NULL, NULL); - } else - goto oshr_exit; + + smb2_parse_contexts(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key, &oplock, + NULL, NULL); qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) - goto oshr_exit; + goto oshr_free; if (!smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), sizeof(struct smb2_file_all_info), @@ -205,15 +249,42 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, (char *)&cfid->file_all_info)) cfid->file_all_info_is_valid = true; + if (!path[0]) + dentry = dget(cifs_sb->root); + else { + dentry = path_to_dentry(cifs_sb, path); + if (IS_ERR(dentry)) { + rc = -ENOENT; + goto oshr_free; + } + } + cfid->dentry = dentry; + cfid->tcon = tcon; cfid->time = jiffies; + cfid->is_open = true; + cfid->has_lease = true; -oshr_exit: - mutex_unlock(&cfid->fid_mutex); oshr_free: + kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + spin_lock(&cfids->cfid_list_lock); + if (!cfid->has_lease) { + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + } + rc = -ENOENT; + } + spin_unlock(&cfids->cfid_list_lock); + if (rc) { + free_cached_dir(cfid); + cfid = NULL; + } + if (rc == 0) *ret_cfid = cfid; @@ -225,18 +296,22 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct cached_fid **ret_cfid) { struct cached_fid *cfid; + struct cached_fids *cfids = tcon->cfids; - cfid = tcon->cfid; + if (cfids == NULL) + return -ENOENT; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry == dentry) { - cifs_dbg(FYI, "found a cached root file handle by dentry\n"); - *ret_cfid = cfid; - kref_get(&cfid->refcount); - mutex_unlock(&cfid->fid_mutex); - return 0; + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (dentry && cfid->dentry == dentry) { + cifs_dbg(FYI, "found a cached root file handle by dentry\n"); + kref_get(&cfid->refcount); + *ret_cfid = cfid; + spin_unlock(&cfids->cfid_list_lock); + return 0; + } } - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfids->cfid_list_lock); return -ENOENT; } @@ -245,63 +320,50 @@ smb2_close_cached_fid(struct kref *ref) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); - struct cached_dirent *dirent, *q; - if (cfid->is_valid) { - cifs_dbg(FYI, "clear cached root file handle\n"); - SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid); + spin_lock(&cfid->cfids->cfid_list_lock); + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfid->cfids->num_entries--; } + spin_unlock(&cfid->cfids->cfid_list_lock); - /* - * We only check validity above to send SMB2_close, - * but we still need to invalidate these entries - * when this function is called - */ - cfid->is_valid = false; - cfid->file_all_info_is_valid = false; - cfid->has_lease = false; - if (cfid->dentry) { - dput(cfid->dentry); - cfid->dentry = NULL; - } - /* - * Delete all cached dirent names - */ - mutex_lock(&cfid->dirents.de_mutex); - list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { - list_del(&dirent->entry); - 
kfree(dirent->name); - kfree(dirent); + dput(cfid->dentry); + cfid->dentry = NULL; + + if (cfid->is_open) { + SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid); } - cfid->dirents.is_valid = 0; - cfid->dirents.is_failed = 0; - cfid->dirents.ctx = NULL; - cfid->dirents.pos = 0; - mutex_unlock(&cfid->dirents.de_mutex); + free_cached_dir(cfid); } -void close_cached_dir(struct cached_fid *cfid) +void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb) { - mutex_lock(&cfid->fid_mutex); - kref_put(&cfid->refcount, smb2_close_cached_fid); - mutex_unlock(&cfid->fid_mutex); -} + struct cached_fid *cfid = NULL; + int rc; -void close_cached_dir_lease_locked(struct cached_fid *cfid) -{ + rc = open_cached_dir(xid, tcon, name, cifs_sb, true, &cfid); + if (rc) { + cifs_dbg(FYI, "no cached dir found for rmdir(%s)\n", name); + return; + } + spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->has_lease) { cfid->has_lease = false; kref_put(&cfid->refcount, smb2_close_cached_fid); } + spin_unlock(&cfid->cfids->cfid_list_lock); + close_cached_dir(cfid); } -void close_cached_dir_lease(struct cached_fid *cfid) + +void close_cached_dir(struct cached_fid *cfid) { - mutex_lock(&cfid->fid_mutex); - close_cached_dir_lease_locked(cfid); - mutex_unlock(&cfid->fid_mutex); + kref_put(&cfid->refcount, smb2_close_cached_fid); } /* @@ -314,34 +376,60 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) struct cached_fid *cfid; struct cifs_tcon *tcon; struct tcon_link *tlink; + struct cached_fids *cfids; for (node = rb_first(root); node; node = rb_next(node)) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); tcon = tlink_tcon(tlink); if (IS_ERR(tcon)) continue; - cfid = tcon->cfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry) { + cfids = tcon->cfids; + if (cfids == NULL) + continue; + list_for_each_entry(cfid, &cfids->entries, entry) { dput(cfid->dentry); cfid->dentry = NULL; } - mutex_unlock(&cfid->fid_mutex); } } /* - * Invalidate and close all cached dirs when a TCON has been reset + * Invalidate all cached dirs when a TCON has been reset * due to a session loss. */ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) { - mutex_lock(&tcon->cfid->fid_mutex); - tcon->cfid->is_valid = false; - /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_cached_dir_lease_locked(tcon->cfid); - memset(&tcon->cfid->fid, 0, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->cfid->fid_mutex); + struct cached_fids *cfids = tcon->cfids; + struct cached_fid *cfid, *q; + LIST_HEAD(entry); + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + list_move(&cfid->entry, &entry); + cfids->num_entries--; + cfid->is_open = false; + cfid->on_list = false; + /* To prevent race with smb2_cached_lease_break() */ + kref_get(&cfid->refcount); + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + cancel_work_sync(&cfid->lease_break); + if (cfid->has_lease) { + /* + * The lease was never cancelled from the server so we + * need to drop the reference.
+ */ + spin_lock(&cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); + } + /* Drop the extra reference opened above */ + kref_put(&cfid->refcount, smb2_close_cached_fid); + } } static void @@ -350,39 +438,121 @@ smb2_cached_lease_break(struct work_struct *work) { struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_cached_dir_lease(cfid); + spin_lock(&cfid->cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfid->cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); } int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) { - if (tcon->cfid->is_valid && - !memcmp(lease_key, - tcon->cfid->fid.lease_key, - SMB2_LEASE_KEY_SIZE)) { - tcon->cfid->time = 0; - INIT_WORK(&tcon->cfid->lease_break, - smb2_cached_lease_break); - queue_work(cifsiod_wq, - &tcon->cfid->lease_break); - return true; + struct cached_fids *cfids = tcon->cfids; + struct cached_fid *cfid; + + if (cfids == NULL) + return false; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (cfid->has_lease && + !memcmp(lease_key, + cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE)) { + cfid->time = 0; + /* + * We found a lease; remove it from the list + * so no threads can access it. + */ + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + + queue_work(cifsiod_wq, + &cfid->lease_break); + spin_unlock(&cfids->cfid_list_lock); + return true; + } } + spin_unlock(&cfids->cfid_list_lock); return false; } -struct cached_fid *init_cached_dir(void) +static struct cached_fid *init_cached_dir(const char *path) { struct cached_fid *cfid; - cfid = kzalloc(sizeof(*cfid), GFP_KERNEL); + cfid = kzalloc(sizeof(*cfid), GFP_ATOMIC); if (!cfid) return NULL; + cfid->path = kstrdup(path, GFP_ATOMIC); + if (!cfid->path) { + kfree(cfid); + return NULL; + } + + INIT_WORK(&cfid->lease_break, smb2_cached_lease_break); + INIT_LIST_HEAD(&cfid->entry); INIT_LIST_HEAD(&cfid->dirents.entries); mutex_init(&cfid->dirents.de_mutex); - mutex_init(&cfid->fid_mutex); + spin_lock_init(&cfid->fid_lock); + kref_init(&cfid->refcount); return cfid; } -void free_cached_dir(struct cifs_tcon *tcon) +static void free_cached_dir(struct cached_fid *cfid) +{ + struct cached_dirent *dirent, *q; + + dput(cfid->dentry); + cfid->dentry = NULL; + + /* + * Delete all cached dirent names + */ + list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { + list_del(&dirent->entry); + kfree(dirent->name); + kfree(dirent); + } + + kfree(cfid->path); + cfid->path = NULL; + kfree(cfid); +} + +struct cached_fids *init_cached_dirs(void) +{ + struct cached_fids *cfids; + + cfids = kzalloc(sizeof(*cfids), GFP_KERNEL); + if (!cfids) + return NULL; + spin_lock_init(&cfids->cfid_list_lock); + INIT_LIST_HEAD(&cfids->entries); + return cfids; +} + +/* + * Called from tconInfoFree when we are tearing down the tcon. + * There are no active users or open files/directories at this point.
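[Editor's note: a lifecycle sketch of the allocation/teardown pair; the allocation call site is an assumption based on the tconInfoFree comment above, not part of this hunk:]

/* at tcon allocation, e.g. in tconInfoAlloc() (assumed call site) */
tcon->cfids = init_cached_dirs();
if (!tcon->cfids)
	return NULL;

/* at teardown, from tconInfoFree(), once no users remain */
free_cached_dirs(tcon->cfids);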
+ */ +void free_cached_dirs(struct cached_fids *cfids) { - kfree(tcon->cfid); + struct cached_fid *cfid, *q; + LIST_HEAD(entry); + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + cfid->on_list = false; + cfid->is_open = false; + list_move(&cfid->entry, &entry); + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + free_cached_dir(cfid); + } + + kfree(cfids); } diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h index bd262dc8b179..2f4e764c9ca9 100644 --- a/fs/cifs/cached_dir.h +++ b/fs/cifs/cached_dir.h @@ -31,13 +31,17 @@ struct cached_dirents { }; struct cached_fid { - bool is_valid:1; /* Do we have a useable root fid */ - bool file_all_info_is_valid:1; + struct list_head entry; + struct cached_fids *cfids; + const char *path; bool has_lease:1; + bool is_open:1; + bool on_list:1; + bool file_all_info_is_valid:1; unsigned long time; /* jiffies of when lease was taken */ struct kref refcount; struct cifs_fid fid; - struct mutex fid_mutex; + spinlock_t fid_lock; struct cifs_tcon *tcon; struct dentry *dentry; struct work_struct lease_break; @@ -45,8 +49,18 @@ struct cached_fid { struct cached_dirents dirents; }; -extern struct cached_fid *init_cached_dir(void); -extern void free_cached_dir(struct cifs_tcon *tcon); +#define MAX_CACHED_FIDS 16 +struct cached_fids { + /* Must be held when: + * - accessing the cfids->entries list + */ + spinlock_t cfid_list_lock; + int num_entries; + struct list_head entries; +}; + +extern struct cached_fids *init_cached_dirs(void); +extern void free_cached_dirs(struct cached_fids *cfids); extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, @@ -55,8 +69,10 @@ extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct dentry *dentry, struct cached_fid **cfid); extern void close_cached_dir(struct cached_fid *cfid); -extern void close_cached_dir_lease(struct cached_fid *cfid); -extern void close_cached_dir_lease_locked(struct cached_fid *cfid); +extern void drop_cached_dir_by_name(const unsigned int xid, + struct cifs_tcon *tcon, + const char *name, + struct cifs_sb_info *cifs_sb); extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb); extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon); extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index c05477e28cff..90850da390ae 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -87,7 +87,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) { __u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); - seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count); + seq_printf(m, "%s Mounts: %d ", tcon->tree_name, tcon->tc_count); if (tcon->nativeFileSystem) seq_printf(m, "Type: %s ", tcon->nativeFileSystem); seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d", @@ -601,7 +601,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { i++; - seq_printf(m, "\n%d) %s", i, tcon->treeName); + seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); seq_printf(m, "\nSMBs: %d", diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index ee4ea2b60c0f..d44808263cfb 100644 --- 
a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -108,8 +108,8 @@ do { \ #define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) \ do { \ const char *tn = ""; \ - if (tcon && tcon->treeName) \ - tn = tcon->treeName; \ + if (tcon && tcon->tree_name) \ + tn = tcon->tree_name; \ if ((type) & FYI && cifsFYI & CIFS_INFO) { \ pr_debug_ ## ratefunc("%s: %s " fmt, \ __FILE__, tn, ##__VA_ARGS__); \ @@ -150,7 +150,7 @@ do { \ #define cifs_tcon_dbg(type, fmt, ...) \ do { \ if (0) \ - pr_debug("%s " fmt, tcon->treeName, ##__VA_ARGS__); \ + pr_debug("%s " fmt, tcon->tree_name, ##__VA_ARGS__); \ } while (0) #define cifs_info(fmt, ...) \ diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index b87cbbe6d2d4..332588e77c31 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -91,6 +91,13 @@ struct smb3_notify { bool watch_tree; } __packed; +struct smb3_notify_info { + __u32 completion_filter; + bool watch_tree; + __u32 data_len; /* size of notify data below */ + __u8 notify_data[]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) @@ -100,7 +107,8 @@ struct smb3_notify { #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) #define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) -#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) +#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) +#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* * Flags for going down operation diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 342717bf1dc2..6f3285f1dfee 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -189,7 +189,7 @@ init_cifs_spnego(void) * spnego upcalls. 
*/ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 1e4c7cc5287f..7233c6a7e6d7 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -256,23 +256,23 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) const char *share_name; const char *net_name; - net_name = extract_hostname(tcon->treeName); + net_name = extract_hostname(tcon->tree_name); if (IS_ERR(net_name)) { int ret; ret = PTR_ERR(net_name); cifs_dbg(VFS, "%s: failed to extract host name from target '%s': %d\n", - __func__, tcon->treeName, ret); + __func__, tcon->tree_name, ret); return ERR_PTR(-EINVAL); } - share_name = extract_sharename(tcon->treeName); + share_name = extract_sharename(tcon->tree_name); if (IS_ERR(share_name)) { int ret; ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", - __func__, tcon->treeName, ret); + __func__, tcon->tree_name, ret); kfree(net_name); return ERR_PTR(-EINVAL); } @@ -335,14 +335,14 @@ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon) goto fail; } - reg->net_name = extract_hostname(tcon->treeName); + reg->net_name = extract_hostname(tcon->tree_name); if (IS_ERR(reg->net_name)) { ret = PTR_ERR(reg->net_name); cifs_dbg(VFS, "%s: failed to extract host name from target: %d\n", __func__, ret); goto fail_idr; } - reg->share_name = extract_sharename(tcon->treeName); + reg->share_name = extract_sharename(tcon->tree_name); if (IS_ERR(reg->share_name)) { ret = PTR_ERR(reg->share_name); cifs_dbg(VFS, "%s: failed to extract share name from target: %d\n", __func__, ret); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fa480d62f313..bbf58c2439da 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -13,6 +13,9 @@ #include <linux/string.h> #include <linux/keyctl.h> #include <linux/key-type.h> +#include <uapi/linux/posix_acl.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include <keys/user-type.h> #include "cifspdu.h" #include "cifsglob.h" @@ -20,6 +23,8 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "fs_context.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" /* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { @@ -465,7 +470,7 @@ init_cifs_idmap(void) * this is used to prevent malicious redirections from being installed * with add_key(). 
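[Editor's note: both keyring initialisers in this region follow the same updated credential pattern; a condensed sketch of it under a hypothetical helper name, with error handling as in the surrounding functions:]

static int example_init_keyring_cred(void)
{
	struct cred *cred;

	/* Clone the initial task's credentials explicitly; NULL is no
	 * longer passed to prepare_kernel_cred() here. */
	cred = prepare_kernel_cred(&init_task);
	if (!cred)
		return -ENOMEM;
	/* ... set up the keyring on this private cred set ... */
	put_cred(cred);
	return 0;
}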
*/ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; @@ -1668,3 +1673,137 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, kfree(pntsd); return rc; } + +struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ +#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) + struct posix_acl *acl = NULL; + ssize_t rc = -EOPNOTSUPP; + unsigned int xid; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + const char *full_path; + void *page; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + page = alloc_dentry_path(); + + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + acl = ERR_CAST(full_path); + goto out; + } + + /* return alt name if available as pseudo attr */ + switch (type) { + case ACL_TYPE_ACCESS: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_get_acl(xid, pTcon, full_path, &acl, + ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + + case ACL_TYPE_DEFAULT: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_get_acl(xid, pTcon, full_path, &acl, + ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + } + + if (rc < 0) { + if (rc == -EINVAL) + acl = ERR_PTR(-EOPNOTSUPP); + else + acl = ERR_PTR(rc); + } + +out: + free_dentry_path(page); + free_xid(xid); + cifs_put_tlink(tlink); + return acl; +#else + return ERR_PTR(-EOPNOTSUPP); +#endif +} + +int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ +#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) + int rc = -EOPNOTSUPP; + unsigned int xid; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + const char *full_path; + void *page; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + page = alloc_dentry_path(); + + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto out; + } + + if (!acl) + goto out; + + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + if (posix_acl_xattr_size(acl->a_count) > CIFSMaxBufSize) { + cifs_dbg(FYI, "size of EA value too large\n"); + rc = -EOPNOTSUPP; + goto out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_set_acl(xid, pTcon, full_path, acl, + ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + + case ACL_TYPE_DEFAULT: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_set_acl(xid, pTcon, full_path, acl, + ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + } + +out: + free_dentry_path(page); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +#else + return -EOPNOTSUPP; +#endif +} diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 46f5718754f9..5db73c0f792a 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -103,26 +103,24 @@ static int cifs_calc_signature(struct smb_rqst *rqst, if (!rqst->rq_iov || !signature || !server) return -EINVAL; - 
rc = cifs_alloc_hash("md5", &server->secmech.md5, - &server->secmech.sdescmd5); + rc = cifs_alloc_hash("md5", &server->secmech.md5); if (rc) return -1; - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + rc = crypto_shash_init(server->secmech.md5); if (rc) { cifs_dbg(VFS, "%s: Could not init md5\n", __func__); return rc; } - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(server->secmech.md5, server->session_key.response, server->session_key.len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } - return __cifs_calc_signature(rqst, server, signature, - &server->secmech.sdescmd5->shash); + return __cifs_calc_signature(rqst, server, signature, server->secmech.md5); } /* must be called with server->srv_mutex held */ @@ -412,7 +410,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, wchar_t *domain; wchar_t *server; - if (!ses->server->secmech.sdeschmacmd5) { + if (!ses->server->secmech.hmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } @@ -420,14 +418,14 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash, nls_cp); - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__); return rc; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; @@ -448,7 +446,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, memset(user, '\0', 2); } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, (char *)user, 2 * len); kfree(user); if (rc) { @@ -468,7 +466,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, nls_cp); rc = - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + crypto_shash_update(ses->server->secmech.hmacmd5, (char *)domain, 2 * len); kfree(domain); if (rc) { @@ -488,7 +486,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); rc = - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + crypto_shash_update(ses->server->secmech.hmacmd5, (char *)server, 2 * len); kfree(server); if (rc) { @@ -498,7 +496,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -518,12 +516,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + offsetof(struct ntlmv2_resp, challenge.key[0])); - if (!ses->server->secmech.sdeschmacmd5) { + if (!ses->server->secmech.hmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", @@ -531,7 +529,7 @@ 
CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return rc; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; @@ -543,7 +541,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) else memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, ntlmv2->challenge.key, hash_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); @@ -551,7 +549,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) } /* Note that the MD5 digest over writes anon.challenge_key.key */ - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ntlmv2->ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -627,9 +625,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) cifs_server_lock(ses->server); - rc = cifs_alloc_hash("hmac(md5)", - &ses->server->secmech.hmacmd5, - &ses->server->secmech.sdeschmacmd5); + rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5); if (rc) { goto unlock; } @@ -649,7 +645,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } /* now calculate the session key for NTLMv2 */ - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", @@ -657,13 +653,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto unlock; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); goto unlock; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { @@ -671,7 +667,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto unlock; } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ses->auth_key.response); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -679,7 +675,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) unlock: cifs_server_unlock(ses->server); setup_ntlmv2_rsp_ret: - kfree(tiblob); + kfree_sensitive(tiblob); return rc; } @@ -718,49 +714,19 @@ calc_seckey(struct cifs_ses *ses) void cifs_crypto_secmech_release(struct TCP_Server_Info *server) { - if (server->secmech.cmacaes) { - crypto_free_shash(server->secmech.cmacaes); - server->secmech.cmacaes = NULL; - } - - if (server->secmech.hmacsha256) { - crypto_free_shash(server->secmech.hmacsha256); - server->secmech.hmacsha256 = NULL; - } - - if (server->secmech.md5) { - crypto_free_shash(server->secmech.md5); - server->secmech.md5 = NULL; - } + cifs_free_hash(&server->secmech.aes_cmac); + cifs_free_hash(&server->secmech.hmacsha256); + cifs_free_hash(&server->secmech.md5); + cifs_free_hash(&server->secmech.sha512); + cifs_free_hash(&server->secmech.hmacmd5); - if (server->secmech.sha512) { - 
crypto_free_shash(server->secmech.sha512); - server->secmech.sha512 = NULL; + if (server->secmech.enc) { + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; } - if (server->secmech.hmacmd5) { - crypto_free_shash(server->secmech.hmacmd5); - server->secmech.hmacmd5 = NULL; + if (server->secmech.dec) { + crypto_free_aead(server->secmech.dec); + server->secmech.dec = NULL; } - - if (server->secmech.ccmaesencrypt) { - crypto_free_aead(server->secmech.ccmaesencrypt); - server->secmech.ccmaesencrypt = NULL; - } - - if (server->secmech.ccmaesdecrypt) { - crypto_free_aead(server->secmech.ccmaesdecrypt); - server->secmech.ccmaesdecrypt = NULL; - } - - kfree(server->secmech.sdesccmacaes); - server->secmech.sdesccmacaes = NULL; - kfree(server->secmech.sdeschmacsha256); - server->secmech.sdeschmacsha256 = NULL; - kfree(server->secmech.sdeschmacmd5); - server->secmech.sdeschmacmd5 = NULL; - kfree(server->secmech.sdescmd5); - server->secmech.sdescmd5 = NULL; - kfree(server->secmech.sdescsha512); - server->secmech.sdescsha512 = NULL; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8042d7280dec..914cbb9de482 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -396,6 +396,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->epoch = 0; spin_lock_init(&cifs_inode->open_file_lock); generate_random_uuid(cifs_inode->lease_key); + cifs_inode->symlink_target = NULL; /* * Can not set i_flags here - they get immediately overwritten to zero @@ -412,7 +413,11 @@ cifs_alloc_inode(struct super_block *sb) static void cifs_free_inode(struct inode *inode) { - kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); + struct cifsInodeInfo *cinode = CIFS_I(inode); + + if (S_ISLNK(inode->i_mode)) + kfree(cinode->symlink_target); + kmem_cache_free(cifs_inode_cachep, cinode); } static void @@ -673,9 +678,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); - /* Only display max_credits if it was overridden on mount */ + /* Only display the following if overridden on mount */ if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE) seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits); + if (tcon->ses->server->tcp_nodelay) + seq_puts(s, ",tcpnodelay"); + if (tcon->ses->server->noautotune) + seq_puts(s, ",noautotune"); + if (tcon->ses->server->noblocksnd) + seq_puts(s, ",noblocksend"); if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); @@ -1128,6 +1139,8 @@ const struct inode_operations cifs_dir_inode_ops = { .symlink = cifs_symlink, .mknod = cifs_mknod, .listxattr = cifs_listxattr, + .get_acl = cifs_get_acl, + .set_acl = cifs_set_acl, }; const struct inode_operations cifs_file_inode_ops = { @@ -1136,8 +1149,34 @@ const struct inode_operations cifs_file_inode_ops = { .permission = cifs_permission, .listxattr = cifs_listxattr, .fiemap = cifs_fiemap, + .get_acl = cifs_get_acl, + .set_acl = cifs_set_acl, }; +const char *cifs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + char *target_path; + + target_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!target_path) + return ERR_PTR(-ENOMEM); + + spin_lock(&inode->i_lock); + if (likely(CIFS_I(inode)->symlink_target)) { + strscpy(target_path, CIFS_I(inode)->symlink_target, PATH_MAX); + } else { + kfree(target_path); + target_path = ERR_PTR(-EOPNOTSUPP); + } + spin_unlock(&inode->i_lock); + + if (!IS_ERR(target_path)) + set_delayed_call(done, kfree_link, target_path); + + return 
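
[Editor's note] One detail worth calling out in the new cifs_get_link() here: the PATH_MAX buffer is allocated with GFP_KERNEL before i_lock is taken, because an allocation that may sleep is not allowed under a spinlock; only the strscpy() copy runs inside the critical section, and the buffer is handed to kfree_link via set_delayed_call() so the VFS frees it once the caller is done with the target string.
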
target_path; +} + const struct inode_operations cifs_symlink_inode_ops = { .get_link = cifs_get_link, .permission = cifs_permission, @@ -1252,7 +1291,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, rc = filemap_write_and_wait_range(src_inode->i_mapping, off, off + len - 1); if (rc) - goto out; + goto unlock; /* should we flush first and last page first */ truncate_inode_pages(&target_inode->i_data, 0); @@ -1268,6 +1307,8 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * that target is updated on the server */ CIFS_I(target_inode)->time = 0; + +unlock: /* although unlocking in the reverse order from locking is not * strictly necessary here it is a little cleaner to be consistent */ @@ -1297,8 +1338,11 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, ssize_t rc; struct cifsFileInfo *cfile = dst_file->private_data; - if (cfile->swapfile) - return -EOPNOTSUPP; + if (cfile->swapfile) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff, len, flags); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 5b4a7a32bdc5..00a573e0ad0e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -105,8 +105,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); extern int cifs_flush(struct file *, fl_owner_t id); -extern int cifs_file_mmap(struct file * , struct vm_area_struct *); -extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma); +extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, struct dir_context *ctx); @@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 39 -#define CIFS_VERSION "2.39" +#define SMB3_PRODUCT_BUILD 40 +#define CIFS_VERSION "2.40" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ae7f571a7dba..82f2d3070c26 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -13,6 +13,8 @@ #include <linux/in6.h> #include <linux/inet.h> #include <linux/slab.h> +#include <linux/scatterlist.h> +#include <linux/mm.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/utsname.h> @@ -21,7 +23,6 @@ #include "cifs_fs_sb.h" #include "cifsacl.h" #include <crypto/internal/hash.h> -#include <linux/scatterlist.h> #include <uapi/linux/cifs/cifs_mount.h> #include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" @@ -153,26 +154,16 @@ struct session_key { char *response; }; -/* crypto security descriptor definition */ -struct sdesc { - struct shash_desc shash; - char ctx[]; -}; - /* crypto hashing related structure/fields, not specific to a sec mech */ struct cifs_secmech { - struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ - struct crypto_shash *md5; /* md5 hash function */ - struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ - struct crypto_shash *cmacaes; /* block-cipher based MAC function */ - struct crypto_shash *sha512; /* sha512 hash function */ - struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ - struct sdesc *sdescmd5; 
/* ctxt to generate cifs/smb signature */ - struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ - struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ - struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */ - struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */ - struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */ + struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */ + struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */ + struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */ + struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */ + struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */ + + struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */ + struct crypto_aead *dec; /* smb3 decryption AEAD TFM (AES-CCM and AES-GCM) */ }; /* per smb session structure/fields */ @@ -195,6 +186,19 @@ struct cifs_cred { struct cifs_ace *aces; }; +struct cifs_open_info_data { + char *symlink_target; + union { + struct smb2_file_all_info fi; + struct smb311_posix_qinfo posix_fi; + }; +}; + +static inline void cifs_free_open_info(struct cifs_open_info_data *data) +{ + kfree(data->symlink_target); +} + /* ***************************************************************** * Except the CIFS PDUs themselves all the @@ -317,20 +321,20 @@ struct smb_version_operations { int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); /* query path data from the server */ - int (*query_path_info)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - FILE_ALL_INFO *, bool *, bool *); + int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); /* query file data from the server */ - int (*query_file_info)(const unsigned int, struct cifs_tcon *, - struct cifs_fid *, FILE_ALL_INFO *); + int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data); /* query reparse tag from srv to determine which type of special file */ int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); /* get server index number */ - int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - u64 *uniqueid, FILE_ALL_INFO *); + int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid, + struct cifs_open_info_data *data); /* set size by path */ int (*set_path_size)(const unsigned int, struct cifs_tcon *, const char *, __u64, struct cifs_sb_info *, bool); @@ -379,8 +383,8 @@ struct smb_version_operations { struct cifs_sb_info *, const char *, char **, bool); /* open a file for non-posix mounts */ - int (*open)(const unsigned int, struct cifs_open_parms *, - __u32 *, FILE_ALL_INFO *); + int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf); /* set fid protocol-specific info */ void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); /* close a file */ @@ -451,7 +455,7 @@ struct smb_version_operations { int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *src_file, void 
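
[Editor's note] The new struct cifs_open_info_data above is caller-owned: zero-initialise it, let ->query_path_info() (or ->open()) fill it, and always call cifs_free_open_info() on the way out. A hypothetical caller sketch (example_lookup and its debug print are illustrative, not part of this diff) using the signature shown in the smb_version_operations hunk:

static int example_lookup(const unsigned int xid, struct cifs_tcon *tcon,
			  struct cifs_sb_info *cifs_sb, const char *full_path)
{
	struct cifs_open_info_data data = {};	/* symlink_target starts NULL */
	bool adjust_tz = false, reparse = false;
	int rc;

	rc = tcon->ses->server->ops->query_path_info(xid, tcon, cifs_sb,
						     full_path, &data,
						     &adjust_tz, &reparse);
	if (!rc && data.symlink_target)
		cifs_dbg(FYI, "%s is a symlink to %s\n", full_path,
			 data.symlink_target);

	/* safe on every path: kfree(NULL) inside the helper is a no-op */
	cifs_free_open_info(&data);
	return rc;
}

This pattern is why the dir.c and file.c hunks later in the diff can delete their kmalloc'd FILE_ALL_INFO buffers: the struct lives on the caller's stack and only the optional symlink target is heap-allocated.
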
__user *); int (*notify)(const unsigned int xid, struct file *pfile, - void __user *pbuf); + void __user *pbuf, bool return_changes); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -782,6 +786,7 @@ static inline unsigned int in_flight(struct TCP_Server_Info *server) { unsigned int num; + spin_lock(&server->req_lock); num = server->in_flight; spin_unlock(&server->req_lock); @@ -792,6 +797,7 @@ static inline bool has_credits(struct TCP_Server_Info *server, int *credits, int num_credits) { int num; + spin_lock(&server->req_lock); num = *credits; spin_unlock(&server->req_lock); @@ -1022,7 +1028,7 @@ struct cifs_ses { struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */ - unsigned overrideSecFlg; /* if non-zero override global sec flags */ + unsigned int overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ @@ -1133,6 +1139,7 @@ struct cifs_fattr { struct timespec64 cf_mtime; struct timespec64 cf_ctime; u32 cf_cifstag; + char *cf_symlink_target; }; /* @@ -1149,7 +1156,7 @@ struct cifs_tcon { struct list_head openFileList; spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ - char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ + char tree_name[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; char *password; /* for share-level security */ __u32 tid; /* The 4 byte tree id */ @@ -1228,7 +1235,7 @@ struct cifs_tcon { struct fscache_volume *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ - struct cached_fid *cfid; /* Cached root fid */ + struct cached_fids *cfids; /* BB add field for back pointer to sb struct(s)? */ #ifdef CONFIG_CIFS_DFS_UPCALL struct list_head ulist; /* cache update list */ @@ -1377,7 +1384,7 @@ struct cifsFileInfo { __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ struct list_head rlist; /* reconnect list */ - /* BB add lock scope info here if needed */ ; + /* BB add lock scope info here if needed */ /* lock scope id (0 if none) */ struct dentry *dentry; struct tcon_link *tlink; @@ -1395,6 +1402,7 @@ struct cifsFileInfo { struct work_struct put; /* work for the final part of _put */ struct delayed_work deferred; bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ + char *symlink_target; }; struct cifs_io_parms { @@ -1553,6 +1561,7 @@ struct cifsInodeInfo { struct list_head deferred_closes; /* list of deferred closes */ spinlock_t deferred_lock; /* protection on deferred list */ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. 
*/ + char *symlink_target; }; static inline struct cifsInodeInfo * @@ -1763,6 +1772,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, int number_of_items) { int i; + if ((number_of_items == 0) || (param == NULL)) return; for (i = 0; i < number_of_items; i++) { @@ -2121,4 +2131,80 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses) return sizeof(ses->workstation_name); } +static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src) +{ + memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src)); + dst->AccessFlags = src->AccessFlags; + dst->CurrentByteOffset = src->CurrentByteOffset; + dst->Mode = src->Mode; + dst->AlignmentRequirement = src->AlignmentRequirement; + dst->FileNameLength = src->FileNameLength; +} + +static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, + int num_rqst, + const u8 *sig) +{ + unsigned int len, skip; + unsigned int nents = 0; + unsigned long addr; + int i, j; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ + for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ + for (j = 0; j < rqst[i].rq_nvec; j++) { + struct kvec *iov = &rqst[i].rq_iov[j]; + + skip = (i == 0) && (j == 0) ? 20 : 0; + addr = (unsigned long)iov->iov_base + skip; + if (unlikely(is_vmalloc_addr((void *)addr))) { + len = iov->iov_len - skip; + nents += DIV_ROUND_UP(offset_in_page(addr) + len, + PAGE_SIZE); + } else { + nents++; + } + } + nents += rqst[i].rq_npages; + } + nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); + return nents; +} + +/* We can not use the normal sg_set_buf() as we will sometimes pass a + * stack object as buf. 
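
[Editor's note] To make the vmalloc branch in cifs_get_num_sgs() above (and the page walk in cifs_sg_set_buf() just below) concrete, here is the arithmetic under an assumed PAGE_SIZE of 4096 and made-up buffer numbers:

/*
 * A vmalloc'd kvec of len 9000 whose iov_base sits 100 bytes into a page:
 *
 *	nents = DIV_ROUND_UP(offset_in_page(addr) + len, PAGE_SIZE)
 *	      = DIV_ROUND_UP(100 + 9000, 4096)
 *	      = DIV_ROUND_UP(9100, 4096) = 3
 *
 * Three scatterlist entries, because vmalloc memory is only virtually
 * contiguous and each of the three pages touched may live anywhere in
 * physical memory.  A kmalloc'd buffer of the same size is physically
 * contiguous, so the else branch counts a single entry for it.
 */
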
+ */ +static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, + const void *buf, + unsigned int buflen) +{ + unsigned long addr = (unsigned long)buf; + unsigned int off = offset_in_page(addr); + + addr &= PAGE_MASK; + if (unlikely(is_vmalloc_addr((void *)addr))) { + do { + unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); + + sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off); + + off = 0; + addr += PAGE_SIZE; + buflen -= len; + } while (buflen); + } else { + sg_set_page(sg++, virt_to_page(addr), buflen, off); + } + return sg; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index aeba371c4c70..623caece2b10 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -483,7 +483,7 @@ put_bcc(__u16 count, struct smb_hdr *hdr) typedef struct negotiate_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; - unsigned char DialectsArray[1]; + unsigned char DialectsArray[]; } __attribute__((packed)) NEGOTIATE_REQ; #define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */ @@ -508,13 +508,14 @@ typedef struct negotiate_rsp { __u8 EncryptionKeyLength; __u16 ByteCount; union { - unsigned char EncryptionKey[1]; /* cap extended security off */ + /* cap extended security off */ + DECLARE_FLEX_ARRAY(unsigned char, EncryptionKey); /* followed by Domain name - if extended security is off */ /* followed by 16 bytes of server GUID */ /* then security blob if cap_extended_security negotiated */ struct { unsigned char GUID[SMB1_CLIENT_GUID_SIZE]; - unsigned char SecurityBlob[1]; + unsigned char SecurityBlob[]; } __attribute__((packed)) extended_response; } __attribute__((packed)) u; } __attribute__((packed)) NEGOTIATE_RSP; @@ -1428,7 +1429,7 @@ typedef struct smb_com_transaction_change_notify_req { __u8 WatchTree; /* 1 = Monitor subdirectories */ __u8 Reserved2; __le16 ByteCount; -/* __u8 Pad[3];*/ +/* __u8 Pad[3];*/ /* __u8 Data[1];*/ } __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; @@ -1751,8 +1752,7 @@ struct smb_com_transaction2_sfi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ } __attribute__((packed)); struct smb_t2_qfi_req { @@ -1767,8 +1767,7 @@ struct smb_t2_qfi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ } __attribute__((packed)); /* @@ -2145,13 +2144,11 @@ typedef struct { #define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based calls including posix open and posix unlink */ -#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up - to 0xFFFF00 */ +#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up to 0xFFFF00 */ #define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 #define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ #define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ -#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and - QFS PROXY call */ +#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX /* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send LockingX instead of posix locking call on unix sess 
(and we do not expect @@ -2367,8 +2364,7 @@ typedef struct { struct file_allocation_info { __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ -} __attribute__((packed)); /* size used on disk, for level 0x103 for set, - 0x105 for query */ +} __packed; /* size used on disk, for level 0x103 for set, 0x105 for query */ struct file_end_of_file_info { __le64 FileSize; /* offset to end of file */ @@ -2408,8 +2404,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ __le16 access_entry_count; /* access ACL - count of entries */ __le16 default_entry_count; /* default ACL - count of entries */ struct cifs_posix_ace ace_array[]; - /* followed by - struct cifs_posix_ace default_ace_arraay[] */ + /* followed by struct cifs_posix_ace default_ace_array[] */ } __attribute__((packed)); /* level 0x204 */ /* types of access control entries already defined in posix_acl.h */ @@ -2428,17 +2423,17 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ /* end of POSIX ACL definitions */ /* POSIX Open Flags */ -#define SMB_O_RDONLY 0x1 -#define SMB_O_WRONLY 0x2 -#define SMB_O_RDWR 0x4 -#define SMB_O_CREAT 0x10 -#define SMB_O_EXCL 0x20 -#define SMB_O_TRUNC 0x40 -#define SMB_O_APPEND 0x80 -#define SMB_O_SYNC 0x100 -#define SMB_O_DIRECTORY 0x200 -#define SMB_O_NOFOLLOW 0x400 -#define SMB_O_DIRECT 0x800 +#define SMB_O_RDONLY 0x1 +#define SMB_O_WRONLY 0x2 +#define SMB_O_RDWR 0x4 +#define SMB_O_CREAT 0x10 +#define SMB_O_EXCL 0x20 +#define SMB_O_TRUNC 0x40 +#define SMB_O_APPEND 0x80 +#define SMB_O_SYNC 0x100 +#define SMB_O_DIRECTORY 0x200 +#define SMB_O_NOFOLLOW 0x400 +#define SMB_O_DIRECT 0x800 typedef struct { __le32 OpenFlags; /* same as NT CreateX */ @@ -2715,15 +2710,13 @@ typedef struct file_xattr_info { __u32 xattr_value_len; char xattr_name[]; /* followed by xattr_value[xattr_value_len], no pad */ -} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info - level 0x205 */ +} __packed FILE_XATTR_INFO; /* extended attribute info level 0x205 */ /* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */ typedef struct file_chattr_info { __le64 mask; /* list of all possible attribute bits */ __le64 mode; /* list of actual attribute bits on this inode */ -} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes - (chattr, chflags) level 0x206 */ -#endif /* POSIX */ +} __packed FILE_CHATTR_INFO; /* ext attributes (chattr, chflags) level 0x206 */ +#endif /* POSIX */ #endif /* _CIFSPDU_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3bc94bcc7177..7d4b37eeec98 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -124,7 +124,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec * /* resp vec */); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, - struct smb_hdr *in_buf , + struct smb_hdr *in_buf, struct smb_hdr *out_buf, int *bytes_returned); void @@ -182,10 +182,9 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile, extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); extern void cifs_down_write(struct rw_semaphore *sem); -extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, - struct file *file, - struct tcon_link *tlink, - __u32 oplock); +struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock, + const char *symlink_target); extern int cifs_posix_open(const char *full_path, struct inode **inode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *oplock, 
__u16 *netfid, @@ -200,9 +199,9 @@ extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); extern struct inode *cifs_iget(struct super_block *sb, struct cifs_fattr *fattr); -extern int cifs_get_inode_info(struct inode **inode, const char *full_path, - FILE_ALL_INFO *data, struct super_block *sb, - int xid, const struct cifs_fid *fid); +int cifs_get_inode_info(struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct super_block *sb, int xid, + const struct cifs_fid *fid); extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, @@ -225,6 +224,10 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *, u32); extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); +extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type); +extern int cifs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct posix_acl *acl, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); extern unsigned int setup_authusers_ACE(struct cifs_ace *pace); @@ -538,14 +541,14 @@ extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, struct cifs_ntsd *, __u32, int); -extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *acl_inf, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap_special_chars); -extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *fileName, - const char *local_acl, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + struct posix_acl **acl, const int acl_type, + const struct nls_table *nls_codepage, int remap); +extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const struct posix_acl *acl, const int acl_type, + const struct nls_table *nls_codepage, int remap); extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); #endif /* CIFS_ALLOW_INSECURE_LEGACY */ @@ -598,12 +601,11 @@ struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); -int cifs_alloc_hash(const char *name, struct crypto_shash **shash, - struct sdesc **sdesc); -void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); +int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); +void cifs_free_hash(struct shash_desc **sdesc); -extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset); +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); 
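
[Editor's note] The cifs_get_acl()/cifs_set_acl() prototypes above are the ->get_acl/->set_acl inode-operation entry points wired up in the cifsfs.c hunk earlier; their bodies are not part of this excerpt. A hedged sketch of how the get side can route into cifs_do_get_acl() (the helper choice and error handling here are assumptions, not the in-tree code):

struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns,
			       struct dentry *dentry, int type)
{
	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
	struct posix_acl *acl = NULL;
	struct tcon_link *tlink;
	const char *full_path;
	unsigned int xid;
	void *page;
	int rc;

	tlink = cifs_sb_tlink(cifs_sb);
	if (IS_ERR(tlink))
		return ERR_CAST(tlink);

	xid = get_xid();
	page = alloc_dentry_path();
	full_path = build_path_from_dentry(dentry, page);
	if (IS_ERR(full_path)) {
		acl = ERR_CAST(full_path);
		goto out;
	}

	/* hand back a refcounted struct posix_acl, or an ERR_PTR */
	rc = cifs_do_get_acl(xid, tlink_tcon(tlink), full_path, &acl, type,
			     cifs_sb->local_nls, cifs_remap(cifs_sb));
	if (rc)
		acl = ERR_PTR(rc);
out:
	free_dentry_path(page);
	free_xid(xid);
	cifs_put_tlink(tlink);
	return acl;
}

Note the payoff visible in the cifssmb.c hunks below: because the converters now speak struct posix_acl natively, no intermediate uapi xattr buffer is marshalled on either side.
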
@@ -639,7 +641,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses, int cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); int -SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon); +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7aa91e272027..23f10e0d6e7e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -465,7 +465,7 @@ CIFSSMBNegotiate(const unsigned int xid, for (i = 0; i < CIFS_NUM_PROT; i++) { size_t len = strlen(protocols[i].name) + 1; - memcpy(pSMB->DialectsArray+count, protocols[i].name, len); + memcpy(&pSMB->DialectsArray[count], protocols[i].name, len); count += len; } inc_rfc1001_len(pSMB, count); @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, remap); } rename_info->target_name_len = cpu_to_le32(2 * len_of_str); - count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); + count = sizeof(struct set_file_rename) + (2 * len_of_str); byte_count += count; pSMB->DataCount = cpu_to_le16(count); pSMB->TotalDataCount = pSMB->DataCount; @@ -2914,32 +2914,57 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, #ifdef CONFIG_CIFS_POSIX -/*Convert an Access Control Entry from wire format to local POSIX xattr format*/ -static void cifs_convert_ace(struct posix_acl_xattr_entry *ace, - struct cifs_posix_ace *cifs_ace) +#ifdef CONFIG_FS_POSIX_ACL +/** + * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format + * @ace: POSIX ACL entry to store converted ACL into + * @cifs_ace: ACL in cifs format + * + * Convert an Access Control Entry from wire format to local POSIX xattr + * format. + * + * Note that the @cifs_uid member is used to store both {g,u}id_t. + */ +static void cifs_init_posix_acl(struct posix_acl_entry *ace, + struct cifs_posix_ace *cifs_ace) { /* u8 cifs fields do not need le conversion */ - ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); - ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); - ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); -/* - cifs_dbg(FYI, "perm %d tag %d id %d\n", - ace->e_perm, ace->e_tag, ace->e_id); -*/ + ace->e_perm = cifs_ace->cifs_e_perm; + ace->e_tag = cifs_ace->cifs_e_tag; + switch (ace->e_tag) { + case ACL_USER: + ace->e_uid = make_kuid(&init_user_ns, + le64_to_cpu(cifs_ace->cifs_uid)); + break; + case ACL_GROUP: + ace->e_gid = make_kgid(&init_user_ns, + le64_to_cpu(cifs_ace->cifs_uid)); + break; + } return; } -/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */ -static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, - const int acl_type, const int size_of_data_area) +/** + * cifs_to_posix_acl - copy cifs ACL format to POSIX ACL format + * @acl: ACLs returned in POSIX ACL format + * @src: ACLs in cifs format + * @acl_type: type of POSIX ACL requested + * @size_of_data_area: size of SMB we got + * + * This function converts ACLs from cifs format to POSIX ACL format. + * If @acl is NULL then the size of the buffer required to store POSIX ACLs in + * their uapi format is returned. 
+ */ +static int cifs_to_posix_acl(struct posix_acl **acl, char *src, + const int acl_type, const int size_of_data_area) { int size = 0; - int i; __u16 count; struct cifs_posix_ace *pACE; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; - struct posix_acl_xattr_header *local_acl = (void *)trgt; + struct posix_acl *kacl = NULL; + struct posix_acl_entry *pa, *pe; if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) return -EOPNOTSUPP; @@ -2959,7 +2984,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, count = le16_to_cpu(cifs_acl->access_entry_count); size = sizeof(struct cifs_posix_acl); size += sizeof(struct cifs_posix_ace) * count; -/* skip past access ACEs to get to default ACEs */ + /* skip past access ACEs to get to default ACEs */ pACE = &cifs_acl->ace_array[count]; count = le16_to_cpu(cifs_acl->default_entry_count); size += sizeof(struct cifs_posix_ace) * count; @@ -2971,62 +2996,75 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, return -EINVAL; } - size = posix_acl_xattr_size(count); - if ((buflen == 0) || (local_acl == NULL)) { - /* used to query ACL EA size */ - } else if (size > buflen) { - return -ERANGE; - } else /* buffer big enough */ { - struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); - - local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - for (i = 0; i < count ; i++) { - cifs_convert_ace(&ace[i], pACE); - pACE++; - } + /* Allocate number of POSIX ACLs to store in VFS format. */ + kacl = posix_acl_alloc(count, GFP_NOFS); + if (!kacl) + return -ENOMEM; + + FOREACH_ACL_ENTRY(pa, kacl, pe) { + cifs_init_posix_acl(pa, pACE); + pACE++; } - return size; + + *acl = kacl; + return 0; } -static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, - const struct posix_acl_xattr_entry *local_ace) +/** + * cifs_init_ace - convert ACL entry from POSIX ACL to cifs format + * @cifs_ace: the cifs ACL entry to store into + * @local_ace: the POSIX ACL entry to convert + */ +static void cifs_init_ace(struct cifs_posix_ace *cifs_ace, + const struct posix_acl_entry *local_ace) { - cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm); - cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag); - /* BB is there a better way to handle the large uid? 
*/ - if (local_ace->e_id == cpu_to_le32(-1)) { - /* Probably no need to le convert -1 on any arch but can not hurt */ + cifs_ace->cifs_e_perm = local_ace->e_perm; + cifs_ace->cifs_e_tag = local_ace->e_tag; + + switch (local_ace->e_tag) { + case ACL_USER: + cifs_ace->cifs_uid = + cpu_to_le64(from_kuid(&init_user_ns, local_ace->e_uid)); + break; + case ACL_GROUP: + cifs_ace->cifs_uid = + cpu_to_le64(from_kgid(&init_user_ns, local_ace->e_gid)); + break; + default: cifs_ace->cifs_uid = cpu_to_le64(-1); - } else - cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); -/* - cifs_dbg(FYI, "perm %d tag %d id %d\n", - ace->e_perm, ace->e_tag, ace->e_id); -*/ + } } -/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */ -static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, - const int buflen, const int acl_type) +/** + * posix_acl_to_cifs - convert ACLs from POSIX ACL to cifs format + * @parm_data: ACLs in cifs format to convert to + * @acl: ACLs in POSIX ACL format to convert from + * @acl_type: the type of POSIX ACLs stored in @acl + * + * Return: the number of cifs ACL entries after conversion + */ +static __u16 posix_acl_to_cifs(char *parm_data, const struct posix_acl *acl, + const int acl_type) { __u16 rc = 0; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; - struct posix_acl_xattr_header *local_acl = (void *)pACL; - struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); + const struct posix_acl_entry *pa, *pe; int count; - int i; + int i = 0; - if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL)) + if ((acl == NULL) || (cifs_acl == NULL)) return 0; - count = posix_acl_xattr_count((size_t)buflen); - cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n", - count, buflen, le32_to_cpu(local_acl->a_version)); - if (le32_to_cpu(local_acl->a_version) != 2) { - cifs_dbg(FYI, "unknown POSIX ACL version %d\n", - le32_to_cpu(local_acl->a_version)); - return 0; - } + count = acl->a_count; + cifs_dbg(FYI, "setting acl with %d entries\n", count); + + /* + * Note that the uapi POSIX ACL version is verified by the VFS and is + * independent of the cifs ACL version. Changing the POSIX ACL version + * is a uapi change and if it's changed we will pass down the POSIX ACL + * version in struct posix_acl from the VFS. For now there's really + * only one that all filesystems know how to deal with. 
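
[Editor's note] Both converters now iterate struct posix_acl directly instead of the uapi xattr layout. For readers unfamiliar with the VFS-side API, a tiny self-contained illustration of the allocate-and-walk pattern used above (the two-entry ACL is made up):

static struct posix_acl *example_two_entry_acl(void)
{
	struct posix_acl *acl;
	struct posix_acl_entry *pa, *pe;

	/* posix_acl_alloc() sets a_count and takes one reference */
	acl = posix_acl_alloc(2, GFP_NOFS);
	if (!acl)
		return NULL;

	acl->a_entries[0].e_tag = ACL_USER;
	acl->a_entries[0].e_perm = ACL_READ | ACL_WRITE;
	acl->a_entries[0].e_uid = make_kuid(&init_user_ns, 1000);

	acl->a_entries[1].e_tag = ACL_OTHER;
	acl->a_entries[1].e_perm = ACL_READ;

	/* FOREACH_ACL_ENTRY(pa, acl, pe) visits each entry in order */
	FOREACH_ACL_ENTRY(pa, acl, pe)
		pr_debug("tag %d perm %x\n", pa->e_tag, pa->e_perm);

	return acl;
}
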
+ */ cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); @@ -3038,8 +3076,9 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; } - for (i = 0; i < count; i++) - convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]); + FOREACH_ACL_ENTRY(pa, acl, pe) { + cifs_init_ace(&cifs_acl->ace_array[i++], pa); + } if (rc == 0) { rc = (__u16)(count * sizeof(struct cifs_posix_ace)); rc += sizeof(struct cifs_posix_acl); @@ -3048,11 +3087,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, return rc; } -int -CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *acl_inf, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap) +int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, struct posix_acl **acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) { /* SMB_QUERY_POSIX_ACL */ TRANSACTION2_QPI_REQ *pSMB = NULL; @@ -3124,23 +3162,26 @@ queryAclRetry: else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); __u16 count = le16_to_cpu(pSMBr->t2.DataCount); - rc = cifs_copy_posix_acl(acl_inf, + rc = cifs_to_posix_acl(acl, (char *)&pSMBr->hdr.Protocol+data_offset, - buflen, acl_type, count); + acl_type, count); } } cifs_buf_release(pSMB); + /* + * The else branch after SendReceive() doesn't return EAGAIN so if we + * allocated @acl in cifs_to_posix_acl() we are guaranteed to return + * here and don't leak POSIX ACLs. + */ if (rc == -EAGAIN) goto queryAclRetry; return rc; } -int -CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *fileName, - const char *local_acl, const int buflen, - const int acl_type, - const struct nls_table *nls_codepage, int remap) +int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, const struct posix_acl *acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) { struct smb_com_transaction2_spi_req *pSMB = NULL; struct smb_com_transaction2_spi_rsp *pSMBr = NULL; @@ -3181,7 +3222,7 @@ setAclRetry: pSMB->ParameterOffset = cpu_to_le16(param_offset); /* convert to on the wire format for POSIX ACL */ - data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type); + data_count = posix_acl_to_cifs(parm_data, acl, acl_type); if (data_count == 0) { rc = -EOPNOTSUPP; @@ -3211,6 +3252,23 @@ setACLerrorExit: goto setAclRetry; return rc; } +#else +int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, struct posix_acl **acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + return -EOPNOTSUPP; +} + +int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, const struct posix_acl *acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_FS_POSIX_ACL */ int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7ae6f2c08153..7bc7b5e03c51 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -155,7 +155,7 @@ static void smb2_query_server_interfaces(struct work_struct *work) /* * query server network interfaces, in case they change */ - rc = SMB3_request_interfaces(0, tcon); + rc = 
SMB3_request_interfaces(0, tcon, false); if (rc) { cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); @@ -311,7 +311,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) } server->sequence_number = 0; server->session_estab = false; - kfree(server->session_key.response); + kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; server->lstrp = jiffies; @@ -759,7 +759,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg = {}; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; - iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); + iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -774,7 +774,7 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) * and cifs_readv_from_socket sets msg_control and msg_controllen * so little to initialize in struct msghdr */ - iov_iter_discard(&smb_msg.msg_iter, READ, to_read); + iov_iter_discard(&smb_msg.msg_iter, ITER_DEST, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -786,7 +786,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, struct msghdr smb_msg = {}; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; - iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); + iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -1580,10 +1580,11 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) cifs_crypto_secmech_release(server); - kfree(server->session_key.response); + kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; kfree(server->hostname); + server->hostname = NULL; task = xchg(&server->tsk, NULL); if (task) @@ -1940,7 +1941,8 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&ses->ses_lock); cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); - cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE"); + cifs_dbg(FYI, + "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? 
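
[Editor's note] The READ to ITER_DEST switches in the connect.c hunks above (and the matching ITER_SOURCE/ITER_DEST changes in file.c further down) track the kernel-wide iov_iter rename: the direction argument now states whether the iterator is the destination of the data (ITER_DEST, e.g. reading from the socket) or its source (ITER_SOURCE, e.g. writing), rather than overloading the READ/WRITE constants. The new names alias the old values, so behaviour is unchanged.
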
ses->tcon_ipc->tree_name : "NONE"); spin_lock(&cifs_tcp_ses_lock); if (--ses->ses_count > 0) { @@ -2293,7 +2295,7 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { if (tcon->status == TID_EXITING) return 0; - if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE)) + if (strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE)) return 0; if (tcon->seal != ctx->seal) return 0; @@ -2831,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet *ses_init_buf; + unsigned int req_noscope_len; struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), GFP_KERNEL); + if (ses_init_buf) { ses_init_buf->trailer.session_req.called_len = 32; @@ -2869,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) ses_init_buf->trailer.session_req.scope2 = 0; smb_buf = (struct smb_hdr *)ses_init_buf; - /* sizeof RFC1002_SESSION_REQUEST with no scope */ - smb_buf->smb_buf_length = cpu_to_be32(0x81000044); + /* sizeof RFC1002_SESSION_REQUEST with no scopes */ + req_noscope_len = sizeof(struct rfc1002_session_packet) - 2; + + /* == cpu_to_be32(0x81000044) */ + smb_buf->smb_buf_length = + cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len); rc = smb_send(server, smb_buf, 0x44); kfree(ses_init_buf); /* @@ -2935,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server) cifs_dbg(FYI, "Socket created\n"); server->ssocket = socket; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_use_task_frag = false; if (sfamily == AF_INET6) cifs_reclassify_socket6(socket); else @@ -3846,9 +3856,13 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) uuid_copy(&cifs_sb->dfs_mount_id, &mnt_ctx.mount_id); out: - free_xid(mnt_ctx.xid); cifs_try_adding_channels(cifs_sb, mnt_ctx.ses); - return mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); + rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); + if (rc) + goto error; + + free_xid(mnt_ctx.xid); + return rc; error: dfs_cache_put_refsrv_sessions(&mnt_ctx.mount_id); @@ -3875,8 +3889,12 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; } + rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); + if (rc) + goto error; + free_xid(mnt_ctx.xid); - return mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); + return rc; error: mount_put_conns(&mnt_ctx); @@ -3921,12 +3939,11 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) { - pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ - *bcc_ptr = 0; /* password is null byte */ - bcc_ptr++; /* skip password */ - /* already aligned so no need to do it below */ - } + + pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ + *bcc_ptr = 0; /* password is null byte */ + bcc_ptr++; /* skip password */ + /* already aligned so no need to do it below */ if (ses->server->sign) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; @@ -3989,7 +4006,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, } bcc_ptr += length + 1; bytes_left -= (length + 1); - strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name)); /* mostly informational -- no need to fail on error here */ kfree(tcon->nativeFileSystem); @@ -4134,7 +4151,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if 
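
[Editor's note] The new socket->sk->sk_use_task_frag = false in generic_ip_connect() above opts this kernel socket out of the sending task's page_frag cache; that cache is only safe when a single task transmits on the socket, which does not hold for cifs, where user tasks and the reconnect worker can both send. This reading is based on the stated purpose of the sk_use_task_frag field rather than on anything else in this diff.
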
(ses->auth_key.response) { cifs_dbg(FYI, "Free previous auth_key.response = %p\n", ses->auth_key.response); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; ses->auth_key.len = 0; } @@ -4197,7 +4214,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->local_nls = cifs_sb->local_nls; ctx->linux_uid = fsuid; ctx->cred_uid = fsuid; - ctx->UNC = master_tcon->treeName; + ctx->UNC = master_tcon->tree_name; ctx->retry = master_tcon->retry; ctx->nocase = master_tcon->nocase; ctx->nohandlecache = master_tcon->nohandlecache; @@ -4663,7 +4680,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */ if (!server->current_fullpath || dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) { - rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, cifs_sb->local_nls); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls); goto out; } @@ -4707,7 +4724,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); - rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, nlsc); if (rc) { spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index a9b6c3eba6de..e70915ad7541 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -98,7 +98,7 @@ static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const c get_ipc_unc(path, unc, sizeof(unc)); for (; *ses; ses++) { - if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName)) + if (!strcasecmp(unc, (*ses)->tcon_ipc->tree_name)) return *ses; } return ERR_PTR(-ENOENT); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 08f7392716e2..8b1c37158556 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -50,7 +50,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s } if (add_treename) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); else dfsplen = 0; @@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s return full_path; if (dfsplen) - memcpy(full_path, tcon->treeName, dfsplen); + memcpy(full_path, tcon->tree_name, dfsplen); full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); memcpy(full_path + dfsplen + 1, ctx->prepath, pplen); convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); @@ -93,7 +93,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, return ERR_PTR(-ENOMEM); if (prefix) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); else dfsplen = 0; @@ -123,7 +123,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, } if (dfsplen) { s -= dfsplen; - memcpy(s, tcon->treeName, dfsplen); + memcpy(s, tcon->tree_name, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { @@ -165,10 +165,9 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon) /* Inode operations in similar order to how they appear in Linux file fs.h */ -static int -cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, - struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, struct cifs_fid *fid) 
+static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, + struct tcon_link *tlink, unsigned int oflags, umode_t mode, __u32 *oplock, + struct cifs_fid *fid, struct cifs_open_info_data *buf) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; @@ -177,7 +176,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct cifs_tcon *tcon = tlink_tcon(tlink); const char *full_path; void *page = alloc_dentry_path(); - FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition; struct TCP_Server_Info *server = tcon->ses->server; @@ -290,12 +288,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, goto out; } - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - /* * if we're not using unix extensions, see if we need to set * ATTR_READONLY on the create call @@ -364,8 +356,7 @@ cifs_create_get_file_info: { #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* TODO: Add support for calling POSIX query info here, but passing in fid */ - rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, - xid, fid); + rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, xid, fid); if (newinode) { if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); @@ -402,7 +393,6 @@ cifs_create_set_dentry: d_add(direntry, newinode); out: - kfree(buf); free_dentry_path(page); return rc; @@ -423,10 +413,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; __u32 oplock; struct cifsFileInfo *file_info; + struct cifs_open_info_data buf = {}; if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) return -EIO; @@ -484,8 +475,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid); - + &oplock, &fid, &buf); if (rc) { cifs_del_pending_open(&open); goto out; @@ -510,7 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, file->f_op = &cifs_file_direct_ops; } - file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock, buf.symlink_target); if (file_info == NULL) { if (server->ops->close) server->ops->close(xid, tcon, &fid); @@ -526,6 +516,7 @@ out: cifs_put_tlink(tlink); out_free_xid: free_xid(xid); + cifs_free_open_info(&buf); return rc; } @@ -547,12 +538,15 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, struct TCP_Server_Info *server; struct cifs_fid fid; __u32 oplock; + struct cifs_open_info_data buf = {}; cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); - if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) - return -EIO; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) { + rc = -EIO; + goto out_free_xid; + } tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); @@ -565,11 +559,11 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, if (server->ops->new_lease_key) server->ops->new_lease_key(&fid); - rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid); + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, &oplock, &fid, &buf); if (!rc && server->ops->close) 
server->ops->close(xid, tcon, &fid); + cifs_free_open_info(&buf); cifs_put_tlink(tlink); out_free_xid: free_xid(xid); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6f38b134a346..22dfc1f8b4f1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -209,16 +209,14 @@ posix_open_ret: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -static int -cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, - struct cifs_fid *fid, unsigned int xid) +static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, + struct cifs_fid *fid, unsigned int xid, struct cifs_open_info_data *buf) { int rc; int desired_access; int disposition; int create_options = CREATE_NOT_DIR; - FILE_ALL_INFO *buf; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; @@ -255,10 +253,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci /* BB pass O_SYNC flag through on file attributes .. BB */ - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!buf) - return -ENOMEM; - /* O_SYNC also has bit for O_DSYNC so following check picks up either */ if (f_flags & O_SYNC) create_options |= CREATE_WRITE_THROUGH; @@ -276,9 +270,8 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci oparms.reconnect = false; rc = server->ops->open(xid, &oparms, oplock, buf); - if (rc) - goto out; + return rc; /* TODO: Add support for calling posix query info but with passing in fid */ if (tcon->unix_ext) @@ -294,8 +287,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci rc = -EOPENSTALE; } -out: - kfree(buf); return rc; } @@ -325,9 +316,9 @@ cifs_down_write(struct rw_semaphore *sem) static void cifsFileInfo_put_work(struct work_struct *work); -struct cifsFileInfo * -cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, - struct tcon_link *tlink, __u32 oplock) +struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock, + const char *symlink_target) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); @@ -347,6 +338,15 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, return NULL; } + if (symlink_target) { + cfile->symlink_target = kstrdup(symlink_target, GFP_KERNEL); + if (!cfile->symlink_target) { + kfree(fdlocks); + kfree(cfile); + return NULL; + } + } + INIT_LIST_HEAD(&fdlocks->locks); fdlocks->cfile = cfile; cfile->llist = fdlocks; @@ -440,6 +440,7 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); cifs_sb_deactive(sb); + kfree(cifs_file->symlink_target); kfree(cifs_file); } @@ -488,7 +489,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, struct cifsInodeInfo *cifsi = CIFS_I(inode); struct super_block *sb = inode->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; bool oplock_break_cancelled; @@ -570,8 +571,9 @@ int cifs_open(struct inode *inode, struct file *file) void *page; const char *full_path; bool posix_open_ok = false; - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; + struct cifs_open_info_data data = {}; xid = get_xid(); @@ -662,15 +664,15 @@ int cifs_open(struct inode *inode, struct file *file) if 
(server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); - rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, - file->f_flags, &oplock, &fid, xid); + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, file->f_flags, &oplock, &fid, + xid, &data); if (rc) { cifs_del_pending_open(&open); goto out; } } - cfile = cifs_new_fileinfo(&fid, file, tlink, oplock); + cfile = cifs_new_fileinfo(&fid, file, tlink, oplock, data.symlink_target); if (cfile == NULL) { if (server->ops->close) server->ops->close(xid, tcon, &fid); @@ -712,6 +714,7 @@ out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); + cifs_free_open_info(&data); return rc; } @@ -1410,7 +1413,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) struct inode *inode = d_inode(cfile->dentry); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); unsigned int count = 0, i; int rc = 0, xid, type; struct list_head locks_to_send, *el; @@ -1882,11 +1885,13 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) struct cifsFileInfo *cfile; __u32 type; - rc = -EACCES; xid = get_xid(); - if (!(fl->fl_flags & FL_FLOCK)) - return -ENOLCK; + if (!(fl->fl_flags & FL_FLOCK)) { + rc = -ENOLCK; + free_xid(xid); + return rc; + } cfile = (struct cifsFileInfo *)file->private_data; tcon = tlink_tcon(cfile->tlink); @@ -1905,8 +1910,9 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) * if no lock or unlock then nothing to do since we do not * know what it is */ + rc = -EOPNOTSUPP; free_xid(xid); - return -EOPNOTSUPP; + return rc; } rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock, @@ -2428,12 +2434,16 @@ cifs_writev_complete(struct work_struct *work) struct cifs_writedata * cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { + struct cifs_writedata *writedata = NULL; struct page **pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (pages) - return cifs_writedata_direct_alloc(pages, complete); + if (pages) { + writedata = cifs_writedata_direct_alloc(pages, complete); + if (!writedata) + kvfree(pages); + } - return NULL; + return writedata; } struct cifs_writedata * @@ -2636,6 +2646,21 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, return rc; } +static int +cifs_writepage_locked(struct page *page, struct writeback_control *wbc); + +static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret; + + ret = cifs_writepage_locked(page, wbc); + unlock_page(page); + mapping_set_error(mapping, ret); + return ret; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2652,10 +2677,11 @@ static int cifs_writepages(struct address_space *mapping, /* * If wsize is smaller than the page cache size, default to writing - * one page at a time via cifs_writepage + * one page at a time. 
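A note on the writeback conversion in fs/cifs/file.c above: with ->writepage dropped from the address_space operations, all writeback now funnels through ->writepages, and the small-wsize fallback drives write_cache_pages() with the new cifs_write_one_page() callback instead of generic_writepages(). A minimal sketch of that callback pattern, for orientation only (the demo_* names are hypothetical, not from the patch):

#include <linux/pagemap.h>
#include <linux/writeback.h>

/* write_cache_pages() hands each dirty, locked page to this callback;
 * the callback owns unlocking and error accounting. */
static int demo_write_one_page(struct page *page,
			       struct writeback_control *wbc, void *data)
{
	struct address_space *mapping = data;
	int ret;

	ret = demo_writepage_locked(page, wbc);	/* hypothetical per-page writeout */
	unlock_page(page);
	mapping_set_error(mapping, ret);	/* remember errors for a later fsync() */
	return ret;
}

static int demo_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	return write_cache_pages(mapping, wbc, demo_write_one_page, mapping);
}

One practical consequence, assuming this follows the wider kernel effort to retire ->writepage: without that method, memory reclaim can no longer push single pages into the filesystem, which appears to be exactly the behaviour these conversions remove.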
*/ if (cifs_sb->ctx->wsize < PAGE_SIZE) - return generic_writepages(mapping, wbc); + return write_cache_pages(mapping, wbc, cifs_write_one_page, + mapping); xid = get_xid(); if (wbc->range_cyclic) { @@ -2842,13 +2868,6 @@ retry_write: return rc; } -static int cifs_writepage(struct page *page, struct writeback_control *wbc) -{ - int rc = cifs_writepage_locked(page, wbc); - unlock_page(page); - return rc; -} - static int cifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) @@ -3293,6 +3312,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, cifs_uncached_writev_complete); if (!wdata) { rc = -ENOMEM; + for (i = 0; i < nr_pages; i++) + put_page(pagevec[i]); + kvfree(pagevec); add_credits_and_wake_if(server, credits, 0); break; } @@ -3519,7 +3541,7 @@ static ssize_t __cifs_writev( ctx->iter = *from; ctx->len = len; } else { - rc = setup_aio_ctx_iter(ctx, from, WRITE); + rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE); if (rc) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; @@ -4263,7 +4285,7 @@ static ssize_t __cifs_readv( ctx->iter = *to; ctx->len = len; } else { - rc = setup_aio_ctx_iter(ctx, to, READ); + rc = setup_aio_ctx_iter(ctx, to, ITER_DEST); if (rc) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; @@ -4271,6 +4293,15 @@ static ssize_t __cifs_readv( len = ctx->len; } + if (direct) { + rc = filemap_write_and_wait_range(file->f_inode->i_mapping, + offset, offset + len - 1); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EAGAIN; + } + } + /* grab a lock here due to read response handlers can access ctx */ mutex_lock(&ctx->aio_mutex); @@ -5209,7 +5240,6 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) const struct address_space_operations cifs_addr_ops = { .read_folio = cifs_read_folio, .readahead = cifs_readahead, - .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, @@ -5218,10 +5248,10 @@ const struct address_space_operations cifs_addr_ops = { .direct_IO = cifs_direct_io, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, + .migrate_folio = filemap_migrate_folio, /* - * TODO: investigate and if useful we could add an cifs_migratePage - * helper (under an CONFIG_MIGRATION) in the future, and also - * investigate and add an is_dirty_writeback helper if needed + * TODO: investigate and if useful we could add an is_dirty_writeback + * helper if needed */ .swap_activate = cifs_swap_activate, .swap_deactivate = cifs_swap_deactivate, @@ -5234,7 +5264,6 @@ const struct address_space_operations cifs_addr_ops = { */ const struct address_space_operations cifs_addr_ops_smallbuf = { .read_folio = cifs_read_folio, - .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, @@ -5242,4 +5271,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .release_folio = cifs_release_folio, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, + .migrate_folio = filemap_migrate_folio, }; diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 0e13dec86b25..2c92a821e028 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -791,6 +791,13 @@ do { \ cifs_sb->ctx->field = NULL; \ } while (0) +#define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \ +do { \ + kfree_sensitive(ctx->field); \ + ctx->field = 
cifs_sb->ctx->field; \ + cifs_sb->ctx->field = NULL; \ +} while (0) + static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); @@ -811,7 +818,7 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); - STEAL_STRING(cifs_sb, ctx, password); + STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); @@ -877,16 +884,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->nodfs = 1; break; case Opt_hard: - if (result.negated) + if (result.negated) { + if (ctx->retry == 1) + cifs_dbg(VFS, "conflicting hard vs. soft mount options\n"); ctx->retry = 0; - else + } else ctx->retry = 1; break; case Opt_soft: if (result.negated) ctx->retry = 1; - else + else { + if (ctx->retry == 1) + cifs_dbg(VFS, "conflicting hard vs soft mount options\n"); ctx->retry = 0; + } break; case Opt_mapposix: if (result.negated) @@ -1162,7 +1174,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } break; case Opt_pass: - kfree(ctx->password); + kfree_sensitive(ctx->password); ctx->password = NULL; if (strlen(param->string) == 0) break; @@ -1470,6 +1482,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, return 0; cifs_parse_mount_err: + kfree_sensitive(ctx->password); return -EINVAL; } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 23ef56f55ce5..f6f3a6b75601 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -45,7 +45,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) memset(&key, 0, sizeof(key)); - sharename = extract_sharename(tcon->treeName); + sharename = extract_sharename(tcon->tree_name); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); return -EINVAL; @@ -150,7 +150,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -180,7 +180,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index bac08c20f559..286a5400b94e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -210,6 +210,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) */ inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; } + + if (S_ISLNK(fattr->cf_mode)) { + kfree(cifs_i->symlink_target); + cifs_i->symlink_target = fattr->cf_symlink_target; + fattr->cf_symlink_target = NULL; + } spin_unlock(&inode->i_lock); if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) @@ -347,13 +353,22 @@ cifs_get_file_info_unix(struct file *filp) int rc; unsigned int xid; FILE_UNIX_BASIC_INFO find_data; - struct cifs_fattr fattr; + struct cifs_fattr fattr = {}; struct inode *inode = file_inode(filp); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = 
tlink_tcon(cfile->tlink); xid = get_xid(); + + if (cfile->symlink_target) { + fattr.cf_symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!fattr.cf_symlink_target) { + rc = -ENOMEM; + goto cifs_gfiunix_out; + } + } + rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data); if (!rc) { cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); @@ -378,6 +393,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, FILE_UNIX_BASIC_INFO find_data; struct cifs_fattr fattr; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -387,10 +403,12 @@ int cifs_get_inode_info_unix(struct inode **pinode, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; /* could have done a find first instead but this returns more info */ rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, cifs_sb->local_nls, cifs_remap(cifs_sb)); + cifs_dbg(FYI, "%s: query path info: rc = %d\n", __func__, rc); cifs_put_tlink(tlink); if (!rc) { @@ -410,6 +428,17 @@ int cifs_get_inode_info_unix(struct inode **pinode, cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } + if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) { + if (!server->ops->query_symlink) + return -EOPNOTSUPP; + rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, + &fattr.cf_symlink_target, false); + if (rc) { + cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); + goto cgiiu_exit; + } + } + if (*pinode == NULL) { /* get new inode */ cifs_fill_uniqueid(sb, &fattr); @@ -432,6 +461,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, } cgiiu_exit: + kfree(fattr.cf_symlink_target); return rc; } #else @@ -601,10 +631,12 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from POSIX info struct */ -static void -smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *info, - struct super_block *sb, bool adjust_tz, bool symlink) +static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + struct super_block *sb, bool adjust_tz, bool symlink) { + struct smb311_posix_qinfo *info = &data->posix_fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -639,6 +671,8 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * if (symlink) { fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode |= S_IFDIR; fattr->cf_dtype = DT_DIR; @@ -648,20 +682,18 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * } /* else if reparse point ... 
TODO: add support for FIFO and blk dev; special file types */ - fattr->cf_uid = cifs_sb->ctx->linux_uid; /* TODO: map uid and gid from SID */ - fattr->cf_gid = cifs_sb->ctx->linux_gid; + sid_to_id(cifs_sb, owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, group, fattr, SIDGROUP); cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n", fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); } - -/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ -static void -cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, - struct super_block *sb, bool adjust_tz, - bool symlink, u32 reparse_tag) +static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct super_block *sb, bool adjust_tz, bool symlink, + u32 reparse_tag) { + struct smb2_file_all_info *info = &data->fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -703,7 +735,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) { fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; fattr->cf_dtype = DT_BLK; - } else if (symlink) { /* TODO add more reparse tag checks */ + } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK || + reparse_tag == IO_REPARSE_TAG_NFS) { fattr->cf_mode = S_IFLNK; fattr->cf_dtype = DT_LNK; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { @@ -735,6 +768,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, } } + if (S_ISLNK(fattr->cf_mode)) { + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; + } + fattr->cf_uid = cifs_sb->ctx->linux_uid; fattr->cf_gid = cifs_sb->ctx->linux_gid; } @@ -744,23 +782,28 @@ cifs_get_file_info(struct file *filp) { int rc; unsigned int xid; - FILE_ALL_INFO find_data; + struct cifs_open_info_data data = {}; struct cifs_fattr fattr; struct inode *inode = file_inode(filp); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; + bool symlink = false; + u32 tag = 0; if (!server->ops->query_file_info) return -ENOSYS; xid = get_xid(); - rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); + rc = server->ops->query_file_info(xid, tcon, cfile, &data); switch (rc) { case 0: /* TODO: add support to query reparse tag */ - cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, - false, 0 /* no reparse tag */); + if (data.symlink_target) { + symlink = true; + tag = IO_REPARSE_TAG_SYMLINK; + } + cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag); break; case -EREMOTE: cifs_create_dfs_fattr(&fattr, inode->i_sb); @@ -789,6 +832,7 @@ cifs_get_file_info(struct file *filp) /* if filetype is different, return error */ rc = cifs_fattr_to_inode(inode, &fattr); cgfi_exit: + cifs_free_open_info(&data); free_xid(xid); return rc; } @@ -860,14 +904,9 @@ cifs_backup_query_path_info(int xid, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -static void -cifs_set_fattr_ino(int xid, - struct cifs_tcon *tcon, - struct super_block *sb, - struct inode **inode, - const char *full_path, - FILE_ALL_INFO *data, - struct cifs_fattr *fattr) +static void cifs_set_fattr_ino(int xid, struct cifs_tcon *tcon, struct super_block *sb, + struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct cifs_fattr *fattr) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct TCP_Server_Info *server = tcon->ses->server; @@ 
-885,11 +924,8 @@ cifs_set_fattr_ino(int xid, * If we have an inode pass a NULL tcon to ensure we don't * make a round trip to the server. This only works for SMB2+. */ - rc = server->ops->get_srv_inum(xid, - *inode ? NULL : tcon, - cifs_sb, full_path, - &fattr->cf_uniqueid, - data); + rc = server->ops->get_srv_inum(xid, *inode ? NULL : tcon, cifs_sb, full_path, + &fattr->cf_uniqueid, data); if (rc) { /* * If that fails reuse existing ino or generate one @@ -913,7 +949,7 @@ cifs_set_fattr_ino(int xid, } else { /* make an ino by hashing the UNC */ fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO; - fattr->cf_uniqueid = simple_hashstr(tcon->treeName); + fattr->cf_uniqueid = simple_hashstr(tcon->tree_name); } } } @@ -923,14 +959,10 @@ static inline bool is_inode_cache_good(struct inode *ino) return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0; } -int -cifs_get_inode_info(struct inode **inode, - const char *full_path, - FILE_ALL_INFO *in_data, - struct super_block *sb, int xid, - const struct cifs_fid *fid) +int cifs_get_inode_info(struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct super_block *sb, int xid, + const struct cifs_fid *fid) { - struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; @@ -938,8 +970,7 @@ cifs_get_inode_info(struct inode **inode, bool adjust_tz = false; struct cifs_fattr fattr = {0}; bool is_reparse_point = false; - FILE_ALL_INFO *data = in_data; - FILE_ALL_INFO *tmp_data = NULL; + struct cifs_open_info_data tmp_data = {}; void *smb1_backup_rsp_buf = NULL; int rc = 0; int tmprc = 0; @@ -960,21 +991,15 @@ cifs_get_inode_info(struct inode **inode, cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto out; } - tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!tmp_data) { - rc = -ENOMEM; - goto out; - } - rc = server->ops->query_path_info(xid, tcon, cifs_sb, - full_path, tmp_data, - &adjust_tz, &is_reparse_point); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data, + &adjust_tz, &is_reparse_point); #ifdef CONFIG_CIFS_DFS_UPCALL if (rc == -ENOENT && is_tcon_dfs(tcon)) rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, full_path); #endif - data = tmp_data; + data = &tmp_data; } /* @@ -988,14 +1013,24 @@ cifs_get_inode_info(struct inode **inode, * since we have to check if its reparse tag matches a known * special file type e.g. symlink or fifo or char etc. 
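Stepping back from this hunk: the recurring change in fs/cifs/inode.c is that the old heap-allocated FILE_ALL_INFO and smb311_posix_qinfo query buffers become a caller-owned struct cifs_open_info_data, zero-initialized on the stack and released exactly once through cifs_free_open_info(), which also owns the kstrdup'd symlink target. The struct definition is not part of this excerpt; a plausible shape, inferred only from the accesses visible here (data->fi, data->posix_fi, data->symlink_target):

/* Inferred layout; the real definition lives elsewhere in the cifs headers. */
struct cifs_open_info_data {
	char *symlink_target;	/* kstrdup'd target path, owned by this struct */
	union {
		struct smb2_file_all_info fi;
		struct smb311_posix_qinfo posix_fi;
	};
};

static inline void cifs_free_open_info(struct cifs_open_info_data *data)
{
	kfree(data->symlink_target);
	data->symlink_target = NULL;
}

The design win is that error paths no longer need kmalloc/kfree pairs for a fixed-size query buffer, and ownership of the target string is explicit: whoever holds the cifs_open_info_data frees it, or steals the pointer and NULLs it first, as cifs_open_info_to_fattr() and cifs_fattr_to_inode() do above.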
*/ - if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) && - server->ops->query_reparse_tag) { - rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, - full_path, &reparse_tag); - cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag); + if (is_reparse_point && data->symlink_target) { + reparse_tag = IO_REPARSE_TAG_SYMLINK; + } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) && + server->ops->query_reparse_tag) { + tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path, + &reparse_tag); + if (tmprc) + cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc); + if (server->ops->query_symlink) { + tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, + &data->symlink_target, + is_reparse_point); + if (tmprc) + cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__, + tmprc); + } } - cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, - is_reparse_point, reparse_tag); + cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1014,18 +1049,20 @@ cifs_get_inode_info(struct inode **inode, */ if (backup_cred(cifs_sb) && is_smb1_server(server)) { /* for easier reading */ + FILE_ALL_INFO *fi; FILE_DIRECTORY_INFO *fdi; SEARCH_ID_FULL_DIR_INFO *si; rc = cifs_backup_query_path_info(xid, tcon, sb, full_path, &smb1_backup_rsp_buf, - &data); + &fi); if (rc) goto out; - fdi = (FILE_DIRECTORY_INFO *)data; - si = (SEARCH_ID_FULL_DIR_INFO *)data; + move_cifs_info_to_smb2(&data->fi, fi); + fdi = (FILE_DIRECTORY_INFO *)fi; + si = (SEARCH_ID_FULL_DIR_INFO *)fi; cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb); fattr.cf_uniqueid = le64_to_cpu(si->UniqueId); @@ -1123,7 +1160,8 @@ handle_mnt_opt: out: cifs_buf_release(smb1_backup_rsp_buf); cifs_put_tlink(tlink); - kfree(tmp_data); + cifs_free_open_info(&tmp_data); + kfree(fattr.cf_symlink_target); return rc; } @@ -1138,7 +1176,8 @@ smb311_posix_get_inode_info(struct inode **inode, bool adjust_tz = false; struct cifs_fattr fattr = {0}; bool symlink = false; - struct smb311_posix_qinfo *data = NULL; + struct cifs_open_info_data data = {}; + struct cifs_sid owner, group; int rc = 0; int tmprc = 0; @@ -1155,15 +1194,10 @@ smb311_posix_get_inode_info(struct inode **inode, cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto out; } - data = kmalloc(sizeof(struct smb311_posix_qinfo), GFP_KERNEL); - if (!data) { - rc = -ENOMEM; - goto out; - } - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, data, - &adjust_tz, &symlink); + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, + &owner, &group, &adjust_tz, + &symlink); /* * 2. 
Convert it to internal cifs metadata (fattr) @@ -1171,7 +1205,8 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(&fattr, &data, &owner, &group, + sb, adjust_tz, symlink); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1228,7 +1263,8 @@ smb311_posix_get_inode_info(struct inode **inode, } out: cifs_put_tlink(tlink); - kfree(data); + cifs_free_open_info(&data); + kfree(fattr.cf_symlink_target); return rc; } @@ -2265,13 +2301,13 @@ cifs_dentry_needs_reval(struct dentry *dentry) return true; if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { - mutex_lock(&cfid->fid_mutex); + spin_lock(&cfid->fid_lock); if (cfid->time && cifs_i->time > cfid->time) { - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); return false; } - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); } /* @@ -2327,7 +2363,7 @@ cifs_invalidate_mapping(struct inode *inode) static int cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; @@ -2345,7 +2381,7 @@ cifs_revalidate_mapping(struct inode *inode) return 0; rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (rc) return rc; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index b6e6e5d6c8dd..6419ec47c2a8 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -343,7 +343,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE, (int __user *)arg); - if (rc != EOPNOTSUPP) + if (rc != -EOPNOTSUPP) break; } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ @@ -373,7 +373,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) * pSMBFile->fid.netfid, * extAttrBits, * &ExtAttrMask); - * if (rc != EOPNOTSUPP) + * if (rc != -EOPNOTSUPP) * break; */ @@ -484,12 +484,35 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(tlink); if (tcon && tcon->ses->server->ops->notify) { rc = tcon->ses->server->ops->notify(xid, - filep, (void __user *)arg); + filep, (void __user *)arg, + false /* no ret data */); cifs_dbg(FYI, "ioctl notify rc %d\n", rc); } else rc = -EOPNOTSUPP; cifs_put_tlink(tlink); break; + case CIFS_IOC_NOTIFY_INFO: + if (!S_ISDIR(inode->i_mode)) { + /* Notify can only be done on directories */ + rc = -EOPNOTSUPP; + break; + } + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + tcon = tlink_tcon(tlink); + if (tcon && tcon->ses->server->ops->notify) { + rc = tcon->ses->server->ops->notify(xid, + filep, (void __user *)arg, + true /* return details */); + cifs_dbg(FYI, "ioctl notify info rc %d\n", rc); + } else + rc = -EOPNOTSUPP; + cifs_put_tlink(tlink); + break; case CIFS_IOC_SHUTDOWN: rc = cifs_shutdown(inode->i_sb, arg); break; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 6803cb27eecc..bd374feeccaa 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -38,29 +38,28 @@ static int symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) { int rc; - struct crypto_shash *md5 = NULL; - struct sdesc *sdescmd5 = NULL; + struct shash_desc *md5 = NULL; - rc = cifs_alloc_hash("md5", &md5, &sdescmd5); + rc = 
cifs_alloc_hash("md5", &md5); if (rc) goto symlink_hash_err; - rc = crypto_shash_init(&sdescmd5->shash); + rc = crypto_shash_init(md5); if (rc) { cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } - rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_update(md5, link_str, link_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); goto symlink_hash_err; } - rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + rc = crypto_shash_final(md5, md5_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: - cifs_free_hash(&md5, &sdescmd5); + cifs_free_hash(&md5); return rc; } @@ -202,40 +201,6 @@ out: return rc; } -static int -query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const unsigned char *path, - char **symlinkinfo) -{ - int rc; - u8 *buf = NULL; - unsigned int link_len = 0; - unsigned int bytes_read = 0; - - buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (tcon->ses->server->ops->query_mf_symlink) - rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, - cifs_sb, path, buf, &bytes_read); - else - rc = -ENOSYS; - - if (rc) - goto out; - - if (bytes_read == 0) { /* not a symlink */ - rc = -EINVAL; - goto out; - } - - rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo); -out: - kfree(buf); - return rc; -} - int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -245,6 +210,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; + char *symlink = NULL; if (!couldbe_mf_symlink(fattr)) /* it's not a symlink */ @@ -266,7 +232,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (bytes_read == 0) /* not a symlink */ goto out; - rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); + rc = parse_mf_symlink(buf, bytes_read, &link_len, &symlink); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -281,6 +247,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, fattr->cf_mode &= ~S_IFMT; fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; fattr->cf_dtype = DT_LNK; + fattr->cf_symlink_target = symlink; out: kfree(buf); return rc; @@ -600,75 +567,6 @@ cifs_hl_exit: return rc; } -const char * -cifs_get_link(struct dentry *direntry, struct inode *inode, - struct delayed_call *done) -{ - int rc = -ENOMEM; - unsigned int xid; - const char *full_path; - void *page; - char *target_path = NULL; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct tcon_link *tlink = NULL; - struct cifs_tcon *tcon; - struct TCP_Server_Info *server; - - if (!direntry) - return ERR_PTR(-ECHILD); - - xid = get_xid(); - - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - free_xid(xid); - return ERR_CAST(tlink); - } - tcon = tlink_tcon(tlink); - server = tcon->ses->server; - - page = alloc_dentry_path(); - full_path = build_path_from_dentry(direntry, page); - if (IS_ERR(full_path)) { - free_xid(xid); - cifs_put_tlink(tlink); - free_dentry_path(page); - return ERR_CAST(full_path); - } - - cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); - - rc = -EACCES; - /* - * First try Minshall+French Symlinks, if configured - * and fallback to UNIX Extensions Symlinks. 
- */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, - &target_path); - - if (rc != 0 && server->ops->query_symlink) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); - bool reparse_point = false; - - if (cifsi->cifsAttrs & ATTR_REPARSE) - reparse_point = true; - - rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &target_path, reparse_point); - } - - free_dentry_path(page); - free_xid(xid); - cifs_put_tlink(tlink); - if (rc != 0) { - kfree(target_path); - return ERR_PTR(rc); - } - set_delayed_call(done, kfree_link, target_path); - return target_path; -} - int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct dentry *direntry, const char *symname) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 87f60f736731..1cbecd64d697 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -117,8 +117,8 @@ tconInfoAlloc(void) ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); if (!ret_buf) return NULL; - ret_buf->cfid = init_cached_dir(); - if (!ret_buf->cfid) { + ret_buf->cfids = init_cached_dirs(); + if (!ret_buf->cfids) { kfree(ret_buf); return NULL; } @@ -144,7 +144,7 @@ tconInfoFree(struct cifs_tcon *tcon) cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n"); return; } - free_cached_dir(tcon); + free_cached_dirs(tcon->cfids); atomic_dec(&tconInfoAllocCount); kfree(tcon->nativeFileSystem); kfree_sensitive(tcon->password); @@ -400,6 +400,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) { struct smb_hdr *buf = (struct smb_hdr *)buffer; struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsInodeInfo *pCifsInode; @@ -464,9 +465,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) return false; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(srv) ? srv->primary_server : srv; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &srv->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid != buf->Tid) continue; @@ -525,7 +529,7 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; cifs_sb->mnt_cifs_serverino_autodisabled = true; cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n", - tcon ? tcon->treeName : "new server"); + tcon ? tcon->tree_name : "new server"); cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS)\n"); cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. 
Consider mounting with the \"noserverino\" option to silence this message.\n"); @@ -824,7 +828,7 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) free_dentry_path(page); } -/* parses DFS refferal V3 structure +/* parses DFS referral V3 structure * caller is responsible for freeing target_nodes * returns: * - on success - 0 @@ -1071,59 +1075,58 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) /** * cifs_alloc_hash - allocate hash and hash context together * @name: The name of the crypto hash algo - * @shash: Where to put the pointer to the hash algo - * @sdesc: Where to put the pointer to the hash descriptor + * @sdesc: SHASH descriptor where to put the pointer to the hash TFM * * The caller has to make sure @sdesc is initialized to either NULL or - * a valid context. Both can be freed via cifs_free_hash(). + * a valid context. It can be freed via cifs_free_hash(). */ int -cifs_alloc_hash(const char *name, - struct crypto_shash **shash, struct sdesc **sdesc) +cifs_alloc_hash(const char *name, struct shash_desc **sdesc) { int rc = 0; - size_t size; + struct crypto_shash *alg = NULL; - if (*sdesc != NULL) + if (*sdesc) return 0; - *shash = crypto_alloc_shash(name, 0, 0); - if (IS_ERR(*shash)) { - cifs_dbg(VFS, "Could not allocate crypto %s\n", name); - rc = PTR_ERR(*shash); - *shash = NULL; + alg = crypto_alloc_shash(name, 0, 0); + if (IS_ERR(alg)) { + cifs_dbg(VFS, "Could not allocate shash TFM '%s'\n", name); + rc = PTR_ERR(alg); *sdesc = NULL; return rc; } - size = sizeof(struct shash_desc) + crypto_shash_descsize(*shash); - *sdesc = kmalloc(size, GFP_KERNEL); + *sdesc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(alg), GFP_KERNEL); if (*sdesc == NULL) { - cifs_dbg(VFS, "no memory left to allocate crypto %s\n", name); - crypto_free_shash(*shash); - *shash = NULL; + cifs_dbg(VFS, "no memory left to allocate shash TFM '%s'\n", name); + crypto_free_shash(alg); return -ENOMEM; } - (*sdesc)->shash.tfm = *shash; + (*sdesc)->tfm = alg; return 0; } /** * cifs_free_hash - free hash and hash context together - * @shash: Where to find the pointer to the hash algo - * @sdesc: Where to find the pointer to the hash descriptor + * @sdesc: Where to find the pointer to the hash TFM * - * Freeing a NULL hash or context is safe. + * Freeing a NULL descriptor is safe. */ void -cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) +cifs_free_hash(struct shash_desc **sdesc) { - kfree(*sdesc); + if (unlikely(!sdesc) || !*sdesc) + return; + + if ((*sdesc)->tfm) { + crypto_free_shash((*sdesc)->tfm); + (*sdesc)->tfm = NULL; + } + + kfree_sensitive(*sdesc); *sdesc = NULL; - if (*shash) - crypto_free_shash(*shash); - *shash = NULL; } /** @@ -1133,8 +1136,8 @@ cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) * @len: Where to store the length for this page: * @offset: Where to store the offset for this page */ -void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset) +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset) { *len = rqst->rq_pagesz; *offset = (page == 0) ? 
rqst->rq_offset : 0; @@ -1328,7 +1331,7 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, char *treename, *dfspath, sep; int treenamelen, linkpathlen, rc; - treename = tcon->treeName; + treename = tcon->tree_name; /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL * messages MUST be encoded with exactly one leading backslash, not two * leading backslashes. diff --git a/fs/cifs/netlink.c b/fs/cifs/netlink.c index 291cb606f149..147d9409252c 100644 --- a/fs/cifs/netlink.c +++ b/fs/cifs/netlink.c @@ -51,6 +51,7 @@ struct genl_family cifs_genl_family = { .policy = cifs_genl_policy, .ops = cifs_genl_ops, .n_ops = ARRAY_SIZE(cifs_genl_ops), + .resv_start_op = CIFS_GENL_CMD_SWN_NOTIFY + 1, .mcgrps = cifs_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(cifs_genl_mcgrps), }; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 8e060c00c969..2d75ba5aaa8a 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -844,17 +844,34 @@ static bool emit_cached_dirents(struct cached_dirents *cde, struct dir_context *ctx) { struct cached_dirent *dirent; - int rc; + bool rc; list_for_each_entry(dirent, &cde->entries, entry) { - if (ctx->pos >= dirent->pos) + /* + * Skip all entries prior to the current lseek() + * position. + */ + if (ctx->pos > dirent->pos) continue; + /* + * We recorded the current ->pos value for the dirent + * when we stored it in the cache. + * However, this sequence of ->pos values may have holes + * in it, for example dot-dirs returned from the server + * are suppressed. + * Handle this by forcing ctx->pos to be the same as the + * ->pos of the current dirent we emit from the cache. + * This means that when we emit these entries from the cache + * we now emit them with the same ->pos value as in the + * initial scan.
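The position bookkeeping in emit_cached_dirents() deserves a plain statement: entries were cached with the ->pos they had during the initial scan, that sequence may contain holes, and a replay must emit each entry at its recorded position and then advance by one. A self-contained userspace illustration of the same replay rule (ordinary C, not kernel code):

#include <stdio.h>

struct cached_dirent { long long pos; const char *name; };

/* Replay cached entries for a reader at *ctx_pos; cached positions may
 * have gaps (e.g. suppressed dot-dirs), mirroring the loop above. */
static void emit_cached(const struct cached_dirent *ents, int n,
			long long *ctx_pos)
{
	for (int i = 0; i < n; i++) {
		if (*ctx_pos > ents[i].pos)	/* skip entries before the seek pos */
			continue;
		*ctx_pos = ents[i].pos;		/* emit at the recorded position */
		printf("pos %lld: %s\n", *ctx_pos, ents[i].name);
		(*ctx_pos)++;			/* then advance past it */
	}
}

int main(void)
{
	const struct cached_dirent ents[] = {
		{ 2, "a" }, { 3, "b" }, { 7, "c" },	/* hole between 3 and 7 */
	};
	long long pos = 3;	/* resume mid-stream, as after an lseek() */

	emit_cached(ents, 3, &pos);	/* prints "b" at pos 3, then "c" at pos 7 */
	return 0;
}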
+ */ ctx->pos = dirent->pos; rc = dir_emit(ctx, dirent->name, dirent->namelen, dirent->fattr.cf_uniqueid, dirent->fattr.cf_dtype); if (!rc) return rc; + ctx->pos++; } return true; } @@ -994,6 +1011,8 @@ static int cifs_filldir(char *find_entry, struct file *file, cifs_unix_basic_to_fattr(&fattr, &((FILE_UNIX_INFO *)find_entry)->basic, cifs_sb); + if (S_ISLNK(fattr.cf_mode)) + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; break; case SMB_FIND_FILE_INFO_STANDARD: cifs_std_info_to_fattr(&fattr, @@ -1202,10 +1221,10 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) ctx->pos, tmp_buf); cifs_save_resume_key(current_entry, cifsFile); break; - } else - current_entry = - nxt_dir_entry(current_entry, end_of_smb, - cifsFile->srch_inf.info_level); + } + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); } kfree(tmp_buf); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3af3b05b6c74..9e7d9f0baa18 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -302,14 +302,14 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) /* now drop the ref to the current iface */ if (old_iface && iface) { - kref_put(&old_iface->refcount, release_iface); cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", &old_iface->sockaddr, &iface->sockaddr); - } else if (old_iface) { kref_put(&old_iface->refcount, release_iface); + } else if (old_iface) { cifs_dbg(FYI, "releasing ref to iface: %pIS\n", &old_iface->sockaddr); + kref_put(&old_iface->refcount, release_iface); } else { WARN_ON(!iface); cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); @@ -496,6 +496,7 @@ out: cifs_put_tcp_session(chan->server, 0); } + free_xid(xid); return rc; } @@ -601,11 +602,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* BB FIXME add check that strings total less than 335 or will need to send them as arrays */ - /* unicode strings, must be word aligned before the call */ -/* if ((long) bcc_ptr % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } */ /* copy user */ if (ses->user_name == NULL) { /* null user mount */ @@ -1213,10 +1209,18 @@ out_free_smb_buf: static void sess_free_buffer(struct sess_data *sess_data) { + struct kvec *iov = sess_data->iov; + + /* + * Zero the session data before freeing, as it might contain sensitive info (keys, etc). + * Note that iov[1] is already freed by caller. 
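A brief note on the scrubbing pattern introduced here and in the kfree_sensitive() conversions that follow: a plain memset() immediately before a free can be deleted by the compiler as a dead store, so key material must be cleared with a primitive the optimizer is forced to keep (memzero_explicit() in the kernel; explicit_bzero() is the usual userspace analogue). A sketch of the idea:

#include <linux/slab.h>
#include <linux/string.h>

/* Scrub-then-free for a buffer that held secrets.  kfree_sensitive()
 * bundles the same sequence for kmalloc'd memory, using ksize() to
 * determine how much to wipe. */
static void demo_free_secret(void *buf, size_t len)
{
	if (!buf)
		return;
	memzero_explicit(buf, len);	/* barriered memset: cannot be elided */
	kfree(buf);
}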
+ */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base) + memzero_explicit(iov[0].iov_base, iov[0].iov_len); - free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + free_rsp_buf(sess_data->buf0_type, iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; - kfree(sess_data->iov[2].iov_base); + kfree_sensitive(iov[2].iov_base); } static int @@ -1318,7 +1322,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) } if (ses->capabilities & CAP_UNICODE) { - if (sess_data->iov[0].iov_len % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1358,7 +1362,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1374,7 +1378,7 @@ out: sess_data->result = rc; sess_data->func = NULL; sess_free_buffer(sess_data); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; } @@ -1442,8 +1446,7 @@ sess_auth_kerberos(struct sess_data *sess_data) if (ses->capabilities & CAP_UNICODE) { /* unicode strings must be word aligned */ - if ((sess_data->iov[0].iov_len - + sess_data->iov[1].iov_len) % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1494,7 +1497,7 @@ sess_auth_kerberos(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1513,7 +1516,7 @@ out: sess_data->result = rc; sess_data->func = NULL; sess_free_buffer(sess_data); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; } @@ -1546,7 +1549,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) bcc_ptr = sess_data->iov[2].iov_base; /* unicode strings must be word aligned */ - if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1648,7 +1651,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); out_free_ntlmsspblob: - kfree(ntlmsspblob); + kfree_sensitive(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1658,9 +1661,9 @@ out: } /* Else error. 
Cleanup */ - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->func = NULL; @@ -1747,7 +1750,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1759,7 +1762,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) } out_free_ntlmsspblob: - kfree(ntlmsspblob); + kfree_sensitive(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1767,9 +1770,9 @@ out: rc = sess_establish_session(sess_data); /* Cleanup */ - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->func = NULL; @@ -1845,7 +1848,7 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, rc = sess_data->result; out: - kfree(sess_data); + kfree_sensitive(sess_data); return rc; } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index f36b2d2d40ca..50480751e521 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -542,31 +542,32 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink) +static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink) { int rc; + FILE_ALL_INFO fi = {}; *symlink = false; /* could do find first instead but this returns more info */ - rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, + cifs_remap(cifs_sb)); /* * BB optimize code so we do not make the above call when server claims * no NT SMB support and the above call failed at least once - set flag * in tcon or mount. 
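One small idiom from the sess.c hunks above: the `% 2` parity tests on buffer lengths become IS_ALIGNED(len, 2), which states the intent (UTF-16 string areas must start word-aligned) rather than the arithmetic. For reference, the kernel macro reduces to a mask test, approximately:

/* Paraphrased from include/linux/align.h; the alignment must be a power of 2. */
#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)

/* So IS_ALIGNED(len, 2) is !(len & 1): true when the length is even and
 * no pad byte is needed before a unicode string area. */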
*/ if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { - rc = SMBQueryInformation(xid, tcon, full_path, data, - cifs_sb->local_nls, + rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (!rc) + move_cifs_info_to_smb2(&data->fi, &fi); *adjustTZ = true; } - if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { + if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) { int tmprc; int oplock = 0; struct cifs_fid fid; @@ -592,10 +593,9 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - u64 *uniqueid, FILE_ALL_INFO *data) +static int cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, struct cifs_open_info_data *unused) { /* * We can not use the IndexNumber field by default from Windows or @@ -613,11 +613,22 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, cifs_remap(cifs_sb)); } -static int -cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *fid, FILE_ALL_INFO *data) +static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data) { - return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); + int rc; + FILE_ALL_INFO fi = {}; + + if (cfile->symlink_target) { + data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!data->symlink_target) + return -ENOMEM; + } + + rc = CIFSSMBQFileInfo(xid, tcon, cfile->fid.netfid, &fi); + if (!rc) + move_cifs_info_to_smb2(&data->fi, &fi); + return rc; } static void @@ -702,19 +713,20 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path, cifsInode->cifsAttrs = dosattrs; } -static int -cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf) +static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf) { + FILE_ALL_INFO *fi = buf; + if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS)) return SMBLegacyOpen(xid, oparms->tcon, oparms->path, oparms->disposition, oparms->desired_access, oparms->create_options, - &oparms->fid->netfid, oplock, buf, + &oparms->fid->netfid, oplock, fi, oparms->cifs_sb->local_nls, cifs_remap(oparms->cifs_sb)); - return CIFS_open(xid, oparms, oplock, buf); + return CIFS_open(xid, oparms, oplock, fi); } static void diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 9dfd2dd612c2..ffbd9a99fc12 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -20,40 +20,125 @@ #include "cifs_unicode.h" #include "fscache.h" #include "smb2proto.h" +#include "smb2status.h" -int -smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf) +static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) +{ + struct smb2_err_rsp *err = iov->iov_base; + struct smb2_symlink_err_rsp *sym = ERR_PTR(-EINVAL); + u32 len; + + if (err->ErrorContextCount) { + struct smb2_error_context_rsp *p, *end; + + len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp, + ErrorContextData) + + sizeof(struct smb2_symlink_err_rsp)); + if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err)) + return ERR_PTR(-EINVAL); + + p = (struct smb2_error_context_rsp *)err->ErrorData; + end = (struct smb2_error_context_rsp *)((u8 
*)err + iov->iov_len); + do { + if (le32_to_cpu(p->ErrorId) == SMB2_ERROR_ID_DEFAULT) { + sym = (struct smb2_symlink_err_rsp *)&p->ErrorContextData; + break; + } + cifs_dbg(FYI, "%s: skipping unhandled error context: 0x%x\n", + __func__, le32_to_cpu(p->ErrorId)); + + len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8); + p = (struct smb2_error_context_rsp *)((u8 *)&p->ErrorContextData + len); + } while (p < end); + } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) && + iov->iov_len >= SMB2_SYMLINK_STRUCT_SIZE) { + sym = (struct smb2_symlink_err_rsp *)err->ErrorData; + } + + if (!IS_ERR(sym) && (le32_to_cpu(sym->SymLinkErrorTag) != SYMLINK_ERROR_TAG || + le32_to_cpu(sym->ReparseTag) != IO_REPARSE_TAG_SYMLINK)) + sym = ERR_PTR(-EINVAL); + + return sym; +} + +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path) +{ + struct smb2_symlink_err_rsp *sym; + unsigned int sub_offs, sub_len; + unsigned int print_offs, print_len; + char *s; + + if (!cifs_sb || !iov || !iov->iov_base || !iov->iov_len || !path) + return -EINVAL; + + sym = symlink_data(iov); + if (IS_ERR(sym)) + return PTR_ERR(sym); + + sub_len = le16_to_cpu(sym->SubstituteNameLength); + sub_offs = le16_to_cpu(sym->SubstituteNameOffset); + print_len = le16_to_cpu(sym->PrintNameLength); + print_offs = le16_to_cpu(sym->PrintNameOffset); + + if (iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offs + sub_len || + iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + print_offs + print_len) + return -EINVAL; + + s = cifs_strndup_from_utf16((char *)sym->PathBuffer + sub_offs, sub_len, true, + cifs_sb->local_nls); + if (!s) + return -ENOMEM; + convert_delimiter(s, '/'); + cifs_dbg(FYI, "%s: symlink target: %s\n", __func__, s); + + *path = s; + return 0; +} + +int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf) { int rc; __le16 *smb2_path; - struct smb2_file_all_info *smb2_data = NULL; __u8 smb2_oplock; + struct cifs_open_info_data *data = buf; + struct smb2_file_all_info file_info = {}; + struct smb2_file_all_info *smb2_data = data ? 
&file_info : NULL; + struct kvec err_iov = {}; + int err_buftype = CIFS_NO_BUFFER; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); - if (smb2_path == NULL) { - rc = -ENOMEM; - goto out; - } - - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) { - rc = -ENOMEM; - goto out; - } + if (smb2_path == NULL) + return -ENOMEM; oparms->desired_access |= FILE_READ_ATTRIBUTES; smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; - rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, - NULL, NULL); + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, + &err_buftype); + if (rc && data) { + struct smb2_hdr *hdr = err_iov.iov_base; + + if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER)) + rc = -ENOMEM; + else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov, + &data->symlink_target); + if (!rc) { + memset(smb2_data, 0, sizeof(*smb2_data)); + oparms->create_options |= OPEN_REPARSE_POINT; + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, + NULL, NULL, NULL); + oparms->create_options &= ~OPEN_REPARSE_POINT; + } + } + } + if (rc) goto out; - if (oparms->tcon->use_resilient) { /* default timeout is 0, servers pick default (120 seconds) */ nr_ioctl_req.Timeout = @@ -73,7 +158,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, rc = 0; } - if (buf) { + if (smb2_data) { /* if open response does not have IndexNumber field - get it */ if (smb2_data->IndexNumber == 0) { rc = SMB2_get_srv_num(xid, oparms->tcon, @@ -89,12 +174,12 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, rc = 0; } } - move_smb2_info_to_cifs(buf, smb2_data); + memcpy(&data->fi, smb2_data, sizeof(data->fi)); } *oplock = smb2_oplock; out: - kfree(smb2_data); + free_rsp_buf(err_buftype, err_iov.iov_base); kfree(smb2_path); return rc; } diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index b83f59051b26..fbd46db1023a 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -24,6 +24,7 @@ #include "smb2pdu.h" #include "smb2proto.h" #include "cached_dir.h" +#include "smb2status.h" static void free_set_inf_compound(struct smb_rqst *rqst) @@ -50,13 +51,16 @@ struct cop_vars { /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. + * + * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all + * error responses. Caller is also responsible for freeing them up. 
*/ -static int -smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, - __u32 create_options, umode_t mode, void *ptr, int command, - struct cifsFileInfo *cfile) +static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 desired_access, __u32 create_disposition, __u32 create_options, + umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, + __u8 **extbuf, size_t *extbuflen, + struct kvec *err_iov, int *err_buftype) { struct cop_vars *vars = NULL; struct kvec *rsp_iov; @@ -70,6 +74,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, int num_rqst = 0; int resp_buftype[3]; struct smb2_query_info_rsp *qi_rsp = NULL; + struct cifs_open_info_data *idata; int flags = 0; __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; unsigned int size[2]; @@ -379,20 +384,25 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_open_free(&rqst[0]); if (rc == -EREMCHG) { - pr_warn_once("server share %s deleted\n", tcon->treeName); + pr_warn_once("server share %s deleted\n", tcon->tree_name); tcon->need_reconnect = true; } switch (command) { case SMB2_OP_QUERY_INFO: + idata = ptr; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } if (rc == 0) { qi_rsp = (struct smb2_query_info_rsp *) rsp_iov[1].iov_base; rc = smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(struct smb2_file_all_info), - ptr); + &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi); } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); @@ -406,13 +416,35 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, tcon->tid); break; case SMB2_OP_POSIX_QUERY_INFO: + idata = ptr; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } if (rc == 0) { qi_rsp = (struct smb2_query_info_rsp *) rsp_iov[1].iov_base; rc = smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(struct smb311_posix_qinfo) /* add SIDs */, ptr); + &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, + (char *)&idata->posix_fi); + } + if (rc == 0) { + unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); + + if (length > sizeof(idata->posix_fi)) { + char *base = (char *)rsp_iov[1].iov_base + + le16_to_cpu(qi_rsp->OutputBufferOffset) + + sizeof(idata->posix_fi); + *extbuflen = length - sizeof(idata->posix_fi); + *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); + if (!*extbuf) + rc = -ENOMEM; + } else { + rc = -EINVAL; + } } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); @@ -477,42 +509,33 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, free_set_inf_compound(rqst); break; } - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + + if (rc && err_iov && err_buftype) { + memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov)); + memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype)); + } else { + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + 
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + } kfree(vars); return rc; } -void -move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) -{ - memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src); - dst->CurrentByteOffset = src->CurrentByteOffset; - dst->Mode = src->Mode; - dst->AlignmentRequirement = src->AlignmentRequirement; - dst->IndexNumber1 = 0; /* we don't use it */ -} - -int -smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse) +int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) { int rc; - struct smb2_file_all_info *smb2_data; __u32 create_options = 0; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; + struct kvec err_iov[3] = {}; + int err_buftype[3] = {}; *adjust_tz = false; *reparse = false; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - if (strcmp(full_path, "")) rc = -ENOENT; else @@ -520,63 +543,65 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* If it is a root and its handle is cached then use it */ if (!rc) { if (cfid->file_all_info_is_valid) { - move_smb2_info_to_cifs(data, - &cfid->file_all_info); + memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi)); } else { - rc = SMB2_query_info(xid, tcon, - cfid->fid.persistent_fid, - cfid->fid.volatile_fid, smb2_data); - if (!rc) - move_smb2_info_to_cifs(data, smb2_data); + rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid, &data->fi); } close_cached_dir(cfid); - goto out; + return rc; } cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, + NULL, NULL, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { + if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && + ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && + ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); + if (rc) + goto out; + } *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE, data, + SMB2_OP_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); } - if (rc) - goto out; - move_smb2_info_to_cifs(data, smb2_data); out: - kfree(smb2_data); + free_rsp_buf(err_buftype[0], err_iov[0].iov_base); + free_rsp_buf(err_buftype[1], err_iov[1].iov_base); + free_rsp_buf(err_buftype[2], err_iov[2].iov_base); return rc; } -int -smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct smb311_posix_qinfo *data, bool *adjust_tz, bool 
*reparse) +int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; - struct smb311_posix_qinfo *smb2_data; + struct kvec err_iov[3] = {}; + int err_buftype[3] = {}; + __u8 *sidsbuf = NULL; + __u8 *sidsbuf_end = NULL; + size_t sidsbuflen = 0; + size_t owner_len, group_len; *adjust_tz = false; *reparse = false; - /* BB TODO: Make struct larger when add support for parsing owner SIDs */ - smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo), - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - /* * BB TODO: Add support for using the cached root handle. * Create SMB2_query_posix_info worker function to do non-compounded query @@ -585,29 +610,53 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, + &sidsbuf, &sidsbuflen, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ + if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && + ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && + ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); + if (rc) + goto out; + } *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE, data, + SMB2_OP_POSIX_QUERY_INFO, cfile, + &sidsbuf, &sidsbuflen, NULL, NULL); } - if (rc) - goto out; - /* TODO: will need to allow for the 2 SIDs when add support for getting owner UID/GID */ - memcpy(data, smb2_data, sizeof(struct smb311_posix_qinfo)); + if (rc == 0) { + sidsbuf_end = sidsbuf + sidsbuflen; + + owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); + if (owner_len == -1) { + rc = -EINVAL; + goto out; + } + memcpy(owner, sidsbuf, owner_len); + + group_len = posix_info_sid_size( + sidsbuf + owner_len, sidsbuf_end); + if (group_len == -1) { + rc = -EINVAL; + goto out; + } + memcpy(group, sidsbuf + owner_len, group_len); + } out: - kfree(smb2_data); + kfree(sidsbuf); + free_rsp_buf(err_buftype[0], err_iov[0].iov_base); + free_rsp_buf(err_buftype[1], err_iov[1].iov_base); + free_rsp_buf(err_buftype[2], err_iov[2].iov_base); return rc; } @@ -619,7 +668,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, - NULL); + NULL, NULL, NULL, NULL, NULL); } void @@ -641,7 +690,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, tmprc = smb2_compound_op(xid, 
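/*
 * Sketch of the extended buffer consumed above, as an assumption drawn
 * from the parsing code rather than from protocol documentation: the
 * server appends the owner and group SIDs back to back after the fixed
 * smb311_posix_qinfo payload, each entry sized by posix_info_sid_size(),
 * which returns -1 for a malformed SID:
 *
 *	sidsbuf                                      sidsbuf_end
 *	|<----- owner_len ----->|<----- group_len ----->|
 *	[ owner SID            ][ group SID             ]
 *
 * Either -1 result is mapped to -EINVAL before anything is copied out.
 */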
tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile); + &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -650,9 +699,10 @@ int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { + drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL); + NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL); } int @@ -661,7 +711,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL); } static int @@ -680,7 +730,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile); + command, cfile, NULL, NULL, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -693,6 +743,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, { struct cifsFileInfo *cfile; + drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb); cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile); return smb2_set_path_attr(xid, tcon, from_name, to_name, @@ -720,7 +771,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile); + &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL); } int @@ -746,7 +797,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, - 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile); + 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, + NULL, NULL, NULL, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index d73e5672aac4..572293c18e16 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -135,6 +135,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, int smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) { + struct TCP_Server_Info *pserver; struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)shdr; int hdr_size = sizeof(struct smb2_hdr); @@ -143,6 +144,9 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) __u32 calc_len; /* calculated length */ __u64 mid; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? 
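/*
 * The lookup idiom this series repeats across smb2misc.c, smb2ops.c and
 * smb2transport.c: with multichannel, sessions are anchored on the
 * primary connection, so any walk keyed by SessionId or tid first hops
 * from a channel to its owner. Condensed sketch:
 *
 *	struct TCP_Server_Info *pserver =
 *		CIFS_SERVER_IS_CHAN(server) ? server->primary_server
 *					    : server;
 *
 *	spin_lock(&cifs_tcp_ses_lock);
 *	list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
 *		// match on ses->Suid, tcon->tid, ...
 *	}
 *	spin_unlock(&cifs_tcp_ses_lock);
 */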
server->primary_server : server; + /* * Add function to do table lookup of StructureSize by command * ie Validate the wct via smb2_struct_sizes table above @@ -155,7 +159,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(iter, &pserver->smb_ses_list, smb_ses_list) { if (iter->Suid == le64_to_cpu(thdr->SessionId)) { ses = iter; break; @@ -248,7 +252,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. */ - if (((calc_len + 7) & ~7) == len) + if (ALIGN(calc_len, 8) == len) return 0; /* @@ -608,51 +612,52 @@ smb2_tcon_find_pending_open_lease(struct cifs_tcon *tcon, } static bool -smb2_is_valid_lease_break(char *buffer) +smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) { struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; - struct TCP_Server_Info *server; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifs_pending_open *open; cifs_dbg(FYI, "Checking for lease break\n"); + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { - spin_lock(&tcon->open_file_lock); - cifs_stats_inc( - &tcon->stats.cifs_stats.num_oplock_brks); - if (smb2_tcon_has_lease(tcon, rsp)) { - spin_unlock(&tcon->open_file_lock); - spin_unlock(&cifs_tcp_ses_lock); - return true; - } - open = smb2_tcon_find_pending_open_lease(tcon, - rsp); - if (open) { - __u8 lease_key[SMB2_LEASE_KEY_SIZE]; - struct tcon_link *tlink; - - tlink = cifs_get_tlink(open->tlink); - memcpy(lease_key, open->lease_key, - SMB2_LEASE_KEY_SIZE); - spin_unlock(&tcon->open_file_lock); - spin_unlock(&cifs_tcp_ses_lock); - smb2_queue_pending_open_break(tlink, - lease_key, - rsp->NewLeaseState); - return true; - } + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + spin_lock(&tcon->open_file_lock); + cifs_stats_inc( + &tcon->stats.cifs_stats.num_oplock_brks); + if (smb2_tcon_has_lease(tcon, rsp)) { spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + open = smb2_tcon_find_pending_open_lease(tcon, + rsp); + if (open) { + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; + struct tcon_link *tlink; + + tlink = cifs_get_tlink(open->tlink); + memcpy(lease_key, open->lease_key, + SMB2_LEASE_KEY_SIZE); + spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_tcp_ses_lock); + smb2_queue_pending_open_break(tlink, + lease_key, + rsp->NewLeaseState); + return true; + } + spin_unlock(&tcon->open_file_lock); - if (cached_dir_lease_break(tcon, rsp->LeaseKey)) { - spin_unlock(&cifs_tcp_ses_lock); - return true; - } + if (cached_dir_lease_break(tcon, rsp->LeaseKey)) { + spin_unlock(&cifs_tcp_ses_lock); + return true; } } } @@ -671,6 +676,7 @@ bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; + struct TCP_Server_Info *pserver; struct cifs_ses 
*ses; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode; @@ -684,16 +690,19 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) - return smb2_is_valid_lease_break(buffer); + return smb2_is_valid_lease_break(buffer, server); else return false; } cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel); + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); @@ -870,8 +879,8 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, struct kvec *iov, int nvec) { int i, rc; - struct sdesc *d; struct smb2_hdr *hdr; + struct shash_desc *sha512 = NULL; hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ @@ -901,14 +910,14 @@ ok: if (rc) return rc; - d = server->secmech.sdescsha512; - rc = crypto_shash_init(&d->shash); + sha512 = server->secmech.sha512; + rc = crypto_shash_init(sha512); if (rc) { cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); return rc; } - rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash, + rc = crypto_shash_update(sha512, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); @@ -916,8 +925,7 @@ ok: } for (i = 0; i < nvec; i++) { - rc = crypto_shash_update(&d->shash, - iov[i].iov_base, iov[i].iov_len); + rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); @@ -925,7 +933,7 @@ ok: } } - rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash); + rc = crypto_shash_final(sha512, ses->preauth_sha_hash); if (rc) { cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", __func__); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 421be43af425..d33b00ac37de 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -512,8 +512,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, - size_t buf_len, - struct cifs_ses *ses) + size_t buf_len, struct cifs_ses *ses, bool in_mount) { struct network_interface_info_ioctl_rsp *p; struct sockaddr_in *addr4; @@ -531,6 +530,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, p = buf; spin_lock(&ses->iface_lock); + ses->iface_count = 0; /* * Go through iface_list and do kref_put to remove * any unused ifaces. ifaces in use will be removed @@ -543,6 +543,21 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, } spin_unlock(&ses->iface_lock); + /* + * Samba server e.g. 
can return an empty interface list in some cases, + * which would only be a problem if we were requesting multichannel + */ + if (bytes_left == 0) { + /* avoid spamming logs every 10 minutes, so log only in mount */ + if ((ses->chan_max > 1) && in_mount) + cifs_dbg(VFS, + "multichannel not available\n" + "Empty network interface list returned by server %s\n", + ses->server->hostname); + rc = -EINVAL; + goto out; + } + while (bytes_left >= sizeof(*p)) { memset(&tmp_iface, 0, sizeof(tmp_iface)); tmp_iface.speed = le64_to_cpu(p->LinkSpeed); @@ -637,9 +652,9 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, kref_put(&iface->refcount, release_iface); } else list_add_tail(&info->iface_head, &ses->iface_list); - spin_unlock(&ses->iface_lock); ses->iface_count++; + spin_unlock(&ses->iface_lock); ses->iface_last_update = jiffies; next_iface: nb_iface++; @@ -673,7 +688,7 @@ out: } int -SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount) { int rc; unsigned int ret_data_len = 0; @@ -693,7 +708,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) goto out; } - rc = parse_server_interfaces(out_buf, ret_data_len, ses); + rc = parse_server_interfaces(out_buf, ret_data_len, ses, in_mount); if (rc) goto out; @@ -729,7 +744,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return; - SMB3_request_interfaces(xid, tcon); + SMB3_request_interfaces(xid, tcon, true /* called during mount */); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_ATTRIBUTE_INFORMATION); @@ -787,7 +802,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid); if (!rc) { - if (cfid->is_valid) { + if (cfid->has_lease) { close_cached_dir(cfid); return 0; } @@ -817,33 +832,25 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - u64 *uniqueid, FILE_ALL_INFO *data) +static int smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, struct cifs_open_info_data *data) { - *uniqueid = le64_to_cpu(data->IndexNumber); + *uniqueid = le64_to_cpu(data->fi.IndexNumber); return 0; } -static int -smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *fid, FILE_ALL_INFO *data) +static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data) { - int rc; - struct smb2_file_all_info *smb2_data; + struct cifs_fid *fid = &cfile->fid; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - - rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, - smb2_data); - if (!rc) - move_smb2_info_to_cifs(data, smb2_data); - kfree(smb2_data); - return rc; + if (cfile->symlink_target) { + data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!data->symlink_target) + return -ENOMEM; + } + return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi); } #ifdef CONFIG_CIFS_XATTR @@ -1109,6 +1116,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, 
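/*
 * Two fixes ride along in parse_server_interfaces() above: an empty
 * interface list is now an explicit -EINVAL (warned about only at mount
 * time when multichannel was requested, to avoid log spam from the
 * periodic refresh), and the ses->iface_count++ moved inside the
 * iface_lock section that edits iface_list, so a reader never sees the
 * count disagree with the list:
 *
 *	spin_lock(&ses->iface_lock);
 *	list_add_tail(&info->iface_head, &ses->iface_list);
 *	ses->iface_count++;
 *	spin_unlock(&ses->iface_lock);
 */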
FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto sea_exit; smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); @@ -1119,6 +1128,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); + if (rc) + goto sea_exit; smb2_set_related(&rqst[2]); rc = compound_send_recv(xid, ses, server, @@ -1327,7 +1338,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); if (rc == -EOPNOTSUPP) { - pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); + pr_warn_once("Server share %s does not support copy range\n", tcon->tree_name); goto req_res_key_exit; } else if (rc) { cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); @@ -2012,9 +2023,10 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, static int smb3_notify(const unsigned int xid, struct file *pfile, - void __user *ioc_buf) + void __user *ioc_buf, bool return_changes) { - struct smb3_notify notify; + struct smb3_notify_info notify; + struct smb3_notify_info __user *pnotify_buf; struct dentry *dentry = pfile->f_path.dentry; struct inode *inode = file_inode(pfile); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -2022,10 +2034,12 @@ smb3_notify(const unsigned int xid, struct file *pfile, struct cifs_fid fid; struct cifs_tcon *tcon; const unsigned char *path; + char *returned_ioctl_info = NULL; void *page = alloc_dentry_path(); __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; int rc = 0; + __u32 ret_len = 0; path = build_path_from_dentry(dentry, page); if (IS_ERR(path)) { @@ -2039,9 +2053,17 @@ smb3_notify(const unsigned int xid, struct file *pfile, goto notify_exit; } - if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) { - rc = -EFAULT; - goto notify_exit; + if (return_changes) { + if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify_info))) { + rc = -EFAULT; + goto notify_exit; + } + } else { + if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) { + rc = -EFAULT; + goto notify_exit; + } + notify.data_len = 0; } tcon = cifs_sb_master_tcon(cifs_sb); @@ -2058,12 +2080,22 @@ smb3_notify(const unsigned int xid, struct file *pfile, goto notify_exit; rc = SMB2_change_notify(xid, tcon, fid.persistent_fid, fid.volatile_fid, - notify.watch_tree, notify.completion_filter); + notify.watch_tree, notify.completion_filter, + notify.data_len, &returned_ioctl_info, &ret_len); SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc); - + if (return_changes && (ret_len > 0) && (notify.data_len > 0)) { + if (ret_len > notify.data_len) + ret_len = notify.data_len; + pnotify_buf = (struct smb3_notify_info __user *)ioc_buf; + if (copy_to_user(pnotify_buf->notify_data, returned_ioctl_info, ret_len)) + rc = -EFAULT; + else if (copy_to_user(&pnotify_buf->data_len, &ret_len, sizeof(ret_len))) + rc = -EFAULT; + } + kfree(returned_ioctl_info); notify_exit: free_dentry_path(page); kfree(utf16_path); @@ -2274,14 +2306,18 @@ static void smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; if (shdr->Status != STATUS_NETWORK_NAME_DELETED) return; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server)
? server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { spin_lock(&tcon->tc_lock); @@ -2289,7 +2325,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", - tcon->treeName); + tcon->tree_name); return; } } @@ -2498,7 +2534,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); } goto qic_exit; } @@ -2814,9 +2850,6 @@ parse_reparse_point(struct reparse_data_buffer *buf, } } -#define SMB2_SYMLINK_STRUCT_SIZE \ - (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) - static int smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -2828,13 +2861,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct kvec err_iov = {NULL, 0}; - struct smb2_err_rsp *err_buf = NULL; - struct smb2_symlink_err_rsp *symlink; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - unsigned int sub_len; - unsigned int sub_offset; - unsigned int print_len; - unsigned int print_offset; int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -2951,47 +2978,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto querty_exit; } - err_buf = err_iov.iov_base; - if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) { - rc = -EINVAL; - goto querty_exit; - } - - symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; - if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG || - le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) { - rc = -EINVAL; - goto querty_exit; - } - - /* open must fail on symlink - reset rc */ - rc = 0; - sub_len = le16_to_cpu(symlink->SubstituteNameLength); - sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); - print_len = le16_to_cpu(symlink->PrintNameLength); - print_offset = le16_to_cpu(symlink->PrintNameOffset); - - if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { - rc = -EINVAL; - goto querty_exit; - } - - if (err_iov.iov_len < - SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { - rc = -EINVAL; - goto querty_exit; - } - - *target_path = cifs_strndup_from_utf16( - (char *)symlink->PathBuffer + sub_offset, - sub_len, true, cifs_sb->local_nls); - if (!(*target_path)) { - rc = -ENOMEM; - goto querty_exit; - } - convert_delimiter(*target_path, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + rc = smb2_parse_symlink_response(cifs_sb, &err_iov, target_path); querty_exit: cifs_dbg(FYI, "query symlink rc %d\n", rc); @@ -4217,89 +4204,104 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } -/* We can not use the normal sg_set_buf() as we will sometimes pass a - * stack object as buf. 
- */ -static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) +static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl, + unsigned int *num_sgs) { - void *addr; - /* - * VMAP_STACK (at least) puts stack into the vmalloc address space - */ - if (is_vmalloc_addr(buf)) - addr = vmalloc_to_page(buf); - else - addr = virt_to_page(buf); - sg_set_page(sg, addr, buflen, offset_in_page(buf)); + unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + unsigned int iv_size = crypto_aead_ivsize(tfm); + unsigned int len; + u8 *p; + + *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig); + + len = iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += *num_sgs * sizeof(**sgl); + + p = kmalloc(len, GFP_ATOMIC); + if (!p) + return NULL; + + *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + return p; } -/* Assumes the first rqst has a transform header as the first iov. - * I.e. - * rqst[0].rq_iov[0] is transform header - * rqst[0].rq_iov[1+] data to be encrypted/decrypted - * rqst[1+].rq_iov[0+] data to be encrypted/decrypted - */ -static struct scatterlist * -init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign) +static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl) { - unsigned int sg_len; + unsigned int off, len, skip; struct scatterlist *sg; - unsigned int i; - unsigned int j; - unsigned int idx = 0; - int skip; - - sg_len = 1; - for (i = 0; i < num_rqst; i++) - sg_len += rqst[i].rq_nvec + rqst[i].rq_npages; + unsigned int num_sgs; + unsigned long addr; + int i, j; + void *p; - sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs); + if (!p) return NULL; - sg_init_table(sg, sg_len); + sg_init_table(*sgl, num_sgs); + sg = *sgl; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ for (j = 0; j < rqst[i].rq_nvec; j++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob - */ - skip = (i == 0) && (j == 0) ? 20 : 0; - smb2_sg_set_buf(&sg[idx++], - rqst[i].rq_iov[j].iov_base + skip, - rqst[i].rq_iov[j].iov_len - skip); - } + struct kvec *iov = &rqst[i].rq_iov[j]; + skip = (i == 0) && (j == 0) ? 
20 : 0; + addr = (unsigned long)iov->iov_base + skip; + len = iov->iov_len - skip; + sg = cifs_sg_set_buf(sg, (void *)addr, len); + } for (j = 0; j < rqst[i].rq_npages; j++) { - unsigned int len, offset; - - rqst_page_get_length(&rqst[i], j, &len, &offset); - sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset); + rqst_page_get_length(&rqst[i], j, &len, &off); + sg_set_page(sg++, rqst[i].rq_pages[j], len, off); } } - smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE); - return sg; + cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE); + + return p; } static int smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) { + struct TCP_Server_Info *pserver; struct cifs_ses *ses; u8 *ses_enc_key; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->Suid == ses_id) { - spin_lock(&ses->ses_lock); - ses_enc_key = enc ? ses->smb3encryptionkey : - ses->smb3decryptionkey; - memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); - spin_unlock(&ses->ses_lock); - spin_unlock(&cifs_tcp_ses_lock); - return 0; - } + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) { + spin_lock(&ses->ses_lock); + ses_enc_key = enc ? ses->smb3encryptionkey : + ses->smb3decryptionkey; + memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); + spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); + return 0; } } spin_unlock(&cifs_tcp_ses_lock); @@ -4325,11 +4327,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, u8 sign[SMB2_SIGNATURE_SIZE] = {}; u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; - char *iv; - unsigned int iv_len; + u8 *iv; DECLARE_CRYPTO_WAIT(wait); struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + void *creq; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4344,8 +4346,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - tfm = enc ? server->secmech.ccmaesencrypt : - server->secmech.ccmaesdecrypt; + tfm = enc ? 
server->secmech.enc : server->secmech.dec; if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) @@ -4364,32 +4365,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - req = aead_request_alloc(tfm, GFP_KERNEL); - if (!req) { - cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__); + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); + if (unlikely(!creq)) return -ENOMEM; - } if (!enc) { memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); crypt_len += SMB2_SIGNATURE_SIZE; } - sg = init_sg(num_rqst, rqst, sign); - if (!sg) { - cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__); - rc = -ENOMEM; - goto free_req; - } - - iv_len = crypto_aead_ivsize(tfm); - iv = kzalloc(iv_len, GFP_KERNEL); - if (!iv) { - cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__); - rc = -ENOMEM; - goto free_sg; - } - if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); @@ -4398,6 +4382,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); } + aead_request_set_tfm(req, tfm); aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); @@ -4410,11 +4395,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree(iv); -free_sg: - kfree(sg); -free_req: - kfree(req); + kfree_sensitive(creq); return rc; } @@ -4457,21 +4438,27 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, int rc = -ENOMEM; for (i = 1; i < num_rqst; i++) { - npages = old_rq[i - 1].rq_npages; + struct smb_rqst *old = &old_rq[i - 1]; + struct smb_rqst *new = &new_rq[i]; + + orig_len += smb_rqst_len(server, old); + new->rq_iov = old->rq_iov; + new->rq_nvec = old->rq_nvec; + + npages = old->rq_npages; + if (!npages) + continue; + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto err_free; - new_rq[i].rq_pages = pages; - new_rq[i].rq_npages = npages; - new_rq[i].rq_offset = old_rq[i - 1].rq_offset; - new_rq[i].rq_pagesz = old_rq[i - 1].rq_pagesz; - new_rq[i].rq_tailsz = old_rq[i - 1].rq_tailsz; - new_rq[i].rq_iov = old_rq[i - 1].rq_iov; - new_rq[i].rq_nvec = old_rq[i - 1].rq_nvec; - - orig_len += smb_rqst_len(server, &old_rq[i - 1]); + new->rq_pages = pages; + new->rq_npages = npages; + new->rq_offset = old->rq_offset; + new->rq_pagesz = old->rq_pagesz; + new->rq_tailsz = old->rq_tailsz; for (j = 0; j < npages; j++) { pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); @@ -4484,14 +4471,14 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, char *dst, *src; unsigned int offset, len; - rqst_page_get_length(&new_rq[i], j, &len, &offset); + rqst_page_get_length(new, j, &len, &offset); - dst = (char *) kmap(new_rq[i].rq_pages[j]) + offset; - src = (char *) kmap(old_rq[i - 1].rq_pages[j]) + offset; + dst = kmap_local_page(new->rq_pages[j]) + offset; + src = kmap_local_page(old->rq_pages[j]) + offset; memcpy(dst, src, len); - kunmap(new_rq[i].rq_pages[j]); - kunmap(old_rq[i - 1].rq_pages[j]); + kunmap(new->rq_pages[j]); + kunmap(old->rq_pages[j]); } } @@ -4735,13 +4722,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - iov_iter_bvec(&iter, WRITE, bvec, npages, data_len); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, 
data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); iov.iov_base = buf + data_offset; iov.iov_len = data_len; - iov_iter_kvec(&iter, WRITE, &iov, 1, data_len); + iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len); } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); @@ -5102,7 +5089,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; - FILE_ALL_INFO *buf = NULL; + struct cifs_open_info_data buf = {}; struct cifs_io_parms io_parms = {0}; __u32 oplock = 0; struct cifs_fid fid; @@ -5118,7 +5105,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, * and was used by default in earlier versions of Windows */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto out; + return rc; /* * TODO: Add ability to create instead via reparse point. Windows (e.g. @@ -5127,16 +5114,10 @@ smb2_make_node(unsigned int xid, struct inode *inode, */ if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto out; + return rc; cifs_dbg(FYI, "sfu compat create special file\n"); - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; @@ -5151,21 +5132,21 @@ smb2_make_node(unsigned int xid, struct inode *inode, oplock = REQ_OPLOCK; else oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); if (rc) - goto out; + return rc; /* * BB Do not bother to decode buf since no local inode yet to put * timestamps in, but we can reuse it safely. 
*/ - pdev = (struct win_dev *)buf; + pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; + iov[1].iov_base = &buf.fi; iov[1].iov_len = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); @@ -5184,8 +5165,8 @@ smb2_make_node(unsigned int xid, struct inode *inode, d_drop(dentry); /* FIXME: add code here to set EAs */ -out: - kfree(buf); + + cifs_free_open_info(&buf); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6352ab32c7e7..a5695748a89b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt) /* * Context Data length must be rounded to multiple of 8 for some servers */ - pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP( - sizeof(struct smb2_signing_capabilities) - - sizeof(struct smb2_neg_context) + - (num_algs * 2 /* sizeof u16 */), 8) * 8); + pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) - + sizeof(struct smb2_neg_context) + + (num_algs * sizeof(u16)), 8)); pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs); pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC); - ctxt_len += 2 /* sizeof le16 */ * num_algs; - ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8; + ctxt_len += sizeof(__le16) * num_algs; + ctxt_len = ALIGN(ctxt_len, 8); return ctxt_len; /* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */ } @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname) /* copy up to max of first 100 bytes of server name to NetName field */ pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp)); /* context size is DataLength + minimal smb2_neg_context */ - return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) + - sizeof(struct smb2_neg_context), 8) * 8; + return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8); } static void @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, * round up total_len of fixed part of SMB3 negotiate request to 8 * byte boundary before adding negotiate contexts */ - *total_len = roundup(*total_len, 8); + *total_len = ALIGN(*total_len, 8); pneg_ctxt = (*total_len) + (char *)req; req->NegotiateContextOffset = cpu_to_le32(*total_len); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, if (server->compress_algorithm) { build_compression_ctxt((struct smb2_compression_capabilities_context *) pneg_ctxt); - ctxt_len = DIV_ROUND_UP( - sizeof(struct smb2_compression_capabilities_context), - 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; neg_context_count++; @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, if (rc) break; /* offsets must be 8 byte aligned */ - clen = (clen + 7) & ~0x7; + 
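/*
 * All of the open-coded roundings in this file collapse into ALIGN(),
 * which is identical for a power-of-two boundary; with an alignment of 8:
 *
 *	ALIGN(x, 8) == ((x + 7) & ~7) == DIV_ROUND_UP(x, 8) * 8
 *
 * e.g. x = 13 -> 16, x = 16 -> 16, x = 0 -> 0.
 */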
clen = ALIGN(clen, 8); offset += clen + sizeof(struct smb2_neg_context); len_of_ctxts -= clen; } @@ -873,7 +869,7 @@ SMB2_negotiate(const unsigned int xid, struct smb2_negotiate_rsp *rsp; struct kvec iov[1]; struct kvec rsp_iov; - int rc = 0; + int rc; int resp_buftype; int blob_offset, blob_length; char *security_blob; @@ -1169,9 +1165,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Dialects[0] = cpu_to_le16(server->vals->protocol_id); pneg_inbuf->DialectCount = cpu_to_le16(1); - /* structure is big enough for 3 dialects, sending only 1 */ + /* structure is big enough for 4 dialects, sending only 1 */ inbuflen = sizeof(*pneg_inbuf) - - sizeof(pneg_inbuf->Dialects[0]) * 2; + sizeof(pneg_inbuf->Dialects[0]) * 3; } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, @@ -1345,7 +1341,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) static void SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) { - free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + struct kvec *iov = sess_data->iov; + + /* iov[1] is already freed by caller */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base) + memzero_explicit(iov[0].iov_base, iov[0].iov_len); + + free_rsp_buf(sess_data->buf0_type, iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; } @@ -1477,6 +1479,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) out_put_spnego_key: key_invalidate(spnego_key); key_put(spnego_key); + if (rc) + kfree_sensitive(ses->auth_key.response); out: sess_data->result = rc; sess_data->func = NULL; @@ -1526,7 +1530,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) &blob_length, ses, server, sess_data->nls_cp); if (rc) - goto out_err; + goto out; if (use_spnego) { /* BB eventually need to add this */ @@ -1573,7 +1577,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) } out: - kfree(ntlmssp_blob); + kfree_sensitive(ntlmssp_blob); SMB2_sess_free_buffer(sess_data); if (!rc) { sess_data->result = 0; @@ -1581,7 +1585,7 @@ out: return; } out_err: - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->result = rc; sess_data->func = NULL; @@ -1657,9 +1661,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) } #endif out: - kfree(ntlmssp_blob); + kfree_sensitive(ntlmssp_blob); SMB2_sess_free_buffer(sess_data); - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->result = rc; sess_data->func = NULL; @@ -1737,7 +1741,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, cifs_server_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: - kfree(sess_data); + kfree_sensitive(sess_data); return rc; } @@ -1930,7 +1934,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); - strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) @@ -1973,6 +1977,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (!ses || !(ses->server)) return -EIO; + trace_smb3_tdis_enter(xid, tcon->tid, ses->Suid, tcon->tree_name); spin_lock(&ses->chan_lock); if ((tcon->need_reconnect) || 
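/*
 * The session-setup cleanups above follow one rule: any buffer that may
 * have held key material is wiped before it is freed. kfree_sensitive()
 * bundles both steps:
 *
 *	kfree_sensitive(ntlmssp_blob);	// memzero_explicit + kfree
 *	kfree_sensitive(ses->ntlmssp);
 *
 * and SMB2_sess_free_buffer() gains an explicit
 * memzero_explicit(iov[0].iov_base, iov[0].iov_len) for the response
 * buffer it has to release through free_rsp_buf() instead.
 */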
(CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) { @@ -2004,8 +2009,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) rc = cifs_send_recv(xid, ses, ses->server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc) + if (rc) { cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); + trace_smb3_tdis_err(xid, tcon->tid, ses->Suid, rc); + } + trace_smb3_tdis_done(xid, tcon->tid, ses->Suid); return rc; } @@ -2411,9 +2419,9 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) unsigned int acelen, acl_size, ace_count; unsigned int owner_offset = 0; unsigned int group_offset = 0; - struct smb3_acl acl; + struct smb3_acl acl = {}; - *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); + *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); if (set_owner) { /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */ @@ -2484,10 +2492,11 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ acl.AclSize = cpu_to_le16(acl_size); acl.AceCount = cpu_to_le16(ace_count); + /* acl.Sbz1 and Sbz2 MBZ so are not set here, but initialized above */ memcpy(aclptr, &acl, sizeof(struct smb3_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); - *len = roundup(ptr - (__u8 *)buf, 8); + *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8); return buf; } @@ -2581,7 +2590,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, * final path needs to be 8-byte aligned as specified in * MS-SMB2 2.2.13 SMB2 CREATE Request. */ - *out_size = roundup(*out_len * sizeof(__le16), 8); + *out_size = round_up(*out_len * sizeof(__le16), 8); *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL); if (!*out_path) return -ENOMEM; @@ -2674,7 +2683,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(&copy_path, &copy_size, &name_len, - tcon->treeName, utf16_path); + tcon->tree_name, utf16_path); if (rc) goto err_free_req; @@ -2816,7 +2825,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(&copy_path, &copy_size, &name_len, - tcon->treeName, path); + tcon->tree_name, path); if (rc) return rc; req->NameLength = cpu_to_le16(name_len * 2); @@ -2826,9 +2835,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; /* MUST set path len (NameLength) to 0 opening root of share */ req->NameLength = cpu_to_le16(uni_path_len - 2); - copy_size = uni_path_len; - if (copy_size % 8 != 0) - copy_size = roundup(copy_size, 8); + copy_size = round_up(uni_path_len, 8); copy_path = kzalloc(copy_size, GFP_KERNEL); if (!copy_path) return -ENOMEM; @@ -3011,7 +3018,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, oparms->create_options, oparms->desired_access, rc); if (rc == -EREMCHG) { pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); tcon->need_reconnect = true; } goto creat_exit; @@ -3472,7 +3479,7 @@ smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, if (rc) return rc; - memcpy(data, begin_of_buf, buffer_length); + memcpy(data, begin_of_buf, minbufsize); return 0; } @@ -3596,7 +3603,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, rc =
smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp_iov, min_len, *data); + &rsp_iov, dlen ? *dlen : min_len, *data); if (rc && allocated) { kfree(*data); *data = NULL; @@ -3702,11 +3709,13 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, - u32 completion_filter) + u32 completion_filter, u32 max_out_data_len, char **out_data, + u32 *plen /* returned data len */) { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); struct smb_rqst rqst; + struct smb2_change_notify_rsp *smb_rsp; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; @@ -3722,6 +3731,9 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, memset(&rqst, 0, sizeof(struct smb_rqst)); memset(&iov, 0, sizeof(iov)); + if (plen) + *plen = 0; + rqst.rq_iov = iov; rqst.rq_nvec = 1; @@ -3740,9 +3752,28 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE); trace_smb3_notify_err(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter, rc); - } else + } else { trace_smb3_notify_done(xid, persistent_fid, tcon->tid, - ses->Suid, (u8)watch_tree, completion_filter); + ses->Suid, (u8)watch_tree, completion_filter); + /* validate that notify information is plausible */ + if ((rsp_iov.iov_base == NULL) || + (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp))) + goto cnotify_exit; + + smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; + + smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov, + sizeof(struct file_notify_information)); + + *out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL); + if (*out_data == NULL) { + rc = -ENOMEM; + goto cnotify_exit; + } else + *plen = le32_to_cpu(smb_rsp->OutputBufferLength); + } cnotify_exit: if (rqst.rq_iov) @@ -4090,7 +4121,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (request_type & CHAINED_REQUEST) { if (!(request_type & END_OF_CHAIN)) { /* next 8-byte aligned request */ - *total_len = DIV_ROUND_UP(*total_len, 8) * 8; + *total_len = ALIGN(*total_len, 8); shdr->NextCommand = cpu_to_le32(*total_len); } else /* END_OF_CHAIN */ shdr->NextCommand = 0; @@ -4429,7 +4460,7 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->bytes, wdata->result); if (wdata->result == -ENOSPC) pr_warn_once("Out of space writing to %s\n", - tcon->treeName); + tcon->tree_name); } else trace_smb3_write_done(0 /* no xid */, wdata->cfile->fid.persistent_fid, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f57881b8464f..1237bb86e93a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -56,6 +56,9 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL +#define SMB2_SYMLINK_STRUCT_SIZE \ + (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + #define SYMLINK_ERROR_TAG 0x4c4d5953 struct smb2_symlink_err_rsp { diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 3f740f24b96a..d5d7ffb7711c 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -53,16 +53,12 @@ extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct 
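/*
 * Caller-side sketch of the new SMB2_change_notify() out parameters
 * (smb3_notify() earlier in this series is the one real user): pass a
 * byte budget in, receive a kmemdup()ed FILE_NOTIFY_INFORMATION blob and
 * its length back, and free it when done:
 *
 *	char *out_data = NULL;
 *	u32 out_len = 0;
 *
 *	rc = SMB2_change_notify(xid, tcon, pfid, vfid, watch_tree,
 *				filter, max_len, &out_data, &out_len);
 *	if (!rc && out_len)
 *		consume_notifications(out_data, out_len); // hypothetical
 *	kfree(out_data);
 */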
mid_q_entry *mid); - -extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, - struct smb2_file_all_info *src); extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); -extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, FILE_ALL_INFO *data, - bool *adjust_tz, bool *symlink); +int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc); @@ -95,9 +91,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); -extern int smb2_open_file(const unsigned int xid, - struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf); +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path); +int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf); extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); @@ -148,7 +144,8 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, - u32 completion_filter); + u32 completion_filter, u32 max_out_data_len, + char **out_data, u32 *plen /* returned data len */); extern int __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, @@ -278,9 +275,12 @@ extern int smb2_query_info_compound(const unsigned int xid, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); /* query path info from the server using SMB311 POSIX extensions*/ -extern int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *sb, const char *path, struct smb311_posix_qinfo *qinf, - bool *adjust_tx, bool *symlink); +int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + bool *adjust_tz, bool *reparse); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 1a5fc3314dbf..381babc1212c 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -32,19 +32,17 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) struct cifs_secmech *p = &server->secmech; int rc; - rc = cifs_alloc_hash("hmac(sha256)", - &p->hmacsha256, - &p->sdeschmacsha256); + rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); if (rc) goto err; - rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); if (rc) goto err; return 0; err: - cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + cifs_free_hash(&p->hmacsha256); return rc; } @@ -54,25 +52,23 @@ 
smb311_crypto_shash_allocate(struct TCP_Server_Info *server) struct cifs_secmech *p = &server->secmech; int rc = 0; - rc = cifs_alloc_hash("hmac(sha256)", - &p->hmacsha256, - &p->sdeschmacsha256); + rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); if (rc) return rc; - rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); if (rc) goto err; - rc = cifs_alloc_hash("sha512", &p->sha512, &p->sdescsha512); + rc = cifs_alloc_hash("sha512", &p->sha512); if (rc) goto err; return 0; err: - cifs_free_hash(&p->cmacaes, &p->sdesccmacaes); - cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + cifs_free_hash(&p->aes_cmac); + cifs_free_hash(&p->hmacsha256); return rc; } @@ -81,18 +77,19 @@ static int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) { struct cifs_chan *chan; + struct TCP_Server_Info *pserver; struct cifs_ses *ses = NULL; - struct TCP_Server_Info *it = NULL; int i; int rc = 0; spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(it, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &it->smb_ses_list, smb_ses_list) { - if (ses->Suid == ses_id) - goto found; - } + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) + goto found; } cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n", __func__, ses_id); @@ -140,9 +137,13 @@ out: static struct cifs_ses * smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) { + struct TCP_Server_Info *pserver; struct cifs_ses *ses; - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? 
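/*
 * The conversion in this hunk retires the old sdesc wrapper:
 * cifs_alloc_hash() now hands back a ready-to-use shash_desc, the tfm is
 * reached as desc->tfm for setkey, and cifs_free_hash() takes the same
 * single handle. Usage sketch:
 *
 *	struct shash_desc *sha512 = NULL;
 *
 *	rc = cifs_alloc_hash("sha512", &sha512);
 *	if (rc)
 *		return rc;
 *	rc = crypto_shash_init(sha512);
 *	...
 *	cifs_free_hash(&sha512);
 */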
server->primary_server : server; + + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { if (ses->Suid != ses_id) continue; ++ses->ses_count; @@ -219,34 +220,30 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; - struct shash_desc *shash; - struct crypto_shash *hash; - struct sdesc *sdesc = NULL; + struct shash_desc *shash = NULL; struct smb_rqst drqst; ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); - if (!ses) { + if (unlikely(!ses)) { cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); - return 0; + return -ENOENT; } memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); if (allocate_crypto) { - rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc); + rc = cifs_alloc_hash("hmac(sha256)", &shash); if (rc) { cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__); goto out; } - shash = &sdesc->shash; } else { - hash = server->secmech.hmacsha256; - shash = &server->secmech.sdeschmacsha256->shash; + shash = server->secmech.hmacsha256; } - rc = crypto_shash_setkey(hash, ses->auth_key.response, + rc = crypto_shash_setkey(shash->tfm, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_server_dbg(VFS, @@ -288,7 +285,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) - cifs_free_hash(&hash, &sdesc); + cifs_free_hash(&shash); if (ses) cifs_put_smb_ses(ses); return rc; @@ -315,42 +312,38 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, goto smb3signkey_ret; } - rc = crypto_shash_setkey(server->secmech.hmacsha256, + rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(server->secmech.hmacsha256); if (rc) { cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - i, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, i, 4); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - label.iov_base, label.iov_len); + rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - &zero, 1); + rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - context.iov_base, context.iov_len); + rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -358,19 +351,16 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L256, 4); + rc = 
crypto_shash_update(server->secmech.hmacsha256, L256, 4); } else { - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L128, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4); } if (rc) { cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, - hashptr); + rc = crypto_shash_final(server->secmech.hmacsha256, hashptr); if (rc) { cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); goto smb3signkey_ret; @@ -550,38 +540,35 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; - struct shash_desc *shash; - struct crypto_shash *hash; - struct sdesc *sdesc = NULL; + struct shash_desc *shash = NULL; struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); - if (rc) - return 0; + if (unlikely(rc)) { + cifs_server_dbg(VFS, "%s: Could not get signing key\n", __func__); + return rc; + } if (allocate_crypto) { - rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc); + rc = cifs_alloc_hash("cmac(aes)", &shash); if (rc) return rc; - - shash = &sdesc->shash; } else { - hash = server->secmech.cmacaes; - shash = &server->secmech.sdesccmacaes->shash; + shash = server->secmech.aes_cmac; } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); - rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE); + rc = crypto_shash_setkey(shash->tfm, key, SMB2_CMACAES_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); goto out; } /* - * we already allocate sdesccmacaes when we init smb3 signing key, + * we already allocate aes_cmac when we init smb3 signing key, * so unlike smb2 case we do not have to check here if secmech are * initialized */ @@ -617,7 +604,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) - cifs_free_hash(&hash, &sdesc); + cifs_free_hash(&shash); return rc; } @@ -902,7 +889,7 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) { struct crypto_aead *tfm; - if (!server->secmech.ccmaesencrypt) { + if (!server->secmech.enc) { if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); @@ -913,23 +900,23 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) __func__); return PTR_ERR(tfm); } - server->secmech.ccmaesencrypt = tfm; + server->secmech.enc = tfm; } - if (!server->secmech.ccmaesdecrypt) { + if (!server->secmech.dec) { if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { - crypto_free_aead(server->secmech.ccmaesencrypt); - server->secmech.ccmaesencrypt = NULL; + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; cifs_server_dbg(VFS, "%s: Failed to alloc decrypt aead\n", __func__); return PTR_ERR(tfm); } - server->secmech.ccmaesdecrypt = tfm; + server->secmech.dec = tfm; } return 0; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5fbbec22bcc8..90789aaa6567 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -90,7 +90,7 @@ int smbd_max_send_size = 1364; int smbd_max_fragmented_recv_size = 
1024 * 1024; /* The maximum single-message size which can be received */ -int smbd_max_receive_size = 8192; +int smbd_max_receive_size = 1364; /* The timeout to initiate send of a keepalive message on idle */ int smbd_keep_alive_interval = 120; @@ -99,7 +99,7 @@ int smbd_keep_alive_interval = 120; * User configurable initial values for RDMA transport * The actual values used may be lower and are limited to hardware capabilities */ -/* Default maximum number of SGEs in a RDMA write/read */ +/* Default maximum number of pages in a single RDMA write/read */ int smbd_max_frmr_depth = 2048; /* If payload is less than this byte, use RDMA send/recv not read/write */ @@ -270,7 +270,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_request *request = container_of(wc->wr_cqe, struct smbd_request, cqe); - log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n", + log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n", request, wc->status); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { @@ -448,7 +448,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_connection *info = response->info; int data_length = 0; - log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%x\n", + log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", response, response->type, wc->status, wc->opcode, wc->byte_len, wc->pkey_index); @@ -723,7 +723,7 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n", + log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n", request->sge[0].addr, request->sge[0].length, request->sge[0].lkey); @@ -792,7 +792,7 @@ static int smbd_post_send(struct smbd_connection *info, for (i = 0; i < request->num_sge; i++) { log_rdma_send(INFO, - "rdma_request sge[%d] addr=%llu length=%u\n", + "rdma_request sge[%d] addr=0x%llx length=%u\n", i, request->sge[i].addr, request->sge[i].length); ib_dma_sync_single_for_device( info->id->device, @@ -1017,9 +1017,9 @@ static int smbd_post_send_data( { int i; u32 data_length = 0; - struct scatterlist sgl[SMBDIRECT_MAX_SGE]; + struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1]; - if (n_vec > SMBDIRECT_MAX_SGE) { + if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) { cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); return -EINVAL; } @@ -1079,7 +1079,7 @@ static int smbd_negotiate(struct smbd_connection *info) response->type = SMBD_NEGOTIATE_RESP; rc = smbd_post_recv(info, response); - log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x iov.lkey=%x\n", + log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); if (rc) @@ -1539,7 +1539,7 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_send_credit_target > info->id->device->attrs.max_cqe || smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + log_rdma_event(ERR, "consider lowering send_credit_target = %d. 
Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_send_credit_target, info->id->device->attrs.max_cqe, info->id->device->attrs.max_qp_wr); @@ -1548,7 +1548,7 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_receive_credit_max, info->id->device->attrs.max_cqe, info->id->device->attrs.max_qp_wr); @@ -1562,17 +1562,15 @@ static struct smbd_connection *_smbd_get_connection( info->max_receive_size = smbd_max_receive_size; info->keep_alive_interval = smbd_keep_alive_interval; - if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) { + if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE || + info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) { log_rdma_event(ERR, - "warning: device max_send_sge = %d too small\n", - info->id->device->attrs.max_send_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); - } - if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) { - log_rdma_event(ERR, - "warning: device max_recv_sge = %d too small\n", + "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", + IB_DEVICE_NAME_MAX, + info->id->device->name, + info->id->device->attrs.max_send_sge, info->id->device->attrs.max_recv_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); + goto config_failed; } info->send_cq = NULL; @@ -1598,8 +1596,8 @@ static struct smbd_connection *_smbd_get_connection( qp_attr.qp_context = info; qp_attr.cap.max_send_wr = info->send_credit_target; qp_attr.cap.max_recv_wr = info->receive_credit_max; - qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE; - qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE; + qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE; + qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE; qp_attr.cap.max_inline_data = 0; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; @@ -1986,10 +1984,11 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; - struct kvec vec; + struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1]; int nvecs; int size; unsigned int buflen, remaining_data_length; + unsigned int offset, remaining_vec_data_length; int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); @@ -1998,10 +1997,8 @@ int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst; int rqst_idx; - if (info->transport_status != SMBD_CONNECTED) { - rc = -EAGAIN; - goto done; - } + if (info->transport_status != SMBD_CONNECTED) + return -EAGAIN; /* * Add in the page array if there is one. 
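For readers following the secmech rework in the smb2_calc_signature()/generate_key() hunks above: the old sdesc wrapper is gone, and cifs_alloc_hash() now hands back a ready-to-use struct shash_desc. A self-contained sketch of the same allocate/key/digest/free pattern using only the generic crypto API (hmac_sha256_digest() is our illustrative name, not part of the patch):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>

/*
 * One-shot HMAC-SHA256, managing the descriptor the way the new
 * cifs_alloc_hash()/cifs_free_hash() pair does internally.
 */
static int hmac_sha256_digest(const u8 *key, unsigned int keylen,
			      const u8 *data, unsigned int datalen, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int rc;

	tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* The descriptor must provide descsize() bytes of tfm state. */
	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;

	rc = crypto_shash_setkey(tfm, key, keylen);
	if (!rc)
		rc = crypto_shash_digest(desc, data, datalen, out);

	kfree_sensitive(desc);
	crypto_free_shash(tfm);
	return rc;
}

Keeping the tfm reachable through desc->tfm is what lets the callers above drop the separate "hash" and "sdesc" locals.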
The caller needs to set @@ -2012,125 +2009,95 @@ int smbd_send(struct TCP_Server_Info *server, for (i = 0; i < num_rqst; i++) remaining_data_length += smb_rqst_len(server, &rqst_array[i]); - if (remaining_data_length > info->max_fragmented_send_size) { + if (unlikely(remaining_data_length > info->max_fragmented_send_size)) { + /* assertion: payload never exceeds negotiated maximum */ log_write(ERR, "payload size %d > max size %d\n", remaining_data_length, info->max_fragmented_send_size); - rc = -EINVAL; - goto done; + return -EINVAL; } log_write(INFO, "num_rqst=%d total length=%u\n", num_rqst, remaining_data_length); rqst_idx = 0; -next_rqst: - rqst = &rqst_array[rqst_idx]; - iov = rqst->rq_iov; - - cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", - rqst_idx, smb_rqst_len(server, rqst)); - for (i = 0; i < rqst->rq_nvec; i++) - dump_smb(iov[i].iov_base, iov[i].iov_len); - - - log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); - - start = i = 0; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen-iov[i].iov_len); - log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", - start, i, i - start, - remaining_data_length); - rc = smbd_post_send_data( - info, &iov[start], i-start, - remaining_data_length); - if (rc) - goto done; - } else { - /* iov[start] is too big, break it */ - nvecs = (buflen+max_iov_size-1)/max_iov_size; - log_write(INFO, "iov[%d] iov_base=%p buflen=%d break to %d vectors\n", - start, iov[start].iov_base, - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j*max_iov_size; - vec.iov_len = max_iov_size; - if (j == nvecs-1) - vec.iov_len = - buflen - - max_iov_size*(nvecs-1); - remaining_data_length -= vec.iov_len; - log_write(INFO, - "sending vec j=%d iov_base=%p iov_len=%zu remaining_data_length=%d\n", - j, vec.iov_base, vec.iov_len, - remaining_data_length); - rc = smbd_post_send_data( - info, &vec, 1, - remaining_data_length); - if (rc) - goto done; + do { + rqst = &rqst_array[rqst_idx]; + iov = rqst->rq_iov; + + cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", + rqst_idx, smb_rqst_len(server, rqst)); + remaining_vec_data_length = 0; + for (i = 0; i < rqst->rq_nvec; i++) { + remaining_vec_data_length += iov[i].iov_len; + dump_smb(iov[i].iov_base, iov[i].iov_len); + } + + log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", + rqst_idx, rqst->rq_nvec, + rqst->rq_npages, rqst->rq_pagesz, + rqst->rq_tailsz, smb_rqst_len(server, rqst)); + + start = 0; + offset = 0; + do { + buflen = 0; + i = start; + j = 0; + while (i < rqst->rq_nvec && + j < SMBDIRECT_MAX_SEND_SGE - 1 && + buflen < max_iov_size) { + + vecs[j].iov_base = iov[i].iov_base + offset; + if (buflen + iov[i].iov_len > max_iov_size) { + vecs[j].iov_len = + max_iov_size - iov[i].iov_len; + buflen = max_iov_size; + offset = vecs[j].iov_len; + } else { + vecs[j].iov_len = + iov[i].iov_len - offset; + buflen += vecs[j].iov_len; + offset = 0; + ++i; } - i++; - if (i == rqst->rq_nvec) - break; + ++j; } + + remaining_vec_data_length -= buflen; + remaining_data_length -= buflen; + log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n", + remaining_vec_data_length > 0 ? 
+ "partial" : "complete", + rqst->rq_nvec, start, j, + remaining_data_length); + start = i; - buflen = 0; - } else { - i++; - if (i == rqst->rq_nvec) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", - start, i, i - start, + rc = smbd_post_send_data(info, vecs, j, remaining_data_length); + if (rc) + goto done; + } while (remaining_vec_data_length > 0); + + /* now sending pages if there are any */ + for (i = 0; i < rqst->rq_npages; i++) { + rqst_page_get_length(rqst, i, &buflen, &offset); + nvecs = (buflen + max_iov_size - 1) / max_iov_size; + log_write(INFO, "sending pages buflen=%d nvecs=%d\n", + buflen, nvecs); + for (j = 0; j < nvecs; j++) { + size = min_t(unsigned int, max_iov_size, remaining_data_length); + remaining_data_length -= size; + log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", + i, j * max_iov_size + offset, size, remaining_data_length); - rc = smbd_post_send_data(info, &iov[start], - i-start, remaining_data_length); + rc = smbd_post_send_page( + info, rqst->rq_pages[i], + j*max_iov_size + offset, + size, remaining_data_length); if (rc) goto done; - break; } } - log_write(INFO, "looping i=%d buflen=%d\n", i, buflen); - } - - /* now sending pages if there are any */ - for (i = 0; i < rqst->rq_npages; i++) { - unsigned int offset; - - rqst_page_get_length(rqst, i, &buflen, &offset); - nvecs = (buflen + max_iov_size - 1) / max_iov_size; - log_write(INFO, "sending pages buflen=%d nvecs=%d\n", - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - size = max_iov_size; - if (j == nvecs-1) - size = buflen - j*max_iov_size; - remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", - i, j * max_iov_size + offset, size, - remaining_data_length); - rc = smbd_post_send_page( - info, rqst->rq_pages[i], - j*max_iov_size + offset, - size, remaining_data_length); - if (rc) - goto done; - } - } - - rqst_idx++; - if (rqst_idx < num_rqst) - goto next_rqst; + } while (++rqst_idx < num_rqst); done: /* diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index a87fca82a796..207ef979cd51 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -91,7 +91,7 @@ struct smbd_connection { /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ int responder_resources; - /* Maximum number of SGEs in a RDMA write/read */ + /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; /* * If payload is less than or equal to the threshold, @@ -225,21 +225,25 @@ struct smbd_buffer_descriptor_v1 { __le32 length; } __packed; -/* Default maximum number of SGEs in a RDMA send/recv */ -#define SMBDIRECT_MAX_SGE 16 +/* Maximum number of SGEs used by smbdirect.c in any send work request */ +#define SMBDIRECT_MAX_SEND_SGE 6 + /* The context for a SMBD request */ struct smbd_request { struct smbd_connection *info; struct ib_cqe cqe; - /* the SGE entries for this packet */ - struct ib_sge sge[SMBDIRECT_MAX_SGE]; + /* the SGE entries for this work request */ + struct ib_sge sge[SMBDIRECT_MAX_SEND_SGE]; int num_sge; /* SMBD packet header follows this structure */ u8 packet[]; }; +/* Maximum number of SGEs used by smbdirect.c in any receive work request */ +#define SMBDIRECT_MAX_RECV_SGE 1 + /* The context for a SMBD response */ struct smbd_response { struct smbd_connection *info; diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 
6b88dc2e364f..110070ba8b04 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -372,6 +372,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, @@ -409,6 +410,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, @@ -451,6 +453,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); /* * For logging SMB3 Status code and Command for responses which return errors diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9a2753e21170..3851d0aaa288 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -347,7 +347,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, .iov_base = &rfc1002_marker, .iov_len = 4 }; - iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); + iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) goto unmask; @@ -368,7 +368,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size += iov[i].iov_len; } - iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size); + iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, iov, n_vec, size); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -384,7 +384,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rqst_page_get_length(&rqst[j], i, &bvec.bv_len, &bvec.bv_offset); - iov_iter_bvec(&smb_msg.msg_iter, WRITE, + iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE, &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -753,8 +753,9 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_freezekillable_unsafe(server->response_q, - midQ->mid_state != MID_REQUEST_SUBMITTED); + error = wait_event_state(server->response_q, + midQ->mid_state != MID_REQUEST_SUBMITTED, + (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); if (error < 0) return -ERESTARTSYS; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 998fa51f9b68..5f2fb2fd2e37 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -200,32 +200,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } break; } - -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case XATTR_ACL_ACCESS: -#ifdef CONFIG_CIFS_POSIX - if (!value) - goto out; - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - value, (const int)size, - ACL_TYPE_ACCESS, cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; - - case XATTR_ACL_DEFAULT: -#ifdef CONFIG_CIFS_POSIX - if (!value) - goto out; - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - value, (const int)size, - ACL_TYPE_DEFAULT, cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; -#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } out: @@ -366,27 +340,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler, } 
break; } -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case XATTR_ACL_ACCESS: -#ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - value, size, ACL_TYPE_ACCESS, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; - - case XATTR_ACL_DEFAULT: -#ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - value, size, ACL_TYPE_DEFAULT, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; -#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } /* We could add an additional check for streams ie @@ -525,21 +478,6 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = { .set = cifs_xattr_set, }; - -static const struct xattr_handler cifs_posix_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = XATTR_ACL_ACCESS, - .get = cifs_xattr_get, - .set = cifs_xattr_set, -}; - -static const struct xattr_handler cifs_posix_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = XATTR_ACL_DEFAULT, - .get = cifs_xattr_get, - .set = cifs_xattr_set, -}; - const struct xattr_handler *cifs_xattr_handlers[] = { &cifs_user_xattr_handler, &cifs_os2_xattr_handler, @@ -549,7 +487,9 @@ const struct xattr_handler *cifs_xattr_handlers[] = { &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */ &cifs_cifs_ntsd_full_xattr_handler, &smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */ - &cifs_posix_acl_access_xattr_handler, - &cifs_posix_acl_default_xattr_handler, +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif NULL }; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d1f9d2632202..ec6519e1ca3b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -316,6 +316,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry, return 0; out_remove: + configfs_put(dentry->d_fsdata); configfs_remove_dirent(dentry); return PTR_ERR(inode); } @@ -382,6 +383,7 @@ int configfs_create_link(struct configfs_dirent *target, struct dentry *parent, return 0; out_remove: + configfs_put(dentry->d_fsdata); configfs_remove_dirent(dentry); return PTR_ERR(inode); } diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae202109..de78bde2991b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -68,7 +68,10 @@ struct core_name { static int expand_corename(struct core_name *cn, int size) { - char *corename = krealloc(cn->corename, size, GFP_KERNEL); + char *corename; + + size = kmalloc_size_roundup(size); + corename = krealloc(cn->corename, size, GFP_KERNEL); if (!corename) return -ENOMEM; @@ -76,7 +79,7 @@ static int expand_corename(struct core_name *cn, int size) if (size > core_name_size) /* racy but harmless */ core_name_size = size; - cn->size = ksize(corename); + cn->size = size; cn->corename = corename; return 0; } @@ -325,6 +328,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; + /* CPU the task ran on */ + case 'C': + err = cn_printf(cn, "%d", cprm->cpu); + break; default: break; } @@ -354,7 +361,7 @@ static int zap_process(struct task_struct *start, int exit_code) struct task_struct *t; int nr = 0; - /* ignore all signals except SIGKILL, see prepare_signal() */ + /* Allow SIGKILL, see prepare_signal() */ start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; 
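The expand_corename() hunk above drops the ksize() call in favour of kmalloc_size_roundup(). The idea, reduced to a sketch (grow_buf() is our name):

#include <linux/slab.h>

static char *grow_buf(char *buf, size_t *cur_size, size_t want)
{
	char *p;

	/*
	 * Round up to the bucket size kmalloc would use anyway, so the
	 * recorded capacity matches the usable allocation without
	 * querying ksize() on the returned pointer afterwards.
	 */
	want = kmalloc_size_roundup(want);
	p = krealloc(buf, want, GFP_KERNEL);
	if (!p)
		return NULL;
	*cur_size = want;
	return p;
}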
start->signal->group_stop_count = 0; @@ -402,9 +409,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) if (core_waiters > 0) { struct core_thread *ptr; - freezer_do_not_count(); - wait_for_completion(&core_state->startup); - freezer_count(); + wait_for_completion_state(&core_state->startup, + TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); /* * Wait for all the threads to become inactive, so that * all the thread context (extended register state, like @@ -412,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state) */ ptr = core_state->dumper.next; while (ptr != NULL) { - wait_task_inactive(ptr->task, 0); + wait_task_inactive(ptr->task, TASK_ANY); ptr = ptr->next; } } @@ -526,7 +532,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, - .regs = signal_pt_regs(), .limit = rlimit(RLIMIT_CORE), /* * We must use the same mm->flags while dumping core to avoid @@ -535,6 +540,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) */ .mm_flags = mm->flags, .vma_meta = NULL, + .cpu = raw_smp_processor_id(), }; audit_core_dumps(siginfo->si_signo); @@ -717,8 +723,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) * filesystem. */ mnt_userns = file_mnt_user_ns(cprm.file); - if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), - current_fsuid())) { + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + current_fsuid())) { pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", cn.corename); goto close_fail; @@ -832,6 +838,39 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +static int dump_emit_page(struct coredump_params *cprm, struct page *page) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = 0, + .bv_len = PAGE_SIZE, + }; + struct iov_iter iter; + struct file *file = cprm->file; + loff_t pos; + ssize_t n; + + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + if (cprm->written + PAGE_SIZE > cprm->limit) + return 0; + if (dump_interrupted()) + return 0; + pos = file->f_pos; + iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); + n = __kernel_write_iter(cprm->file, &iter, &pos); + if (n != PAGE_SIZE) + return 0; + file->f_pos = pos; + cprm->written += PAGE_SIZE; + cprm->pos += PAGE_SIZE; + + return 1; +} + int dump_emit(struct coredump_params *cprm, const void *addr, int nr) { if (cprm->to_skip) { @@ -863,7 +902,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; - int stop; /* * To avoid having to allocate page tables for virtual address @@ -874,10 +912,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap_local_page(page); - - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap_local(kaddr); + int stop = !dump_emit_page(cprm, page); put_page(page); if (stop) return 0; @@ -1072,30 +1107,20 @@ whole: return vma->vm_end - vma->vm_start; } -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} - /* * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. 
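The coredump_next_vma() helper introduced just below walks the maple tree instead of the removed vm_next chain. A minimal sketch of the same iteration, assuming a kernel where mm_struct carries mm_mt (count_vmas() is our name):

#include <linux/mm.h>
#include <linux/maple_tree.h>

/* Count VMAs by iterating the maple tree, as the coredump snapshot now does. */
static unsigned int count_vmas(struct mm_struct *mm)
{
	MA_STATE(mas, &mm->mm_mt, 0, 0);
	struct vm_area_struct *vma;
	unsigned int n = 0;

	mmap_read_lock(mm);
	mas_for_each(&mas, vma, ULONG_MAX)
		n++;
	mmap_read_unlock(mm);
	return n;
}

The coredump code holds mmap_write_lock() instead, because it also fixes up VMA bounds while snapshotting.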
*/ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, +static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, + struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) + if (gate_vma && (vma == gate_vma)) return NULL; + + vma = mas_next(mas, ULONG_MAX); + if (vma) + return vma; return gate_vma; } @@ -1119,9 +1144,10 @@ static void free_vma_snapshot(struct coredump_params *cprm) */ static bool dump_vma_snapshot(struct coredump_params *cprm) { - struct vm_area_struct *vma, *gate_vma; + struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - int i; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int i = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1141,8 +1167,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma), i++) { + while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; @@ -1150,10 +1175,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); m->pgoff = vma->vm_pgoff; - m->file = vma->vm_file; if (m->file) get_file(m->file); + i++; } mmap_write_unlock(mm); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 2217fe5ece6f..1b4403136d05 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -25,21 +25,25 @@ * then this function isn't applicable. This function may sleep, so it must be * called from a workqueue rather than from the bio's bi_end_io callback. * - * This function sets PG_error on any pages that contain any blocks that failed - * to be decrypted. The filesystem must not mark such pages uptodate. + * Return: %true on success; %false on failure. On failure, bio->bi_status is + * also set to an error status. */ -void fscrypt_decrypt_bio(struct bio *bio) +bool fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, + int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, bv->bv_offset); - if (ret) - SetPageError(page); + + if (err) { + bio->bi_status = errno_to_blk_status(err); + return false; + } } + return true; } EXPORT_SYMBOL(fscrypt_decrypt_bio); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 3afdaa084773..316a778cec0f 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -184,7 +184,7 @@ struct fscrypt_symlink_data { struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - struct fscrypt_blk_crypto_key *blk_key; + struct blk_crypto_key *blk_key; #endif }; @@ -225,7 +225,7 @@ struct fscrypt_info { * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ - struct key *ci_master_key; + struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. 
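Stepping back to the smbd_send() rewrite in fs/cifs/smbdirect.c above: the goto-driven splitting logic became nested loops that pack source kvecs into bounded batches. A simplified model of that batching (send_in_batches() and MAX_VECS are ours; this is not the kernel code verbatim):

#include <linux/minmax.h>
#include <linux/uio.h>

/*
 * Pack source kvecs into batches of at most MAX_VECS entries and
 * max_bytes bytes each, splitting an oversized kvec across batches
 * via 'offset', and hand each batch to send().
 */
static int send_in_batches(struct kvec *iov, int nvec, size_t max_bytes,
			   int (*send)(struct kvec *, int))
{
	enum { MAX_VECS = 5 };
	struct kvec vecs[MAX_VECS];
	size_t offset = 0, buflen;
	int i = 0, j, rc;

	while (i < nvec) {
		buflen = 0;
		for (j = 0; i < nvec && j < MAX_VECS && buflen < max_bytes; j++) {
			size_t left = iov[i].iov_len - offset;
			size_t take = min(left, max_bytes - buflen);

			vecs[j].iov_base = (char *)iov[i].iov_base + offset;
			vecs[j].iov_len = take;
			buflen += take;
			if (take == left) {	/* consumed this kvec */
				offset = 0;
				i++;
			} else {		/* split; resume here next batch */
				offset += take;
			}
		}
		rc = send(vecs, j);
		if (rc)
			return rc;
	}
	return 0;
}

The kernel loop additionally threads remaining_data_length through to smbd_post_send_data() for the SMB Direct header, but the batching shape is the same.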
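With fscrypt_decrypt_bio() above now returning a bool and recording failures in bio->bi_status instead of setting PG_error, a filesystem's post-read step can reduce to something like this sketch (post_read_work() is our name; real callers do more work here):

#include <linux/bio.h>
#include <linux/fscrypt.h>

/* Post-read step: decrypt in place, then complete the bio.
 * Must run from a workqueue, since fscrypt_decrypt_bio() may sleep. */
static void post_read_work(struct bio *bio)
{
	if (!fscrypt_decrypt_bio(bio)) {
		/*
		 * bio->bi_status was already set by fscrypt; completing
		 * the bio without marking pages uptodate surfaces the error.
		 */
		bio_endio(bio);
		return;
	}

	/* ... any further post-read processing would go here ... */
	bio_endio(bio);
}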
@@ -344,7 +344,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); -void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key); +void fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key); /* * Check whether the crypto transform or blk-crypto key has been allocated in @@ -390,7 +391,8 @@ fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, } static inline void -fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { } @@ -437,6 +439,34 @@ struct fscrypt_master_key_secret { struct fscrypt_master_key { /* + * Link in ->s_master_keys->key_hashtable. + * Only valid if ->mk_active_refs > 0. + */ + struct hlist_node mk_node; + + /* Semaphore that protects ->mk_secret and ->mk_users */ + struct rw_semaphore mk_sem; + + /* + * Active and structural reference counts. An active ref guarantees + * that the struct continues to exist, continues to be in the keyring + * ->s_master_keys, and that any embedded subkeys (e.g. + * ->mk_direct_keys) that have been prepared continue to exist. + * A structural ref only guarantees that the struct continues to exist. + * + * There is one active ref associated with ->mk_secret being present, + * and one active ref for each inode in ->mk_decrypted_inodes. + * + * There is one structural ref associated with the active refcount being + * nonzero. Finding a key in the keyring also takes a structural ref, + * which is then held temporarily while the key is operated on. + */ + refcount_t mk_active_refs; + refcount_t mk_struct_refs; + + struct rcu_head mk_rcu_head; + + /* * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is * executed, this is wiped and no new inodes can be unlocked with this * key; however, there may still be inodes in ->mk_decrypted_inodes @@ -444,7 +474,10 @@ struct fscrypt_master_key { * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. * - * Locking: protected by this master key's key->sem. + * While ->mk_secret is present, one ref in ->mk_active_refs is held. + * + * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs + * associated with this field is protected by ->mk_sem as well. */ struct fscrypt_master_key_secret mk_secret; @@ -465,23 +498,13 @@ struct fscrypt_master_key { * * This is NULL for v1 policy keys; those can only be added by root. * - * Locking: in addition to this keyring's own semaphore, this is - * protected by this master key's key->sem, so we can do atomic - * search+insert. It can also be searched without taking any locks, but - * in that case the returned key may have already been removed. + * Locking: protected by ->mk_sem. (We don't just rely on the keyrings + * subsystem semaphore ->mk_users->sem, as we need support for atomic + * search+insert along with proper synchronization with ->mk_secret.) */ struct key *mk_users; /* - * Length of ->mk_decrypted_inodes, plus one if mk_secret is present. - * Once this goes to 0, the master key is removed from ->s_master_keys. - * The 'struct fscrypt_master_key' will continue to live as long as the - * 'struct key' whose payload it is, but we won't let this reference - * count rise again. - */ - refcount_t mk_refcount; - - /* * List of inodes that were unlocked using this key. 
This allows the * inodes to be evicted efficiently if the key is removed. */ @@ -506,10 +529,10 @@ static inline bool is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) { /* - * The READ_ONCE() is only necessary for fscrypt_drop_inode() and - * fscrypt_key_describe(). These run in atomic context, so they can't - * take the key semaphore and thus 'secret' can change concurrently - * which would be a data race. But they only need to know whether the + * The READ_ONCE() is only necessary for fscrypt_drop_inode(). + * fscrypt_drop_inode() runs in atomic context, so it can't take the key + * semaphore and thus 'secret' can change concurrently which would be a + * data race. But fscrypt_drop_inode() only need to know whether the * secret *was* present at the time of check, so READ_ONCE() suffices. */ return READ_ONCE(secret->size) != 0; @@ -538,7 +561,12 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) return 0; } -struct key * +void fscrypt_put_master_key(struct fscrypt_master_key *mk); + +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk); + +struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); @@ -569,7 +597,8 @@ extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); -void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key); +void fscrypt_destroy_prepared_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 7c01025879b3..7b8c5a1104b5 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -5,8 +5,6 @@ * Encryption hooks for higher-level filesystem operations. */ -#include <linux/key.h> - #include "fscrypt_private.h" /** @@ -142,7 +140,6 @@ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_info *ci; - struct key *key; struct fscrypt_master_key *mk; int err; @@ -158,14 +155,13 @@ int fscrypt_prepare_setflags(struct inode *inode, ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; - key = ci->ci_master_key; - mk = key->payload.data[0]; - down_read(&key->sem); + mk = ci->ci_master_key; + down_read(&mk->mk_sem); if (is_master_key_secret_present(&mk->mk_secret)) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; - up_read(&key->sem); + up_read(&mk->mk_sem); return err; } return 0; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 90f3e68f166e..8bfb3ce86476 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -12,7 +12,7 @@ * provides the key and IV to use. 
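The mk_active_refs/mk_struct_refs split described above is an instance of a common two-tier refcount pattern: one count pins only the memory, the other additionally pins usability and container membership. In generic form (all names ours):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t active_refs;	/* usable and still in its container */
	refcount_t struct_refs;	/* memory remains allocated */
};

static bool obj_get_active(struct obj *o)
{
	return refcount_inc_not_zero(&o->active_refs);
}

static void obj_put_active(struct obj *o)
{
	if (!refcount_dec_and_test(&o->active_refs))
		return;
	/*
	 * Unlink from the container and tear down the payload here, then
	 * drop the structural ref that the active count was holding:
	 */
	if (refcount_dec_and_test(&o->struct_refs))
		kfree(o);	/* fscrypt defers this through RCU */
}

The payoff is that lookup paths can hold the cheap structural ref across sleeping operations without keeping a half-torn-down key "active".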
*/ -#include <linux/blk-crypto-profile.h> +#include <linux/blk-crypto.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/sched/mm.h> @@ -21,26 +21,22 @@ #include "fscrypt_private.h" -struct fscrypt_blk_crypto_key { - struct blk_crypto_key base; - int num_devs; - struct request_queue *devs[]; -}; - -static int fscrypt_get_num_devices(struct super_block *sb) +static struct block_device **fscrypt_get_devices(struct super_block *sb, + unsigned int *num_devs) { - if (sb->s_cop->get_num_devices) - return sb->s_cop->get_num_devices(sb); - return 1; -} + struct block_device **devs; -static void fscrypt_get_devices(struct super_block *sb, int num_devs, - struct request_queue **devs) -{ - if (num_devs == 1) - devs[0] = bdev_get_queue(sb->s_bdev); - else - sb->s_cop->get_devices(sb, devs); + if (sb->s_cop->get_devices) { + devs = sb->s_cop->get_devices(sb, num_devs); + if (devs) + return devs; + } + devs = kmalloc(sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); + devs[0] = sb->s_bdev; + *num_devs = 1; + return devs; } static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) @@ -74,15 +70,15 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) * helpful for debugging problems where the "wrong" implementation is used. */ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, - struct request_queue **devs, - int num_devs, + struct block_device **devs, + unsigned int num_devs, const struct blk_crypto_config *cfg) { - int i; + unsigned int i; for (i = 0; i < num_devs; i++) { if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(devs[i]->crypto_profile, cfg)) { + blk_crypto_config_supported_natively(devs[i], cfg)) { if (!xchg(&mode->logged_blk_crypto_native, 1)) pr_info("fscrypt: %s using blk-crypto (native)\n", mode->friendly_name); @@ -99,9 +95,9 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; struct blk_crypto_config crypto_cfg; - int num_devs; - struct request_queue **devs; - int i; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; /* The file must need contents encryption, not filenames encryption */ if (!S_ISREG(inode->i_mode)) @@ -129,17 +125,16 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) return 0; /* - * On all the filesystem's devices, blk-crypto must support the crypto - * configuration that the file would use. + * On all the filesystem's block devices, blk-crypto must support the + * crypto configuration that the file would use. 
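The support probe that follows this comment amounts to building one blk_crypto_config and testing it against every block device returned by the new fscrypt_get_devices() helper above. Sketch (all_devs_support() is our name):

#include <linux/blk-crypto.h>

static bool all_devs_support(struct block_device **devs, unsigned int num_devs,
			     enum blk_crypto_mode_num mode,
			     unsigned int blocksize, unsigned int dun_bytes)
{
	struct blk_crypto_config cfg = {
		.crypto_mode	= mode,
		.data_unit_size	= blocksize,
		.dun_bytes	= dun_bytes,
	};
	unsigned int i;

	/* Every device must support the config, natively or via fallback. */
	for (i = 0; i < num_devs; i++)
		if (!blk_crypto_config_supported(devs[i], &cfg))
			return false;
	return true;
}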
*/ crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; crypto_cfg.data_unit_size = sb->s_blocksize; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); - num_devs = fscrypt_get_num_devices(sb); - devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL); - if (!devs) - return -ENOMEM; - fscrypt_get_devices(sb, num_devs, devs); + + devs = fscrypt_get_devices(sb, &num_devs); + if (IS_ERR(devs)) + return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) @@ -162,49 +157,40 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode; - int num_devs = fscrypt_get_num_devices(sb); - int queue_refs = 0; - struct fscrypt_blk_crypto_key *blk_key; + struct blk_crypto_key *blk_key; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; int err; - int i; - blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL); + blk_key = kmalloc(sizeof(*blk_key), GFP_KERNEL); if (!blk_key) return -ENOMEM; - blk_key->num_devs = num_devs; - fscrypt_get_devices(sb, num_devs, blk_key->devs); - - err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode, + err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, fscrypt_get_dun_bytes(ci), sb->s_blocksize); if (err) { fscrypt_err(inode, "error %d initializing blk-crypto key", err); goto fail; } - /* - * We have to start using blk-crypto on all the filesystem's devices. - * We also have to save all the request_queue's for later so that the - * key can be evicted from them. This is needed because some keys - * aren't destroyed until after the filesystem was already unmounted - * (namely, the per-mode keys in struct fscrypt_master_key). - */ + /* Start using blk-crypto on all the filesystem's block devices. */ + devs = fscrypt_get_devices(sb, &num_devs); + if (IS_ERR(devs)) { + err = PTR_ERR(devs); + goto fail; + } for (i = 0; i < num_devs; i++) { - if (!blk_get_queue(blk_key->devs[i])) { - fscrypt_err(inode, "couldn't get request_queue"); - err = -EAGAIN; - goto fail; - } - queue_refs++; - - err = blk_crypto_start_using_key(&blk_key->base, - blk_key->devs[i]); - if (err) { - fscrypt_err(inode, - "error %d starting to use blk-crypto", err); - goto fail; - } + err = blk_crypto_start_using_key(devs[i], blk_key); + if (err) + break; } + kfree(devs); + if (err) { + fscrypt_err(inode, "error %d starting to use blk-crypto", err); + goto fail; + } + /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). 
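The prepare path above reduces to a small blk-crypto key lifecycle: allocate, initialize from the raw key bytes, then start using the key on a device. Sketch with logging and multi-device handling trimmed (make_key() is our name):

#include <linux/blk-crypto.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>

static struct blk_crypto_key *make_key(struct block_device *bdev,
				       const u8 *raw,
				       enum blk_crypto_mode_num mode,
				       unsigned int dun_bytes,
				       unsigned int data_unit_size)
{
	struct blk_crypto_key *bk = kmalloc(sizeof(*bk), GFP_KERNEL);
	int err;

	if (!bk)
		return ERR_PTR(-ENOMEM);
	err = blk_crypto_init_key(bk, raw, mode, dun_bytes, data_unit_size);
	if (!err)
		err = blk_crypto_start_using_key(bdev, bk);
	if (err) {
		/* bk holds key material, so wipe it on the way out. */
		kfree_sensitive(bk);
		return ERR_PTR(err);
	}
	return bk;
}

Note how the patch no longer pins request_queue references: the key is simply evicted from the devices at destroy time instead.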
* I.e., here we publish ->blk_key with a RELEASE barrier so that @@ -215,24 +201,29 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, return 0; fail: - for (i = 0; i < queue_refs; i++) - blk_put_queue(blk_key->devs[i]); kfree_sensitive(blk_key); return err; } -void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +void fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { - struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key; - int i; + struct blk_crypto_key *blk_key = prep_key->blk_key; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; - if (blk_key) { - for (i = 0; i < blk_key->num_devs; i++) { - blk_crypto_evict_key(blk_key->devs[i], &blk_key->base); - blk_put_queue(blk_key->devs[i]); - } - kfree_sensitive(blk_key); + if (!blk_key) + return; + + /* Evict the key from all the filesystem's block devices. */ + devs = fscrypt_get_devices(sb, &num_devs); + if (!IS_ERR(devs)) { + for (i = 0; i < num_devs; i++) + blk_crypto_evict_key(devs[i], blk_key); + kfree(devs); } + kfree_sensitive(blk_key); } bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) @@ -282,7 +273,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, ci = inode->i_crypt_info; fscrypt_generate_dun(ci, first_lblk, dun); - bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask); + bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); } EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); @@ -369,7 +360,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. */ - if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base) + if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) return false; fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); @@ -401,46 +392,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio, EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); /** - * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is - * supported as far as encryption is concerned - * @iocb: the file and position the I/O is targeting - * @iter: the I/O data segment(s) + * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an + * inode, as far as encryption is concerned + * @inode: the inode in question * * Return: %true if there are no encryption constraints that prevent DIO from * being supported; %false if DIO is unsupported. (Note that in the * %true case, the filesystem might have other, non-encryption-related - * constraints that prevent DIO from actually being supported.) + * constraints that prevent DIO from actually being supported. Also, on + * encrypted files the filesystem is still responsible for only allowing + * DIO when requests are filesystem-block-aligned.) */ -bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +bool fscrypt_dio_supported(struct inode *inode) { - const struct inode *inode = file_inode(iocb->ki_filp); - const unsigned int blocksize = i_blocksize(inode); + int err; /* If the file is unencrypted, no veto from us. */ if (!fscrypt_needs_contents_encryption(inode)) return true; - /* We only support DIO with inline crypto, not fs-layer crypto. 
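The ->blk_key publication discussed above is the usual store-release/load-acquire pairing, in miniature (struct and function names ours):

#include <asm/barrier.h>

struct blk_crypto_key;

struct prepared {
	struct blk_crypto_key *blk_key;
};

/* Writer: fully initialize the key, then publish the pointer. */
static void publish(struct prepared *p, struct blk_crypto_key *bk)
{
	smp_store_release(&p->blk_key, bk);
}

/*
 * Reader: an acquire load guarantees the key's contents are visible
 * whenever the pointer is observed non-NULL.
 */
static struct blk_crypto_key *lookup(struct prepared *p)
{
	return smp_load_acquire(&p->blk_key);
}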
*/ - if (!fscrypt_inode_uses_inline_crypto(inode)) - return false; - /* - * Since the granularity of encryption is filesystem blocks, the file - * position and total I/O length must be aligned to the filesystem block - * size -- not just to the block device's logical block size as is - * traditionally the case for DIO on many filesystems. + * We only support DIO with inline crypto, not fs-layer crypto. * - * We require that the user-provided memory buffers be filesystem block - * aligned too. It is simpler to have a single alignment value required - * for all properties of the I/O, as is normally the case for DIO. - * Also, allowing less aligned buffers would imply that data units could - * cross bvecs, which would greatly complicate the I/O stack, which - * assumes that bios can be split at any bvec boundary. + * To determine whether the inode is using inline crypto, we have to set + * up the key if it wasn't already done. This is because in the current + * design of fscrypt, the decision of whether to use inline crypto or + * not isn't made until the inode's encryption key is being set up. In + * the DIO read/write case, the key will always be set up already, since + * the file will be open. But in the case of statx(), the key might not + * be set up yet, as the file might not have been opened yet. */ - if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize)) + err = fscrypt_require_key(inode); + if (err) { + /* + * Key unavailable or couldn't be set up. This edge case isn't + * worth worrying about; just report that DIO is unsupported. + */ return false; - - return true; + } + return fscrypt_inode_uses_inline_crypto(inode); } EXPORT_SYMBOL_GPL(fscrypt_dio_supported); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index caee9f8620dd..78dd2ff306bd 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -18,6 +18,7 @@ * information about these ioctls. */ +#include <asm/unaligned.h> #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/random.h> @@ -25,6 +26,18 @@ #include "fscrypt_private.h" +/* The master encryption keys for a filesystem (->s_master_keys) */ +struct fscrypt_keyring { + /* + * Lock that protects ->key_hashtable. It does *not* protect the + * fscrypt_master_key structs themselves. + */ + spinlock_t lock; + + /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */ + struct hlist_head key_hashtable[128]; +}; + static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { fscrypt_destroy_hkdf(&secret->hkdf); @@ -38,66 +51,80 @@ static void move_master_key_secret(struct fscrypt_master_key_secret *dst, memzero_explicit(src, sizeof(*src)); } -static void free_master_key(struct fscrypt_master_key *mk) +static void fscrypt_free_master_key(struct rcu_head *head) { - size_t i; - - wipe_master_key_secret(&mk->mk_secret); - - for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { - fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]); - fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]); - fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]); - } - - key_put(mk->mk_users); + struct fscrypt_master_key *mk = + container_of(head, struct fscrypt_master_key, mk_rcu_head); + /* + * The master key secret and any embedded subkeys should have already + * been wiped when the last active reference to the fscrypt_master_key + * struct was dropped; doing it here would be unnecessarily late. + * Nevertheless, use kfree_sensitive() in case anything was missed. 
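The RCU-deferred free below follows the standard call_rcu() + kfree_sensitive() recipe for objects that may still hold key material. Generic form (names ours):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>

struct secret_obj {
	struct rcu_head rcu;
	u8 material[64];
};

static void secret_free_cb(struct rcu_head *head)
{
	struct secret_obj *s = container_of(head, struct secret_obj, rcu);

	/* Zeroes the object before freeing, in case anything was missed. */
	kfree_sensitive(s);
}

/* Defer the free until all concurrent RCU readers are done. */
static void secret_release(struct secret_obj *s)
{
	call_rcu(&s->rcu, secret_free_cb);
}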
+ */ kfree_sensitive(mk); } -static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) +void fscrypt_put_master_key(struct fscrypt_master_key *mk) { - if (spec->__reserved) - return false; - return master_key_spec_len(spec) != 0; + if (!refcount_dec_and_test(&mk->mk_struct_refs)) + return; + /* + * No structural references left, so free ->mk_users, and also free the + * fscrypt_master_key struct itself after an RCU grace period ensures + * that concurrent keyring lookups can no longer find it. + */ + WARN_ON(refcount_read(&mk->mk_active_refs) != 0); + key_put(mk->mk_users); + mk->mk_users = NULL; + call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } -static int fscrypt_key_instantiate(struct key *key, - struct key_preparsed_payload *prep) +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk) { - key->payload.data[0] = (struct fscrypt_master_key *)prep->data; - return 0; -} + size_t i; -static void fscrypt_key_destroy(struct key *key) -{ - free_master_key(key->payload.data[0]); -} + if (!refcount_dec_and_test(&mk->mk_active_refs)) + return; + /* + * No active references left, so complete the full removal of this + * fscrypt_master_key struct by removing it from the keyring and + * destroying any subkeys embedded in it. + */ -static void fscrypt_key_describe(const struct key *key, struct seq_file *m) -{ - seq_puts(m, key->description); + spin_lock(&sb->s_master_keys->lock); + hlist_del_rcu(&mk->mk_node); + spin_unlock(&sb->s_master_keys->lock); - if (key_is_positive(key)) { - const struct fscrypt_master_key *mk = key->payload.data[0]; + /* + * ->mk_active_refs == 0 implies that ->mk_secret is not present and + * that ->mk_decrypted_inodes is empty. + */ + WARN_ON(is_master_key_secret_present(&mk->mk_secret)); + WARN_ON(!list_empty(&mk->mk_decrypted_inodes)); - if (!is_master_key_secret_present(&mk->mk_secret)) - seq_puts(m, ": secret removed"); + for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { + fscrypt_destroy_prepared_key( + sb, &mk->mk_direct_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_iv_ino_lblk_64_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_iv_ino_lblk_32_keys[i]); } + memzero_explicit(&mk->mk_ino_hash_key, + sizeof(mk->mk_ino_hash_key)); + mk->mk_ino_hash_key_initialized = false; + + /* Drop the structural ref associated with the active refs. */ + fscrypt_put_master_key(mk); } -/* - * Type of key in ->s_master_keys. Each key of this type represents a master - * key which has been added to the filesystem. Its payload is a - * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents - * users from adding keys of this type via the keyrings syscalls rather than via - * the intended method of FS_IOC_ADD_ENCRYPTION_KEY. 
- */ -static struct key_type key_type_fscrypt = { - .name = "._fscrypt", - .instantiate = fscrypt_key_instantiate, - .destroy = fscrypt_key_destroy, - .describe = fscrypt_key_describe, -}; +static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) +{ + if (spec->__reserved) + return false; + return master_key_spec_len(spec) != 0; +} static int fscrypt_user_key_instantiate(struct key *key, struct key_preparsed_payload *prep) @@ -131,32 +158,6 @@ static struct key_type key_type_fscrypt_user = { .describe = fscrypt_user_key_describe, }; -/* Search ->s_master_keys or ->mk_users */ -static struct key *search_fscrypt_keyring(struct key *keyring, - struct key_type *type, - const char *description) -{ - /* - * We need to mark the keyring reference as "possessed" so that we - * acquire permission to search it, via the KEY_POS_SEARCH permission. - */ - key_ref_t keyref = make_key_ref(keyring, true /* possessed */); - - keyref = keyring_search(keyref, type, description, false); - if (IS_ERR(keyref)) { - if (PTR_ERR(keyref) == -EAGAIN || /* not found */ - PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ - keyref = ERR_PTR(-ENOKEY); - return ERR_CAST(keyref); - } - return key_ref_to_ptr(keyref); -} - -#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ - (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id)) - -#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) - #define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \ (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \ CONST_STRLEN("-users") + 1) @@ -164,21 +165,6 @@ static struct key *search_fscrypt_keyring(struct key *keyring, #define FSCRYPT_MK_USER_DESCRIPTION_SIZE \ (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) -static void format_fs_keyring_description( - char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE], - const struct super_block *sb) -{ - sprintf(description, "fscrypt-%s", sb->s_id); -} - -static void format_mk_description( - char description[FSCRYPT_MK_DESCRIPTION_SIZE], - const struct fscrypt_key_specifier *mk_spec) -{ - sprintf(description, "%*phN", - master_key_spec_len(mk_spec), (u8 *)&mk_spec->u); -} - static void format_mk_users_keyring_description( char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) @@ -199,20 +185,15 @@ static void format_mk_user_description( /* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */ static int allocate_filesystem_keyring(struct super_block *sb) { - char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE]; - struct key *keyring; + struct fscrypt_keyring *keyring; if (sb->s_master_keys) return 0; - format_fs_keyring_description(description, sb); - keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - current_cred(), KEY_POS_SEARCH | - KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); - if (IS_ERR(keyring)) - return PTR_ERR(keyring); - + keyring = kzalloc(sizeof(*keyring), GFP_KERNEL); + if (!keyring) + return -ENOMEM; + spin_lock_init(&keyring->lock); /* * Pairs with the smp_load_acquire() in fscrypt_find_master_key(). * I.e., here we publish ->s_master_keys with a RELEASE barrier so that @@ -222,21 +203,80 @@ static int allocate_filesystem_keyring(struct super_block *sb) return 0; } -void fscrypt_sb_free(struct super_block *sb) +/* + * Release all encryption keys that have been added to the filesystem, along + * with the keyring that contains them. + * + * This is called at unmount time. 
The filesystem's underlying block device(s) + * are still available at this time; this is important because after user file + * accesses have been allowed, this function may need to evict keys from the + * keyslots of an inline crypto engine, which requires the block device(s). + * + * This is also called when the super_block is being freed. This is needed to + * avoid a memory leak if mounting fails after the "test_dummy_encryption" + * option was processed, as in that case the unmount-time call isn't made. + */ +void fscrypt_destroy_keyring(struct super_block *sb) { - key_put(sb->s_master_keys); + struct fscrypt_keyring *keyring = sb->s_master_keys; + size_t i; + + if (!keyring) + return; + + for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) { + struct hlist_head *bucket = &keyring->key_hashtable[i]; + struct fscrypt_master_key *mk; + struct hlist_node *tmp; + + hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) { + /* + * Since all inodes were already evicted, every key + * remaining in the keyring should have an empty inode + * list, and should only still be in the keyring due to + * the single active ref associated with ->mk_secret. + * There should be no structural refs beyond the one + * associated with the active ref. + */ + WARN_ON(refcount_read(&mk->mk_active_refs) != 1); + WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); + WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); + wipe_master_key_secret(&mk->mk_secret); + fscrypt_put_master_key_activeref(sb, mk); + } + } + kfree_sensitive(keyring); sb->s_master_keys = NULL; } +static struct hlist_head * +fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring, + const struct fscrypt_key_specifier *mk_spec) +{ + /* + * Since key specifiers should be "random" values, it is sufficient to + * use a trivial hash function that just takes the first several bits of + * the key specifier. + */ + unsigned long i = get_unaligned((unsigned long *)&mk_spec->u); + + return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)]; +} + /* - * Find the specified master key in ->s_master_keys. - * Returns ERR_PTR(-ENOKEY) if not found. + * Find the specified master key struct in ->s_master_keys and take a structural + * ref to it. The structural ref guarantees that the key struct continues to + * exist, but it does *not* guarantee that ->s_master_keys continues to contain + * the key struct. The structural ref needs to be dropped by + * fscrypt_put_master_key(). Returns NULL if the key struct is not found. */ -struct key *fscrypt_find_master_key(struct super_block *sb, - const struct fscrypt_key_specifier *mk_spec) +struct fscrypt_master_key * +fscrypt_find_master_key(struct super_block *sb, + const struct fscrypt_key_specifier *mk_spec) { - struct key *keyring; - char description[FSCRYPT_MK_DESCRIPTION_SIZE]; + struct fscrypt_keyring *keyring; + struct hlist_head *bucket; + struct fscrypt_master_key *mk; /* * Pairs with the smp_store_release() in allocate_filesystem_keyring(). @@ -246,10 +286,38 @@ struct key *fscrypt_find_master_key(struct super_block *sb, */ keyring = smp_load_acquire(&sb->s_master_keys); if (keyring == NULL) - return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */ - - format_mk_description(description, mk_spec); - return search_fscrypt_keyring(keyring, &key_type_fscrypt, description); + return NULL; /* No keyring yet, so no keys yet. 
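The bucket search coming up below is a textbook RCU hash-table lookup: readers traverse the chain locklessly and only claim an entry whose refcount is still nonzero. Reduced to its essentials (types and names ours):

#include <linux/rculist.h>
#include <linux/refcount.h>
#include <linux/types.h>

struct entry {
	struct hlist_node node;
	u64 id;
	refcount_t refs;
};

static struct entry *lookup(struct hlist_head *bucket, u64 id)
{
	struct entry *e;

	rcu_read_lock();
	hlist_for_each_entry_rcu(e, bucket, node) {
		/* Skip entries whose last reference is already gone. */
		if (e->id == id && refcount_inc_not_zero(&e->refs)) {
			rcu_read_unlock();
			return e;
		}
	}
	rcu_read_unlock();
	return NULL;
}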
*/ + + bucket = fscrypt_mk_hash_bucket(keyring, mk_spec); + rcu_read_lock(); + switch (mk_spec->type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + hlist_for_each_entry_rcu(mk, bucket, mk_node) { + if (mk->mk_spec.type == + FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + memcmp(mk->mk_spec.u.descriptor, + mk_spec->u.descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 && + refcount_inc_not_zero(&mk->mk_struct_refs)) + goto out; + } + break; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + hlist_for_each_entry_rcu(mk, bucket, mk_node) { + if (mk->mk_spec.type == + FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && + memcmp(mk->mk_spec.u.identifier, + mk_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 && + refcount_inc_not_zero(&mk->mk_struct_refs)) + goto out; + } + break; + } + mk = NULL; +out: + rcu_read_unlock(); + return mk; } static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) @@ -277,17 +345,30 @@ static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) static struct key *find_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; + key_ref_t keyref; format_mk_user_description(description, mk->mk_spec.u.identifier); - return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user, - description); + + /* + * We need to mark the keyring reference as "possessed" so that we + * acquire permission to search it, via the KEY_POS_SEARCH permission. + */ + keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/), + &key_type_fscrypt_user, description, false); + if (IS_ERR(keyref)) { + if (PTR_ERR(keyref) == -EAGAIN || /* not found */ + PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ + keyref = ERR_PTR(-ENOKEY); + return ERR_CAST(keyref); + } + return key_ref_to_ptr(keyref); } /* * Give the current user a "key" in ->mk_users. This charges the user's quota * and marks the master key as added by the current user, so that it cannot be - * removed by another user with the key. Either the master key's key->sem must - * be held for write, or the master key must be still undergoing initialization. + * removed by another user with the key. Either ->mk_sem must be held for + * write, or the master key must be still undergoing initialization. */ static int add_master_key_user(struct fscrypt_master_key *mk) { @@ -309,7 +390,7 @@ static int add_master_key_user(struct fscrypt_master_key *mk) /* * Remove the current user's "key" from ->mk_users. - * The master key's key->sem must be held for write. + * ->mk_sem must be held for write. * * Returns 0 if removed, -ENOKEY if not found, or another -errno code. */ @@ -327,63 +408,48 @@ static int remove_master_key_user(struct fscrypt_master_key *mk) } /* - * Allocate a new fscrypt_master_key which contains the given secret, set it as - * the payload of a new 'struct key' of type fscrypt, and link the 'struct key' - * into the given keyring. Synchronized by fscrypt_add_key_mutex. + * Allocate a new fscrypt_master_key, transfer the given secret over to it, and + * insert it into sb->s_master_keys. 
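/*
 * Editor's note: the lookup above combines a trivial hash-bucket walk
 * with refcount_inc_not_zero() so that a dying key is treated as
 * absent. A compilable userspace sketch of that idiom (names are
 * illustrative; the RCU protection of the list walk is elided):
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct obj {
	struct obj *next;	/* bucket chain, like hlist_node */
	unsigned long key;
	atomic_uint refs;	/* like mk_struct_refs */
};

/* Take a reference only if the count is not already zero. */
static bool ref_get_not_zero(struct obj *o)
{
	unsigned int old = atomic_load(&o->refs);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->refs, &old, old + 1))
			return true;
	}
	return false;
}

static struct obj *bucket_lookup(struct obj *head, unsigned long key)
{
	for (struct obj *o = head; o; o = o->next) {
		if (o->key == key && ref_get_not_zero(o))
			return o;	/* caller drops the ref when done */
	}
	return NULL;	/* absent, or found but already being freed */
}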
*/ -static int add_new_master_key(struct fscrypt_master_key_secret *secret, - const struct fscrypt_key_specifier *mk_spec, - struct key *keyring) +static int add_new_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) { + struct fscrypt_keyring *keyring = sb->s_master_keys; struct fscrypt_master_key *mk; - char description[FSCRYPT_MK_DESCRIPTION_SIZE]; - struct key *key; int err; mk = kzalloc(sizeof(*mk), GFP_KERNEL); if (!mk) return -ENOMEM; + init_rwsem(&mk->mk_sem); + refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; - move_master_key_secret(&mk->mk_secret, secret); - - refcount_set(&mk->mk_refcount, 1); /* secret is present */ INIT_LIST_HEAD(&mk->mk_decrypted_inodes); spin_lock_init(&mk->mk_decrypted_inodes_lock); if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { err = allocate_master_key_users_keyring(mk); if (err) - goto out_free_mk; + goto out_put; err = add_master_key_user(mk); if (err) - goto out_free_mk; + goto out_put; } - /* - * Note that we don't charge this key to anyone's quota, since when - * ->mk_users is in use those keys are charged instead, and otherwise - * (when ->mk_users isn't in use) only root can add these keys. - */ - format_mk_description(description, mk_spec); - key = key_alloc(&key_type_fscrypt, description, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), - KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(key)) { - err = PTR_ERR(key); - goto out_free_mk; - } - err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL); - key_put(key); - if (err) - goto out_free_mk; + move_master_key_secret(&mk->mk_secret, secret); + refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */ + spin_lock(&keyring->lock); + hlist_add_head_rcu(&mk->mk_node, + fscrypt_mk_hash_bucket(keyring, mk_spec)); + spin_unlock(&keyring->lock); return 0; -out_free_mk: - free_master_key(mk); +out_put: + fscrypt_put_master_key(mk); return err; } @@ -392,42 +458,34 @@ out_free_mk: static int add_existing_master_key(struct fscrypt_master_key *mk, struct fscrypt_master_key_secret *secret) { - struct key *mk_user; - bool rekey; int err; /* * If the current user is already in ->mk_users, then there's nothing to - * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.) + * do. Otherwise, we need to add the user to ->mk_users. (Neither is + * applicable for v1 policy keys, which have NULL ->mk_users.) */ if (mk->mk_users) { - mk_user = find_master_key_user(mk); + struct key *mk_user = find_master_key_user(mk); + if (mk_user != ERR_PTR(-ENOKEY)) { if (IS_ERR(mk_user)) return PTR_ERR(mk_user); key_put(mk_user); return 0; } - } - - /* If we'll be re-adding ->mk_secret, try to take the reference. */ - rekey = !is_master_key_secret_present(&mk->mk_secret); - if (rekey && !refcount_inc_not_zero(&mk->mk_refcount)) - return KEY_DEAD; - - /* Add the current user to ->mk_users, if applicable. */ - if (mk->mk_users) { err = add_master_key_user(mk); - if (err) { - if (rekey && refcount_dec_and_test(&mk->mk_refcount)) - return KEY_DEAD; + if (err) return err; - } } /* Re-add the secret if needed. 
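/*
 * Editor's note: sketch of the two-level refcounting that replaces the
 * old single mk_refcount, as described by the surrounding comments:
 * struct refs keep the allocation alive, while the active refcount
 * (secret present plus in-use inodes) holds one struct ref as a whole
 * and triggers removal from the keyring when it drops to zero.
 * Userspace model with hypothetical names.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct mkey {
	atomic_uint struct_refs;	/* controls free() */
	atomic_uint active_refs;	/* secret + each unlocked inode */
};

static void mkey_put_struct_ref(struct mkey *mk)
{
	if (atomic_fetch_sub(&mk->struct_refs, 1) == 1)
		free(mk);	/* last struct ref */
}

static void mkey_put_active_ref(struct mkey *mk)
{
	if (atomic_fetch_sub(&mk->active_refs, 1) == 1) {
		/*
		 * Fully removed: this is where the kernel code unhashes
		 * the key, then drops the struct ref that the nonzero
		 * active count had been holding.
		 */
		mkey_put_struct_ref(mk);
	}
}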
*/ - if (rekey) + if (!is_master_key_secret_present(&mk->mk_secret)) { + if (!refcount_inc_not_zero(&mk->mk_active_refs)) + return KEY_DEAD; move_master_key_secret(&mk->mk_secret, secret); + } + return 0; } @@ -436,38 +494,36 @@ static int do_add_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); - struct key *key; + struct fscrypt_master_key *mk; int err; mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ -retry: - key = fscrypt_find_master_key(sb, mk_spec); - if (IS_ERR(key)) { - err = PTR_ERR(key); - if (err != -ENOKEY) - goto out_unlock; + + mk = fscrypt_find_master_key(sb, mk_spec); + if (!mk) { /* Didn't find the key in ->s_master_keys. Add it. */ err = allocate_filesystem_keyring(sb); - if (err) - goto out_unlock; - err = add_new_master_key(secret, mk_spec, sb->s_master_keys); + if (!err) + err = add_new_master_key(sb, secret, mk_spec); } else { /* * Found the key in ->s_master_keys. Re-add the secret if * needed, and add the user to ->mk_users if needed. */ - down_write(&key->sem); - err = add_existing_master_key(key->payload.data[0], secret); - up_write(&key->sem); + down_write(&mk->mk_sem); + err = add_existing_master_key(mk, secret); + up_write(&mk->mk_sem); if (err == KEY_DEAD) { - /* Key being removed or needs to be removed */ - key_invalidate(key); - key_put(key); - goto retry; + /* + * We found a key struct, but it's already been fully + * removed. Ignore the old struct and add a new one. + * fscrypt_add_key_mutex means we don't need to worry + * about concurrent adds. + */ + err = add_new_master_key(sb, secret, mk_spec); } - key_put(key); + fscrypt_put_master_key(mk); } -out_unlock: mutex_unlock(&fscrypt_add_key_mutex); return err; } @@ -771,19 +827,19 @@ int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_key_specifier mk_spec; - struct key *key, *mk_user; struct fscrypt_master_key *mk; + struct key *mk_user; int err; mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); - key = fscrypt_find_master_key(sb, &mk_spec); - if (IS_ERR(key)) { - err = PTR_ERR(key); + mk = fscrypt_find_master_key(sb, &mk_spec); + if (!mk) { + err = -ENOKEY; goto out; } - mk = key->payload.data[0]; + down_read(&mk->mk_sem); mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) { err = PTR_ERR(mk_user); @@ -791,7 +847,8 @@ int fscrypt_verify_key_added(struct super_block *sb, key_put(mk_user); err = 0; } - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); out: if (err == -ENOKEY && capable(CAP_FOWNER)) err = 0; @@ -953,11 +1010,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_remove_key_arg __user *uarg = _uarg; struct fscrypt_remove_key_arg arg; - struct key *key; struct fscrypt_master_key *mk; u32 status_flags = 0; int err; - bool dead; + bool inodes_remain; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; @@ -977,12 +1033,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) return -EACCES; /* Find the key being removed. 
*/ - key = fscrypt_find_master_key(sb, &arg.key_spec); - if (IS_ERR(key)) - return PTR_ERR(key); - mk = key->payload.data[0]; - - down_write(&key->sem); + mk = fscrypt_find_master_key(sb, &arg.key_spec); + if (!mk) + return -ENOKEY; + down_write(&mk->mk_sem); /* If relevant, remove current user's (or all users) claim to the key */ if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { @@ -991,7 +1045,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) else err = remove_master_key_user(mk); if (err) { - up_write(&key->sem); + up_write(&mk->mk_sem); goto out_put_key; } if (mk->mk_users->keys.nr_leaves_on_tree != 0) { @@ -1003,26 +1057,22 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; err = 0; - up_write(&key->sem); + up_write(&mk->mk_sem); goto out_put_key; } } /* No user claims remaining. Go ahead and wipe the secret. */ - dead = false; + err = -ENOKEY; if (is_master_key_secret_present(&mk->mk_secret)) { wipe_master_key_secret(&mk->mk_secret); - dead = refcount_dec_and_test(&mk->mk_refcount); - } - up_write(&key->sem); - if (dead) { - /* - * No inodes reference the key, and we wiped the secret, so the - * key object is free to be removed from the keyring. - */ - key_invalidate(key); + fscrypt_put_master_key_activeref(sb, mk); err = 0; - } else { + } + inodes_remain = refcount_read(&mk->mk_active_refs) > 0; + up_write(&mk->mk_sem); + + if (inodes_remain) { /* Some inodes still reference this key; try to evict them. */ err = try_to_lock_encrypted_files(sb, mk); if (err == -EBUSY) { @@ -1038,7 +1088,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) * has been fully removed including all files locked. */ out_put_key: - key_put(key); + fscrypt_put_master_key(mk); if (err == 0) err = put_user(status_flags, &uarg->removal_status_flags); return err; @@ -1085,7 +1135,6 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_get_key_status_arg arg; - struct key *key; struct fscrypt_master_key *mk; int err; @@ -1102,19 +1151,18 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) arg.user_count = 0; memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); - key = fscrypt_find_master_key(sb, &arg.key_spec); - if (IS_ERR(key)) { - if (key != ERR_PTR(-ENOKEY)) - return PTR_ERR(key); + mk = fscrypt_find_master_key(sb, &arg.key_spec); + if (!mk) { arg.status = FSCRYPT_KEY_STATUS_ABSENT; err = 0; goto out; } - mk = key->payload.data[0]; - down_read(&key->sem); + down_read(&mk->mk_sem); if (!is_master_key_secret_present(&mk->mk_secret)) { - arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED; + arg.status = refcount_read(&mk->mk_active_refs) > 0 ? 
+ FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED : + FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */; err = 0; goto out_release_key; } @@ -1136,8 +1184,8 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) } err = 0; out_release_key: - up_read(&key->sem); - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); out: if (!err && copy_to_user(uarg, &arg, sizeof(arg))) err = -EFAULT; @@ -1149,13 +1197,9 @@ int __init fscrypt_init_keyring(void) { int err; - err = register_key_type(&key_type_fscrypt); - if (err) - return err; - err = register_key_type(&key_type_fscrypt_user); if (err) - goto err_unregister_fscrypt; + return err; err = register_key_type(&key_type_fscrypt_provisioning); if (err) @@ -1165,7 +1209,5 @@ int __init fscrypt_init_keyring(void) err_unregister_fscrypt_user: unregister_key_type(&key_type_fscrypt_user); -err_unregister_fscrypt: - unregister_key_type(&key_type_fscrypt); return err; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index fbc71abdabe3..94757ccd3056 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -9,7 +9,6 @@ */ #include <crypto/skcipher.h> -#include <linux/key.h> #include <linux/random.h> #include "fscrypt_private.h" @@ -45,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = { .security_strength = 16, .ivsize = 16, }, + [FSCRYPT_MODE_SM4_XTS] = { + .friendly_name = "SM4-XTS", + .cipher_str = "xts(sm4)", + .keysize = 32, + .security_strength = 16, + .ivsize = 16, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, + }, + [FSCRYPT_MODE_SM4_CTS] = { + .friendly_name = "SM4-CTS-CBC", + .cipher_str = "cts(cbc(sm4))", + .keysize = 16, + .security_strength = 16, + .ivsize = 16, + }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", @@ -155,10 +169,12 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, } /* Destroy a crypto transform object and/or blk-crypto key. */ -void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key) +void fscrypt_destroy_prepared_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { crypto_free_skcipher(prep_key->tfm); - fscrypt_destroy_inline_crypt_key(prep_key); + fscrypt_destroy_inline_crypt_key(sb, prep_key); + memzero_explicit(prep_key, sizeof(*prep_key)); } /* Given a per-file encryption key, set up the file's crypto transform object */ @@ -412,20 +428,18 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, /* * Find the master key, then set up the inode's actual encryption key. * - * If the master key is found in the filesystem-level keyring, then the - * corresponding 'struct key' is returned in *master_key_ret with its semaphore - * read-locked. This is needed to ensure that only one task links the - * fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race to create - * an fscrypt_info for the same inode), and to synchronize the master key being - * removed with a new inode starting to use it. + * If the master key is found in the filesystem-level keyring, then it is + * returned in *mk_ret with its semaphore read-locked. This is needed to ensure + * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as + * multiple tasks may race to create an fscrypt_info for the same inode), and to + * synchronize the master key being removed with a new inode starting to use it. 
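/*
 * Editor's note: userspace model of the locking handoff described in
 * the comment above, assuming mk_sem behaves like an ordinary
 * reader/writer lock: inode key setup holds it for read while checking
 * the secret and linking into the inode list; key removal holds it for
 * write while wiping the secret. Hypothetical names throughout.
 */
#include <pthread.h>
#include <stdbool.h>

struct mkey_sem {
	pthread_rwlock_t sem;	/* stands in for mk_sem */
	bool secret_present;
};

/* A new inode starting to use the key: read side. */
static bool use_key(struct mkey_sem *mk)
{
	bool ok;

	pthread_rwlock_rdlock(&mk->sem);
	ok = mk->secret_present;	/* removal cannot race us here */
	/* ... link the inode into the key's list under the lock ... */
	pthread_rwlock_unlock(&mk->sem);
	return ok;
}

/* FS_IOC_REMOVE_ENCRYPTION_KEY: write side. */
static void remove_key(struct mkey_sem *mk)
{
	pthread_rwlock_wrlock(&mk->sem);
	mk->secret_present = false;	/* wipe the secret */
	pthread_rwlock_unlock(&mk->sem);
}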
*/ static int setup_file_encryption_key(struct fscrypt_info *ci, bool need_dirhash_key, - struct key **master_key_ret) + struct fscrypt_master_key **mk_ret) { - struct key *key; - struct fscrypt_master_key *mk = NULL; struct fscrypt_key_specifier mk_spec; + struct fscrypt_master_key *mk; int err; err = fscrypt_select_encryption_impl(ci); @@ -436,11 +450,10 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) return err; - key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); - if (IS_ERR(key)) { - if (key != ERR_PTR(-ENOKEY) || - ci->ci_policy.version != FSCRYPT_POLICY_V1) - return PTR_ERR(key); + mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); + if (!mk) { + if (ci->ci_policy.version != FSCRYPT_POLICY_V1) + return -ENOKEY; /* * As a legacy fallback for v1 policies, search for the key in @@ -450,9 +463,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, */ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); } - - mk = key->payload.data[0]; - down_read(&key->sem); + down_read(&mk->mk_sem); /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */ if (!is_master_key_secret_present(&mk->mk_secret)) { @@ -480,18 +491,18 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) goto out_release_key; - *master_key_ret = key; + *mk_ret = mk; return 0; out_release_key: - up_read(&key->sem); - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); return err; } static void put_crypt_info(struct fscrypt_info *ci) { - struct key *key; + struct fscrypt_master_key *mk; if (!ci) return; @@ -499,26 +510,21 @@ static void put_crypt_info(struct fscrypt_info *ci) if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) - fscrypt_destroy_prepared_key(&ci->ci_enc_key); - - key = ci->ci_master_key; - if (key) { - struct fscrypt_master_key *mk = key->payload.data[0]; + fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, + &ci->ci_enc_key); + mk = ci->ci_master_key; + if (mk) { /* * Remove this inode from the list of inodes that were unlocked - * with the master key. - * - * In addition, if we're removing the last inode from a key that - * already had its secret removed, invalidate the key so that it - * gets removed from ->s_master_keys. + * with the master key. In addition, if we're removing the last + * inode from a master key struct that already had its secret + * removed, then complete the full removal of the struct. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); - if (refcount_dec_and_test(&mk->mk_refcount)) - key_invalidate(key); - key_put(key); + fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); @@ -532,7 +538,7 @@ fscrypt_setup_encryption_info(struct inode *inode, { struct fscrypt_info *crypt_info; struct fscrypt_mode *mode; - struct key *master_key = NULL; + struct fscrypt_master_key *mk = NULL; int res; res = fscrypt_initialize(inode->i_sb->s_cop->flags); @@ -555,8 +561,7 @@ fscrypt_setup_encryption_info(struct inode *inode, WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; - res = setup_file_encryption_key(crypt_info, need_dirhash_key, - &master_key); + res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; @@ -571,12 +576,9 @@ fscrypt_setup_encryption_info(struct inode *inode, * We won the race and set ->i_crypt_info to our crypt_info. 
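/*
 * Editor's note: "won the race" above is the usual compare-and-swap
 * install pattern -- several tasks may each build a candidate object,
 * but only the one that swings the pointer from NULL publishes its
 * copy; the losers free theirs and use the winner's. Userspace sketch
 * with made-up names.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct cinfo { int dummy; };

/* Returns 1 if 'mine' was installed, 0 if someone else won. */
static int install_cinfo(_Atomic(struct cinfo *) *slot, struct cinfo *mine)
{
	struct cinfo *expected = NULL;

	if (atomic_compare_exchange_strong(slot, &expected, mine))
		return 1;	/* we won: mine is now *slot */
	free(mine);		/* lost the race: use 'expected' instead */
	return 0;
}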
* Now link it into the master key's inode list. */ - if (master_key) { - struct fscrypt_master_key *mk = - master_key->payload.data[0]; - - refcount_inc(&mk->mk_refcount); - crypt_info->ci_master_key = key_get(master_key); + if (mk) { + crypt_info->ci_master_key = mk; + refcount_inc(&mk->mk_active_refs); spin_lock(&mk->mk_decrypted_inodes_lock); list_add(&crypt_info->ci_master_key_link, &mk->mk_decrypted_inodes); @@ -586,9 +588,9 @@ fscrypt_setup_encryption_info(struct inode *inode, } res = 0; out: - if (master_key) { - up_read(&master_key->sem); - key_put(master_key); + if (mk) { + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); } put_crypt_info(crypt_info); return res; @@ -753,7 +755,6 @@ EXPORT_SYMBOL(fscrypt_free_inode); int fscrypt_drop_inode(struct inode *inode) { const struct fscrypt_info *ci = fscrypt_get_info(inode); - const struct fscrypt_master_key *mk; /* * If ci is NULL, then the inode doesn't have an encryption key set up @@ -763,7 +764,6 @@ int fscrypt_drop_inode(struct inode *inode) */ if (!ci || !ci->ci_master_key) return 0; - mk = ci->ci_master_key->payload.data[0]; /* * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes @@ -782,6 +782,6 @@ int fscrypt_drop_inode(struct inode *inode) * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ - return !is_master_key_secret_present(&mk->mk_secret); + return !is_master_key_secret_present(&ci->ci_master_key->mk_secret); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode); diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 2762c5350432..75dabd9b27f9 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -143,6 +143,7 @@ invalid: /* Master key referenced by DIRECT_KEY policy */ struct fscrypt_direct_key { + struct super_block *dk_sb; struct hlist_node dk_node; refcount_t dk_refcount; const struct fscrypt_mode *dk_mode; @@ -154,7 +155,7 @@ struct fscrypt_direct_key { static void free_direct_key(struct fscrypt_direct_key *dk) { if (dk) { - fscrypt_destroy_prepared_key(&dk->dk_key); + fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key); kfree_sensitive(dk); } } @@ -231,6 +232,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) dk = kzalloc(sizeof(*dk), GFP_KERNEL); if (!dk) return ERR_PTR(-ENOMEM); + dk->dk_sb = ci->ci_inode->i_sb; refcount_set(&dk->dk_refcount, 1); dk->dk_mode = ci->ci_mode; err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 80b8ca0f340b..893661b52376 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -61,6 +61,13 @@ fscrypt_get_dummy_policy(struct super_block *sb) return sb->s_cop->get_dummy_policy(sb); } +/* + * Return %true if the given combination of encryption modes is supported for v1 + * (and later) encryption policies. + * + * Do *not* add anything new here, since v1 encryption policies are deprecated. + * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only. 
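/*
 * Editor's note: the mode checks below are deliberately open-coded
 * if-chains; an equivalent table-driven form looks like this. The
 * numeric values are the FSCRYPT_MODE_* constants from the fscrypt
 * UAPI as of this series -- treat them as an assumption of this
 * sketch. The kernel keeps the explicit form so each allowed
 * combination stays visible in review.
 */
#include <stddef.h>

struct mode_pair { unsigned int contents, filenames; };

static const struct mode_pair valid_v2_pairs[] = {
	{ 1, 4 },	/* AES_256_XTS + AES_256_CTS */
	{ 1, 10 },	/* AES_256_XTS + AES_256_HCTR2 */
	{ 7, 8 },	/* SM4_XTS + SM4_CTS (new in this diff) */
	{ 9, 9 },	/* ADIANTUM for both, inherited from v1 */
};

static int valid_enc_modes_v2_sketch(unsigned int contents,
				     unsigned int filenames)
{
	size_t i;

	for (i = 0; i < sizeof(valid_v2_pairs) / sizeof(valid_v2_pairs[0]); i++)
		if (valid_v2_pairs[i].contents == contents &&
		    valid_v2_pairs[i].filenames == filenames)
			return 1;
	return 0;
}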
+ */ static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && @@ -83,6 +90,11 @@ static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) return true; + + if (contents_mode == FSCRYPT_MODE_SM4_XTS && + filenames_mode == FSCRYPT_MODE_SM4_CTS) + return true; + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); } @@ -744,12 +756,8 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * delayed key setup that requires the inode number. */ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && - (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { - const struct fscrypt_master_key *mk = - ci->ci_master_key->payload.data[0]; - - fscrypt_hash_inode_number(ci, mk); - } + (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) + fscrypt_hash_inode_number(ci, ci->ci_master_key); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } @@ -833,19 +841,6 @@ bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, } EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal); -/* Deprecated, do not use */ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, - struct fscrypt_dummy_policy *dummy_policy) -{ - struct fs_parameter param = { - .type = fs_value_is_string, - .string = arg ? (char *)arg : "", - }; - return fscrypt_parse_test_dummy_encryption(¶m, dummy_policy) ?: - fscrypt_add_test_dummy_key(sb, dummy_policy); -} -EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); - /** * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' * @seq: the seq_file to print the option to diff --git a/fs/d_path.c b/fs/d_path.c index e4e0ebad1f15..56a6ee4c6331 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -34,7 +34,7 @@ static bool prepend_char(struct prepend_buffer *p, unsigned char c) } /* - * The source of the prepend data can be an optimistoc load + * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel @@ -297,8 +297,7 @@ EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ -char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, - const char *fmt, ...) +char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; @@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) -static inline bool dax_mapping_is_cow(struct address_space *mapping) +static inline bool dax_page_is_shared(struct page *page) { - return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; + return page->mapping == PAGE_MAPPING_DAX_SHARED; } /* - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount. + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the + * refcount. */ -static inline void dax_mapping_set_cow(struct page *page) +static inline void dax_page_share_get(struct page *page) { - if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + if (page->mapping != PAGE_MAPPING_DAX_SHARED) { /* * Reset the index if the page was already mapped * regularly before. 
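/*
 * Editor's note: minimal model of the share counting introduced above.
 * A page mapped by exactly one file uses ->mapping/->index as usual;
 * when a second file maps it (reflink), ->mapping is set to a sentinel
 * and the word that held ->index becomes a plain share count. The
 * existing regular mapping is counted as the first share. Made-up
 * types; the kernel sentinel is PAGE_MAPPING_DAX_SHARED.
 */
#define MAPPING_DAX_SHARED_MODEL ((void *)0x1)

struct page_model {
	void *mapping;
	unsigned long share;	/* overlays ->index while shared */
};

static void page_share_get(struct page_model *p)
{
	if (p->mapping != MAPPING_DAX_SHARED_MODEL) {
		if (p->mapping)		/* was mapped regularly before */
			p->share = 1;	/* that mapping is share #1 */
		p->mapping = MAPPING_DAX_SHARED_MODEL;
	}
	p->share++;
}

static unsigned long page_share_put(struct page_model *p)
{
	return --p->share;	/* caller clears ->mapping at zero */
}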
*/ if (page->mapping) - page->index = 1; - page->mapping = (void *)PAGE_MAPPING_DAX_COW; + page->share = 1; + page->mapping = PAGE_MAPPING_DAX_SHARED; } - page->index++; + page->share++; +} + +static inline unsigned long dax_page_share_put(struct page *page) +{ + return --page->share; } /* - * When it is called in dax_insert_entry(), the cow flag will indicate that + * When it is called in dax_insert_entry(), the shared flag will indicate that * whether this entry is shared by multiple files. If so, set the page->mapping - * FS_DAX_MAPPING_COW, and use page->index as refcount. + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount. */ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address, bool cow) + struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - if (cow) { - dax_mapping_set_cow(page); + if (shared) { + dax_page_share_get(page); } else { WARN_ON_ONCE(page->mapping); page->mapping = mapping; @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - if (dax_mapping_is_cow(page->mapping)) { - /* keep the CoW flag if this page is still shared */ - if (page->index-- > 0) + if (dax_page_is_shared(page)) { + /* keep the shared flag if this page is still shared */ + if (dax_page_share_put(page) > 0) continue; } else WARN_ON_ONCE(page->mapping && page->mapping != mapping); @@ -840,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, (iter->iomap.flags & IOMAP_F_DIRTY); } -static bool dax_fault_is_cow(const struct iomap_iter *iter) -{ - return (iter->flags & IOMAP_WRITE) && - (iter->iomap.flags & IOMAP_F_SHARED); -} - /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to @@ -859,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); - bool dirty = !dax_fault_is_synchronous(iter, vmf->vma); - bool cow = dax_fault_is_cow(iter); + bool write = iter->flags & IOMAP_WRITE; + bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); + bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { + if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) @@ -877,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, xas_reset(xas); xas_lock_irq(xas); - if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, - cow); + shared); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. 
If a normal PTE or @@ -902,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - if (cow) + if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); @@ -1086,7 +1087,8 @@ out: } /** - * dax_iomap_cow_copy - Copy the data from source to destination before write + * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page + * by copying the data before and after the range to be written. * @pos: address to do copy from. * @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) @@ -1095,35 +1097,50 @@ out: * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX - * write operation, dax_iomap_actor() might call this to do the copy of either + * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. In the latter case the rest of the copy of - * aligned ranges is taken care by dax_iomap_actor() itself. + * aligned ranges is taken care by dax_iomap_iter() itself. + * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the + * area to make sure no old data remains. */ -static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, +static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); + /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; + /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ + bool zero_edge = srcmap->flags & IOMAP_F_SHARED || + srcmap->type == IOMAP_UNWRITTEN; void *saddr = 0; int ret = 0; - ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); - if (ret) - return ret; + if (!zero_edge) { + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); + if (ret) + return ret; + } if (copy_all) { - ret = copy_mc_to_kernel(daddr, saddr, length); - return ret ? -EIO : 0; + if (zero_edge) + memset(daddr, 0, size); + else + ret = copy_mc_to_kernel(daddr, saddr, length); + goto out; } /* Copy the head part of the range */ if (head_off) { - ret = copy_mc_to_kernel(daddr, saddr, head_off); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr, 0, head_off); + else { + ret = copy_mc_to_kernel(daddr, saddr, head_off); + if (ret) + return -EIO; + } } /* Copy the tail part of the range */ @@ -1131,12 +1148,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; - ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, - tail_len); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr + tail_off, 0, tail_len); + else { + ret = copy_mc_to_kernel(daddr + tail_off, + saddr + tail_off, tail_len); + if (ret) + return -EIO; + } } - return 0; +out: + if (zero_edge) + dax_flush(srcmap->dax_dev, daddr, size); + return ret ? 
-EIO : 0; } /* @@ -1221,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ +static s64 dax_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + int id = 0; + s64 ret = 0; + void *daddr = NULL, *saddr = NULL; + + /* don't bother with blocks that are not shared to start with */ + if (!(iomap->flags & IOMAP_F_SHARED)) + return length; + /* don't bother with holes or unwritten extents */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + id = dax_read_lock(); + ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = copy_mc_to_kernel(daddr, saddr, length); + if (ret) + ret = -EIO; + +out_unlock: + dax_read_unlock(id); + return ret; +} + +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_unshare_iter(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(dax_file_unshare); + static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; @@ -1235,13 +1311,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) if (ret < 0) return ret; memset(kaddr + offset, 0, size); - if (srcmap->addr != iomap->addr) { - ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, - kaddr); - if (ret < 0) - return ret; - dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE); - } else + if (iomap->flags & IOMAP_F_SHARED) + ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, + kaddr); + else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } @@ -1258,6 +1331,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return length; + /* + * invalidate the pages whose sharing state is to be changed + * because of CoW. + */ + if (iomap->flags & IOMAP_F_SHARED) + invalidate_inode_pages2_range(iter->inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + length - 1) >> PAGE_SHIFT); + do { unsigned offset = offset_in_page(pos); unsigned size = min_t(u64, PAGE_SIZE - offset, length); @@ -1318,12 +1400,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; - const struct iomap *srcmap = &iomi->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; + bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; @@ -1350,7 +1433,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. 
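/*
 * Editor's note: the head/tail arithmetic that dax_iomap_copy_around()
 * performs above, extracted into a runnable helper so the edge cases
 * are easy to check. For a write at pos/length inside align_size
 * blocks, it reports how many bytes before and after the write must be
 * copied from the source (or zeroed, when srcmap is a hole/unwritten).
 */
#include <stdint.h>
#include <stdio.h>

static void copy_around_extents(uint64_t pos, uint64_t length,
				uint64_t align_size,
				uint64_t *head, uint64_t *tail)
{
	uint64_t head_off = pos & (align_size - 1);
	uint64_t end = pos + length;
	uint64_t pg_end = (end + align_size - 1) & ~(align_size - 1);

	*head = head_off;	/* bytes before the written range */
	*tail = pg_end - end;	/* bytes after the written range */
}

int main(void)
{
	uint64_t head, tail;

	/* 100 bytes at offset 300 of a 4096-byte block: copy 300 head
	 * bytes and 4096 - 400 = 3696 tail bytes around the write. */
	copy_around_extents(300, 100, 4096, &head, &tail);
	printf("head=%llu tail=%llu\n",
	       (unsigned long long)head, (unsigned long long)tail);
	return 0;
}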
*/ - if (iomap->flags & IOMAP_F_NEW) { + if (iomap->flags & IOMAP_F_NEW || cow) { invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1384,10 +1467,9 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { - ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, - kaddr); + if (cow) { + ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, + srcmap, kaddr); if (ret) break; } @@ -1532,7 +1614,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; - const struct iomap *srcmap = &iter->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; @@ -1563,9 +1645,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { - err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr); + if (write && iomap->flags & IOMAP_F_SHARED) { + err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } @@ -1936,15 +2017,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret; + int ret, compared = 0; - while ((ret = iomap_iter(&src_iter, ops)) > 0) { - while ((ret = iomap_iter(&dst_iter, ops)) > 0) { - dst_iter.processed = dax_range_compare_iter(&src_iter, - &dst_iter, len, same); - } - if (ret <= 0) - src_iter.processed = ret; + while ((ret = iomap_iter(&src_iter, ops)) > 0 && + (ret = iomap_iter(&dst_iter, ops)) > 0) { + compared = dax_range_compare_iter(&src_iter, &dst_iter, len, + same); + if (compared < 0) + return ret; + src_iter.processed = dst_iter.processed = compared; } return ret; } diff --git a/fs/dcache.c b/fs/dcache.c index bb0c4d0038db..52e6d5fdab6b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { - /* - * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT - * kernels spin_lock() implicitly disables preemption, but not on - * PREEMPT_RT. So for RT it has to be done explicitly to protect - * the sequence count write side critical section against a reader - * or another writer preempting, which would result in a live lock. 
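/*
 * Editor's note: shape of the i_dir_seq protocol described in the
 * comment above (which this diff replaces with the generic
 * preempt_disable_nested()/preempt_enable_nested() helpers) -- an
 * open-coded sequence count whose low bit marks a writer in progress.
 * On PREEMPT_RT, spin_lock() does not disable preemption, hence the
 * explicit nesting helpers. Userspace sketch using GCC/Clang __atomic
 * builtins.
 */
static unsigned int dir_seq_begin_write(unsigned int *seq)
{
	for (;;) {
		unsigned int n = __atomic_load_n(seq, __ATOMIC_RELAXED);

		/* even = no writer; claim by making the count odd */
		if (!(n & 1) &&
		    __atomic_compare_exchange_n(seq, &n, n + 1, false,
						__ATOMIC_ACQUIRE,
						__ATOMIC_RELAXED))
			return n;
	}
}

static void dir_seq_end_write(unsigned int *seq, unsigned int n)
{
	/* release: readers must see the update before the even value */
	__atomic_store_n(seq, n + 2, __ATOMIC_RELEASE);
}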
- */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); wake_up_all(d_wait); } @@ -3258,8 +3249,10 @@ void d_genocide(struct dentry *parent) EXPORT_SYMBOL(d_genocide); -void d_tmpfile(struct dentry *dentry, struct inode *inode) +void d_tmpfile(struct file *file, struct inode *inode) { + struct dentry *dentry = file->f_path.dentry; + inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 950c63fa4d0b..b54f470e0d03 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, } EXPORT_SYMBOL_GPL(debugfs_attr_read); -ssize_t debugfs_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; @@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; - ret = simple_attr_write(file, buf, len, ppos); + if (is_signed) + ret = simple_attr_write_signed(file, buf, len, ppos); + else + ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(debugfs_attr_write); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); + static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, @@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** @@ -1121,7 +1137,7 @@ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, } EXPORT_SYMBOL_GPL(debugfs_print_regs32); -static int debugfs_show_regset32(struct seq_file *s, void *data) +static int debugfs_regset32_show(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; @@ -1136,17 +1152,7 @@ static int debugfs_show_regset32(struct seq_file *s, void *data) return 0; } -static int debugfs_open_regset32(struct inode *inode, struct file *file) -{ - return single_open(file, debugfs_show_regset32, inode->i_private); -} - -static const 
struct file_operations fops_regset32 = { - .open = debugfs_open_regset32, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(debugfs_regset32); /** * debugfs_create_regset32 - create a debugfs file that returns register values @@ -1167,7 +1173,7 @@ void debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset) { - debugfs_create_file(name, mode, parent, regset, &fops_regset32); + debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 232cfdf095ae..2e8e112b1993 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -82,6 +82,8 @@ struct debugfs_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; + /* Opt_* bitfield. */ + unsigned int opts; }; enum { @@ -111,6 +113,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) kgid_t gid; char *p; + opts->opts = 0; opts->mode = DEBUGFS_DEFAULT_MODE; while ((p = strsep(&data, ",")) != NULL) { @@ -145,24 +148,44 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) * but traditionally debugfs has ignored all mount options */ } + + opts->opts |= BIT(token); } return 0; } -static int debugfs_apply_options(struct super_block *sb) +static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct debugfs_mount_opts *opts = &fsi->mount_opts; - inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= opts->mode; + /* + * On remount, only reset mode/uid/gid if they were provided as mount + * options. + */ - inode->i_uid = opts->uid; - inode->i_gid = opts->gid; + if (!remount || opts->opts & BIT(Opt_mode)) { + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + } - return 0; + if (!remount || opts->opts & BIT(Opt_uid)) + inode->i_uid = opts->uid; + + if (!remount || opts->opts & BIT(Opt_gid)) + inode->i_gid = opts->gid; +} + +static void debugfs_apply_options(struct super_block *sb) +{ + _debugfs_apply_options(sb, false); +} + +static void debugfs_apply_options_remount(struct super_block *sb) +{ + _debugfs_apply_options(sb, true); } static int debugfs_remount(struct super_block *sb, int *flags, char *data) @@ -175,7 +198,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data) if (err) goto fail; - debugfs_apply_options(sb); + debugfs_apply_options_remount(sb); fail: return err; diff --git a/fs/direct-io.c b/fs/direct-io.c index f669163d5860..03d381377ae1 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -421,8 +421,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 19ef136f9e4f..26fef9945cc9 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -12,55 +12,67 @@ #include <trace/events/dlm.h> #include "dlm_internal.h" +#include "memory.h" #include "lock.h" #include "user.h" #include "ast.h" -static uint64_t dlm_cb_seq; -static DEFINE_SPINLOCK(dlm_cb_seq_spin); +void dlm_release_callback(struct kref *ref) +{ + struct dlm_callback *cb = container_of(ref, struct dlm_callback, ref); + + dlm_free_cb(cb); +} + +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to) +{ + if 
(*from) + kref_put(&(*from)->ref, dlm_release_callback); + + if (to) + kref_get(&to->ref); + + *from = to; +} -static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) { - int i; - - log_print("last_bast %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_last_bast.seq, - lkb->lkb_last_bast.flags, - lkb->lkb_last_bast.mode, - lkb->lkb_last_bast.sb_status, - lkb->lkb_last_bast.sb_flags); - - log_print("last_cast %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_last_cast.seq, - lkb->lkb_last_cast.flags, - lkb->lkb_last_cast.mode, - lkb->lkb_last_cast.sb_status, - lkb->lkb_last_cast.sb_flags); - - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - log_print("cb %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_callbacks[i].seq, - lkb->lkb_callbacks[i].flags, - lkb->lkb_callbacks[i].mode, - lkb->lkb_callbacks[i].sb_status, - lkb->lkb_callbacks[i].sb_flags); + struct dlm_callback *cb, *safe; + + list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) { + list_del(&cb->list); + kref_put(&cb->ref, dlm_release_callback); } + + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + + /* invalidate */ + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + lkb->lkb_last_bast_mode = -1; } -int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq) +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - uint64_t prev_seq; + int rv = DLM_ENQUEUE_CALLBACK_SUCCESS; + struct dlm_callback *cb; int prev_mode; - int i, rv; - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (lkb->lkb_callbacks[i].seq) - continue; + if (flags & DLM_CB_BAST) { + /* if cb is a bast, it should be skipped if the blocking mode is + * compatible with the last granted mode + */ + if (lkb->lkb_last_cast) { + if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) { + log_debug(ls, "skip %x bast mode %d for cast mode %d", + lkb->lkb_id, mode, + lkb->lkb_last_cast->mode); + goto out; + } + } /* * Suppress some redundant basts here, do more on removal. @@ -68,148 +80,95 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, * is a bast for the same mode or a more restrictive mode. 
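/*
 * Editor's note: dlm_callback_set_last_ptr() above is the standard
 * "replace a cached, refcounted pointer" idiom: drop the reference held
 * on the old value, take one on the new value, then overwrite the slot.
 * Minimal userspace model (a bare counter instead of a kref; safe as
 * long as the caller itself holds a reference on 'new_cb').
 */
#include <stdlib.h>

struct cb_model {
	unsigned int refs;
};

static void cb_put(struct cb_model *c)
{
	if (--c->refs == 0)
		free(c);
}

/* Replace *slot with new_cb (either side may be NULL). */
static void set_last_ptr(struct cb_model **slot, struct cb_model *new_cb)
{
	if (*slot)
		cb_put(*slot);
	if (new_cb)
		new_cb->refs++;		/* the slot's own reference */
	*slot = new_cb;
}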
* (the addional > PR check is needed for PR/CW inversion) */ - - if ((i > 0) && (flags & DLM_CB_BAST) && - (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) { - - prev_seq = lkb->lkb_callbacks[i-1].seq; - prev_mode = lkb->lkb_callbacks[i-1].mode; + if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) { + prev_mode = lkb->lkb_last_cb->mode; if ((prev_mode == mode) || (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { - - log_debug(ls, "skip %x add bast %llu mode %d " - "for bast %llu mode %d", - lkb->lkb_id, - (unsigned long long)seq, - mode, - (unsigned long long)prev_seq, - prev_mode); - rv = 0; + log_debug(ls, "skip %x add bast mode %d for bast mode %d", + lkb->lkb_id, mode, prev_mode); goto out; } } - - lkb->lkb_callbacks[i].seq = seq; - lkb->lkb_callbacks[i].flags = flags; - lkb->lkb_callbacks[i].mode = mode; - lkb->lkb_callbacks[i].sb_status = status; - lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); - rv = 0; - break; } - if (i == DLM_CALLBACKS_SIZE) { - log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, (unsigned long long)seq, - flags, mode, status, sbflags); - dlm_dump_lkb_callbacks(lkb); - rv = -1; - goto out; - } - out: - return rv; -} - -int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_callback *cb, int *resid) -{ - int i, rv; - - *resid = 0; - - if (!lkb->lkb_callbacks[0].seq) { - rv = -ENOENT; + cb = dlm_allocate_cb(); + if (!cb) { + rv = DLM_ENQUEUE_CALLBACK_FAILURE; goto out; } - /* oldest undelivered cb is callbacks[0] */ - - memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback)); - memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback)); - - /* shift others down */ - - for (i = 1; i < DLM_CALLBACKS_SIZE; i++) { - if (!lkb->lkb_callbacks[i].seq) - break; - memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i], - sizeof(struct dlm_callback)); - memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback)); - (*resid)++; + cb->flags = flags; + cb->mode = mode; + cb->sb_status = status; + cb->sb_flags = (sbflags & 0x000000FF); + kref_init(&cb->ref); + if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) { + lkb->lkb_flags |= DLM_IFL_CB_PENDING; + rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED; } + list_add_tail(&cb->list, &lkb->lkb_callbacks); - /* if cb is a bast, it should be skipped if the blocking mode is - compatible with the last granted mode */ - - if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) { - if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) { - cb->flags |= DLM_CB_SKIP; - - log_debug(ls, "skip %x bast %llu mode %d " - "for cast %llu mode %d", - lkb->lkb_id, - (unsigned long long)cb->seq, - cb->mode, - (unsigned long long)lkb->lkb_last_cast.seq, - lkb->lkb_last_cast.mode); - rv = 0; - goto out; - } - } + if (flags & DLM_CB_CAST) + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb); - if (cb->flags & DLM_CB_CAST) { - memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); - lkb->lkb_last_cast_time = ktime_get(); - } + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb); - if (cb->flags & DLM_CB_BAST) { - memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); - lkb->lkb_last_bast_time = ktime_get(); - } - rv = 0; out: return rv; } +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb) +{ + /* oldest undelivered cb is callbacks first entry */ + *cb = list_first_entry_or_null(&lkb->lkb_callbacks, + struct dlm_callback, list); + if (!*cb) + return DLM_DEQUEUE_CALLBACK_EMPTY; + + /* remove it from callbacks so shift others down */ + 
list_del(&(*cb)->list); + if (list_empty(&lkb->lkb_callbacks)) + return DLM_DEQUEUE_CALLBACK_LAST; + + return DLM_DEQUEUE_CALLBACK_SUCCESS; +} + void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - uint64_t new_seq, prev_seq; int rv; - spin_lock(&dlm_cb_seq_spin); - new_seq = ++dlm_cb_seq; - if (!dlm_cb_seq) - new_seq = ++dlm_cb_seq; - spin_unlock(&dlm_cb_seq_spin); - if (lkb->lkb_flags & DLM_IFL_USER) { - dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq); + dlm_user_add_ast(lkb, flags, mode, status, sbflags); return; } - mutex_lock(&lkb->lkb_cb_mutex); - prev_seq = lkb->lkb_callbacks[0].seq; - - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); - if (rv < 0) - goto out; - - if (!prev_seq) { + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); + spin_lock(&ls->ls_cb_lock); if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { - mutex_lock(&ls->ls_cb_mutex); list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); - mutex_unlock(&ls->ls_cb_mutex); } else { queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); } + spin_unlock(&ls->ls_cb_lock); + break; + case DLM_ENQUEUE_CALLBACK_FAILURE: + WARN_ON_ONCE(1); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; } - out: - mutex_unlock(&lkb->lkb_cb_mutex); + spin_unlock(&lkb->lkb_cb_lock); } void dlm_callback_work(struct work_struct *work) @@ -218,53 +177,46 @@ void dlm_callback_work(struct work_struct *work) struct dlm_ls *ls = lkb->lkb_resource->res_ls; void (*castfn) (void *astparam); void (*bastfn) (void *astparam, int mode); - struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; - int i, rv, resid; - - memset(&callbacks, 0, sizeof(callbacks)); + struct dlm_callback *cb; + int rv; - mutex_lock(&lkb->lkb_cb_mutex); - if (!lkb->lkb_callbacks[0].seq) { - /* no callback work exists, shouldn't happen */ - log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); - dlm_print_lkb(lkb); - dlm_dump_lkb_callbacks(lkb); - } + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_dequeue_lkb_callback(lkb, &cb); + spin_unlock(&lkb->lkb_cb_lock); - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); - if (rv < 0) - break; - } + if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) + goto out; - if (resid) { - /* cbs remain, loop should have removed all, shouldn't happen */ - log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id, - resid); - dlm_print_lkb(lkb); - dlm_dump_lkb_callbacks(lkb); - } - mutex_unlock(&lkb->lkb_cb_mutex); + for (;;) { + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; + + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(ls, lkb, cb->mode); + lkb->lkb_last_bast_time = ktime_get(); + lkb->lkb_last_bast_mode = cb->mode; + bastfn(lkb->lkb_astparam, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; + trace_dlm_ast(ls, lkb); + lkb->lkb_last_cast_time = ktime_get(); + castfn(lkb->lkb_astparam); + } - castfn = lkb->lkb_astfn; - bastfn = lkb->lkb_bastfn; + kref_put(&cb->ref, dlm_release_callback); - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (!callbacks[i].seq) + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_dequeue_lkb_callback(lkb, &cb); + if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) { + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + 
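/*
 * Editor's note on the line above: DLM_IFL_CB_PENDING may only be
 * cleared here, under lkb_cb_lock and with the callback list observed
 * empty, so that a concurrent dlm_enqueue_lkb_callback() either still
 * sees the flag set and merely appends to the list, or sees it clear
 * and returns DLM_ENQUEUE_CALLBACK_NEED_SCHED to requeue the work item.
 */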
spin_unlock(&lkb->lkb_cb_lock); break; - if (callbacks[i].flags & DLM_CB_SKIP) { - continue; - } else if (callbacks[i].flags & DLM_CB_BAST) { - trace_dlm_bast(ls, lkb, callbacks[i].mode); - bastfn(lkb->lkb_astparam, callbacks[i].mode); - } else if (callbacks[i].flags & DLM_CB_CAST) { - lkb->lkb_lksb->sb_status = callbacks[i].sb_status; - lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; - trace_dlm_ast(ls, lkb); - castfn(lkb->lkb_astparam); } + spin_unlock(&lkb->lkb_cb_lock); } +out: /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ dlm_put_lkb(lkb); } @@ -288,10 +240,13 @@ void dlm_callback_stop(struct dlm_ls *ls) void dlm_callback_suspend(struct dlm_ls *ls) { - set_bit(LSFL_CB_DELAY, &ls->ls_flags); + if (ls->ls_callback_wq) { + spin_lock(&ls->ls_cb_lock); + set_bit(LSFL_CB_DELAY, &ls->ls_flags); + spin_unlock(&ls->ls_cb_lock); - if (ls->ls_callback_wq) flush_workqueue(ls->ls_callback_wq); + } } #define MAX_CB_QUEUE 25 @@ -302,13 +257,11 @@ void dlm_callback_resume(struct dlm_ls *ls) int count = 0, sum = 0; bool empty; - clear_bit(LSFL_CB_DELAY, &ls->ls_flags); - if (!ls->ls_callback_wq) return; more: - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { list_del_init(&lkb->lkb_cb_list); queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); @@ -317,7 +270,9 @@ more: break; } empty = list_empty(&ls->ls_cb_delay); - mutex_unlock(&ls->ls_cb_mutex); + if (empty) + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); + spin_unlock(&ls->ls_cb_lock); sum += count; if (!empty) { diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 181ad7d20c4d..880b11882495 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -11,14 +11,22 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -void dlm_del_ast(struct dlm_lkb *lkb); -int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq); -int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_callback *cb, int *resid); +#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1 +#define DLM_ENQUEUE_CALLBACK_SUCCESS 0 +#define DLM_ENQUEUE_CALLBACK_FAILURE -1 +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags); +#define DLM_DEQUEUE_CALLBACK_EMPTY 2 +#define DLM_DEQUEUE_CALLBACK_LAST 1 +#define DLM_DEQUEUE_CALLBACK_SUCCESS 0 +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb); void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags); +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to); +void dlm_release_callback(struct kref *ref); +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb); void dlm_callback_work(struct work_struct *work); int dlm_callback_start(struct dlm_ls *ls); void dlm_callback_stop(struct dlm_ls *ls); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index ac8b62106ce0..20b60709eccf 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -183,7 +183,7 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x) return -EINVAL; } - if (dlm_allow_conn) + if (dlm_lowcomms_is_running()) return -EBUSY; return 0; @@ -194,7 +194,7 @@ static int dlm_check_zero_and_dlm_running(unsigned int x) if (!x) return -EINVAL; - if (dlm_allow_conn) + if (dlm_lowcomms_is_running()) return -EBUSY; return 0; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8fb04ebbafb5..8a0e1b1f74ad 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -246,7 +246,7 @@ static void 
print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, - lkb->lkb_last_bast.mode, + lkb->lkb_last_bast_mode, rsb_lookup, lkb->lkb_wait_type, lkb->lkb_lvbseq, diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 8aca8085d24e..ab1a55337a6e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -211,6 +211,7 @@ struct dlm_args { #endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +#define DLM_IFL_CB_PENDING 0x04000000 /* least significant 2 bytes are message changed, they are full transmitted * but at receive side only the 2 bytes LSB will be set. * @@ -222,18 +223,17 @@ struct dlm_args { #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 -#define DLM_CALLBACKS_SIZE 6 - #define DLM_CB_CAST 0x00000001 #define DLM_CB_BAST 0x00000002 -#define DLM_CB_SKIP 0x00000004 struct dlm_callback { - uint64_t seq; uint32_t flags; /* DLM_CBF_ */ int sb_status; /* copy to lksb status */ uint8_t sb_flags; /* copy to lksb flags */ int8_t mode; /* rq mode of bast, gr mode of cast */ + + struct list_head list; + struct kref ref; }; struct dlm_lkb { @@ -268,12 +268,13 @@ struct dlm_lkb { unsigned long lkb_timeout_cs; #endif - struct mutex lkb_cb_mutex; + spinlock_t lkb_cb_lock; struct work_struct lkb_cb_work; struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ - struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; - struct dlm_callback lkb_last_cast; - struct dlm_callback lkb_last_bast; + struct list_head lkb_callbacks; + struct dlm_callback *lkb_last_cast; + struct dlm_callback *lkb_last_cb; + int lkb_last_bast_mode; ktime_t lkb_last_cast_time; /* for debugging */ ktime_t lkb_last_bast_time; /* for debugging */ @@ -591,11 +592,7 @@ struct dlm_ls { int ls_new_rsb_count; struct list_head ls_new_rsb; /* new rsb structs */ - spinlock_t ls_remove_spin; - wait_queue_head_t ls_remove_wait; - char ls_remove_name[DLM_RESNAME_MAXLEN+1]; char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; - int ls_remove_len; int ls_remove_lens[DLM_REMOVE_NAMES_MAX]; struct list_head ls_nodes; /* current nodes in ls */ @@ -631,7 +628,7 @@ struct dlm_ls { /* recovery related */ - struct mutex ls_cb_mutex; + spinlock_t ls_cb_lock; struct list_head ls_cb_delay; /* save for queue_work later */ struct timer_list ls_timer; struct task_struct *ls_recoverd_task; @@ -661,7 +658,7 @@ struct dlm_ls { spinlock_t ls_recover_idr_lock; wait_queue_head_t ls_wait_general; wait_queue_head_t ls_recover_lock_wait; - struct mutex ls_clear_proc_locks; + spinlock_t ls_clear_proc_locks; struct list_head ls_root_list; /* root resources */ struct rw_semaphore ls_root_sem; /* protect root_list */ @@ -670,7 +667,7 @@ struct dlm_ls { void *ls_ops_arg; int ls_namelen; - char ls_name[1]; + char ls_name[DLM_LOCKSPACE_LEN + 1]; }; /* diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index dac7eb75dba9..e1adfa5aed05 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -401,7 +401,7 @@ static int pre_rsb_struct(struct dlm_ls *ls) unlock any spinlocks, go back and call pre_rsb_struct again. Otherwise, take an rsb off the list and return it. 
*/ -static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, +static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len, struct dlm_rsb **r_ret) { struct dlm_rsb *r; @@ -412,7 +412,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, count = ls->ls_new_rsb_count; spin_unlock(&ls->ls_new_rsb_spin); log_debug(ls, "find_rsb retry %d %d %s", - count, dlm_config.ci_new_rsb_count, name); + count, dlm_config.ci_new_rsb_count, + (const char *)name); return -EAGAIN; } @@ -448,7 +449,7 @@ static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); } -int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, +int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret) { struct rb_node *node = tree->rb_node; @@ -546,7 +547,7 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) * while that rsb has a potentially stale master.) */ -static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, +static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, uint32_t hash, uint32_t b, int dir_nodeid, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) @@ -724,7 +725,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, dlm_recover_locks) before we've made ourself master (in dlm_recover_masters). */ -static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len, +static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, uint32_t hash, uint32_t b, int dir_nodeid, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) @@ -818,8 +819,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len, return error; } -static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid, - unsigned int flags, struct dlm_rsb **r_ret) +static int find_rsb(struct dlm_ls *ls, const void *name, int len, + int from_nodeid, unsigned int flags, + struct dlm_rsb **r_ret) { uint32_t hash, b; int dir_nodeid; @@ -1207,6 +1209,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, if (!lkb) return -ENOMEM; + lkb->lkb_last_bast_mode = -1; lkb->lkb_nodeid = -1; lkb->lkb_grmode = DLM_LOCK_IV; kref_init(&lkb->lkb_ref); @@ -1216,7 +1219,8 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, INIT_LIST_HEAD(&lkb->lkb_time_list); #endif INIT_LIST_HEAD(&lkb->lkb_cb_list); - mutex_init(&lkb->lkb_cb_mutex); + INIT_LIST_HEAD(&lkb->lkb_callbacks); + spin_lock_init(&lkb->lkb_cb_lock); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); idr_preload(GFP_NOFS); @@ -1585,37 +1589,6 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } -/* If there's an rsb for the same resource being removed, ensure - * that the remove message is sent before the new lookup message. - */ - -#define DLM_WAIT_PENDING_COND(ls, r) \ - (ls->ls_remove_len && \ - !rsb_cmp(r, ls->ls_remove_name, \ - ls->ls_remove_len)) - -static void wait_pending_remove(struct dlm_rsb *r) -{ - struct dlm_ls *ls = r->res_ls; - restart: - spin_lock(&ls->ls_remove_spin); - if (DLM_WAIT_PENDING_COND(ls, r)) { - log_debug(ls, "delay lookup for remove dir %d %s", - r->res_dir_nodeid, r->res_name); - spin_unlock(&ls->ls_remove_spin); - wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r)); - goto restart; - } - spin_unlock(&ls->ls_remove_spin); -} - -/* - * ls_remove_spin protects ls_remove_name and ls_remove_len which are - * read by other threads in wait_pending_remove. 
ls_remove_names - * and ls_remove_lens are only used by the scan thread, so they do - * not need protection. - */ - static void shrink_bucket(struct dlm_ls *ls, int b) { struct rb_node *n, *next; @@ -1697,11 +1670,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b) * list and sending the removal. Keeping this gap small is * important to keep us (the master node) from being out of sync * with the remote dir node for very long. - * - * From the time the rsb is removed from toss until just after - * send_remove, the rsb name is saved in ls_remove_name. A new - * lookup checks this to ensure that a new lookup message for the - * same resource name is not sent just before the remove message. */ for (i = 0; i < remote_count; i++) { @@ -1748,22 +1716,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b) } rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - - /* block lookup of same name until we've sent remove */ - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = len; - memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - spin_unlock(&ls->ls_rsbtbl[b].lock); - send_remove(r); - - /* allow lookup of name again */ - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = 0; - memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - wake_up(&ls->ls_remove_wait); + spin_unlock(&ls->ls_rsbtbl[b].lock); dlm_free_rsb(r); } @@ -2714,8 +2668,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) return 0; } - wait_pending_remove(r); - r->res_first_lkid = lkb->lkb_id; send_lookup(r, lkb); return 1; @@ -2864,17 +2816,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args) static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_args *args) { - int rv = -EINVAL; + int rv = -EBUSY; if (args->flags & DLM_LKF_CONVERT) { - if (lkb->lkb_flags & DLM_IFL_MSTCPY) - goto out; - - if (args->flags & DLM_LKF_QUECVT && - !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) - goto out; - - rv = -EBUSY; if (lkb->lkb_status != DLM_LKSTS_GRANTED) goto out; @@ -2884,6 +2828,14 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (is_overlap(lkb)) goto out; + + rv = -EINVAL; + if (lkb->lkb_flags & DLM_IFL_MSTCPY) + goto out; + + if (args->flags & DLM_LKF_QUECVT && + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) + goto out; } lkb->lkb_exflags = args->flags; @@ -2900,11 +2852,25 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, #endif rv = 0; out: - if (rv) - log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s", + switch (rv) { + case 0: + break; + case -EINVAL: + /* annoy the user because dlm usage is wrong */ + WARN_ON(1); + log_error(ls, "%s %d %x %x %x %d %d %s", __func__, + rv, lkb->lkb_id, lkb->lkb_flags, args->flags, + lkb->lkb_status, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + default: + log_debug(ls, "%s %d %x %x %x %d %d %s", __func__, rv, lkb->lkb_id, lkb->lkb_flags, args->flags, lkb->lkb_status, lkb->lkb_wait_type, lkb->lkb_resource->res_name); + break; + } + return rv; } @@ -2918,23 +2884,12 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - int rv = -EINVAL; + int rv = -EBUSY; - if (lkb->lkb_flags & DLM_IFL_MSTCPY) { - log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); - dlm_print_lkb(lkb); + /* normal unlock not allowed if there's any op 
in progress */ + if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) && + (lkb->lkb_wait_type || lkb->lkb_wait_count)) goto out; - } - - /* an lkb may still exist even though the lock is EOL'ed due to a - cancel, unlock or failed noqueue request; an app can't use these - locks; return same error as if the lkid had not been found at all */ - - if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { - log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); - rv = -ENOENT; - goto out; - } /* an lkb may be waiting for an rsb lookup to complete where the lookup was initiated by another lock */ @@ -2949,7 +2904,24 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) unhold_lkb(lkb); /* undoes create_lkb() */ } /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ - rv = -EBUSY; + goto out; + } + + rv = -EINVAL; + if (lkb->lkb_flags & DLM_IFL_MSTCPY) { + log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); + dlm_print_lkb(lkb); + goto out; + } + + /* an lkb may still exist even though the lock is EOL'ed due to a + * cancel, unlock or failed noqueue request; an app can't use these + * locks; return same error as if the lkid had not been found at all + */ + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); + rv = -ENOENT; goto out; } @@ -3022,14 +2994,8 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) goto out; } /* add_to_waiters() will set OVERLAP_UNLOCK */ - goto out_ok; } - /* normal unlock not allowed if there's any op in progress */ - rv = -EBUSY; - if (lkb->lkb_wait_type || lkb->lkb_wait_count) - goto out; - out_ok: /* an overlapping op shouldn't blow away exflags from other op */ lkb->lkb_exflags |= args->flags; @@ -3037,11 +3003,25 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) lkb->lkb_astparam = args->astparam; rv = 0; out: - if (rv) - log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv, + switch (rv) { + case 0: + break; + case -EINVAL: + /* annoy the user because dlm usage is wrong */ + WARN_ON(1); + log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv, + lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, + args->flags, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + default: + log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv, lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, args->flags, lkb->lkb_wait_type, lkb->lkb_resource->res_name); + break; + } + return rv; } @@ -3292,8 +3272,9 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) * request_lock(), convert_lock(), unlock_lock(), cancel_lock() */ -static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name, - int len, struct dlm_args *args) +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + const void *name, int len, + struct dlm_args *args) { struct dlm_rsb *r; int error; @@ -3392,7 +3373,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, int mode, struct dlm_lksb *lksb, uint32_t flags, - void *name, + const void *name, unsigned int namelen, uint32_t parent_lkid, void (*ast) (void *astarg), @@ -3438,7 +3419,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: - trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error); + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, true); if (convert || error) __put_lkb(ls, lkb); @@ -3521,7 +3502,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, static int _create_message(struct dlm_ls *ls, int mb_len, int to_nodeid, int mstype, struct 
dlm_message **ms_ret, - struct dlm_mhandle **mh_ret) + struct dlm_mhandle **mh_ret, + gfp_t allocation) { struct dlm_message *ms; struct dlm_mhandle *mh; @@ -3531,7 +3513,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, pass into midcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb); if (!mh) return -ENOBUFS; @@ -3553,7 +3535,8 @@ static int _create_message(struct dlm_ls *ls, int mb_len, static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret) + struct dlm_mhandle **mh_ret, + gfp_t allocation) { int mb_len = sizeof(struct dlm_message); @@ -3574,15 +3557,16 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, } return _create_message(r->res_ls, mb_len, to_nodeid, mstype, - ms_ret, mh_ret); + ms_ret, mh_ret, allocation); } /* further lowcomms enhancements or alternate implementations may make the return value from this function useful at some point */ -static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) +static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms, + const void *name, int namelen) { - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, name, namelen); return 0; } @@ -3623,7 +3607,7 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, case cpu_to_le32(DLM_MSG_REQUEST_REPLY): case cpu_to_le32(DLM_MSG_CONVERT_REPLY): case cpu_to_le32(DLM_MSG_GRANT): - if (!lkb->lkb_lvbptr) + if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK)) break; memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); break; @@ -3642,13 +3626,13 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) if (error) return error; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); if (error) goto fail; send_args(r, lkb, ms); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); if (error) goto fail; return 0; @@ -3703,7 +3687,8 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh); + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3711,7 +3696,7 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) ms->m_result = 0; - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3724,7 +3709,8 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3732,7 +3718,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) ms->m_bastmode = cpu_to_le32(mode); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3749,13 +3735,14 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) if (error) return error; - error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh, + GFP_NOFS); if (error) goto fail; send_args(r, 
lkb, ms); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); if (error) goto fail; return 0; @@ -3773,14 +3760,15 @@ static int send_remove(struct dlm_rsb *r) to_nodeid = dlm_dir_nodeid(r); - error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, + GFP_ATOMIC); if (error) goto out; memcpy(ms->m_extra, r->res_name, r->res_length); ms->m_hash = cpu_to_le32(r->res_hash); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3794,7 +3782,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); if (error) goto out; @@ -3802,7 +3790,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, ms->m_result = cpu_to_le32(to_dlm_errno(rv)); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3835,7 +3823,8 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid); - error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh); + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3843,7 +3832,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, ms->m_result = cpu_to_le32(to_dlm_errno(rv)); ms->m_nodeid = cpu_to_le32(ret_nodeid); - error = send_message(mh, ms); + error = send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in)); out: return error; } @@ -4013,66 +4002,6 @@ out: return error; } -static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) -{ - char name[DLM_RESNAME_MAXLEN + 1]; - struct dlm_message *ms; - struct dlm_mhandle *mh; - struct dlm_rsb *r; - uint32_t hash, b; - int rv, dir_nodeid; - - memset(name, 0, sizeof(name)); - memcpy(name, ms_name, len); - - hash = jhash(name, len, 0); - b = hash & (ls->ls_rsbtbl_size - 1); - - dir_nodeid = dlm_hash2nodeid(ls, hash); - - log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name); - - spin_lock(&ls->ls_rsbtbl[b].lock); - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); - if (!rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "repeat_remove on keep %s", name); - return; - } - - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (!rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "repeat_remove on toss %s", name); - return; - } - - /* use ls->remove_name2 to avoid conflict with shrink? 
*/ - - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = len; - memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - spin_unlock(&ls->ls_rsbtbl[b].lock); - - rv = _create_message(ls, sizeof(struct dlm_message) + len, - dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); - if (rv) - goto out; - - memcpy(ms->m_extra, name, len); - ms->m_hash = cpu_to_le32(hash); - - send_message(mh, ms); - -out: - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = 0; - memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - wake_up(&ls->ls_remove_wait); -} - static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; @@ -4142,25 +4071,11 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) ENOTBLK request failures when the lookup reply designating us as master is delayed. */ - /* We could repeatedly return -EBADR here if our send_remove() is - delayed in being sent/arriving/being processed on the dir node. - Another node would repeatedly lookup up the master, and the dir - node would continue returning our nodeid until our send_remove - took effect. - - We send another remove message in case our previous send_remove - was lost/ignored/missed somehow. */ - if (error != -ENOTBLK) { log_limit(ls, "receive_request %x from %d %d", le32_to_cpu(ms->m_lkid), from_nodeid, error); } - if (namelen && error == -EBADR) { - send_repeat_remove(ls, ms->m_extra, namelen); - msleep(1000); - } - setup_stub_lkb(ls, ms); send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); return error; @@ -5080,8 +4995,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) down_read(&ls->ls_recv_active); if (hd->h_cmd == DLM_MSG) dlm_receive_message(ls, &p->message, nodeid); - else + else if (hd->h_cmd == DLM_RCOM) dlm_receive_rcom(ls, &p->rcom, nodeid); + else + log_error(ls, "invalid h_cmd %d from %d lockspace %x", + hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace)); up_read(&ls->ls_recv_active); dlm_put_lockspace(ls); @@ -5801,6 +5719,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, { struct dlm_lkb *lkb; struct dlm_args args; + bool do_put = true; int error; dlm_lock_recovery(ls); @@ -5811,13 +5730,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); + if (flags & DLM_LKF_VALBLK) { ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); if (!ua->lksb.sb_lvbptr) { kfree(ua); - __put_lkb(ls, lkb); error = -ENOMEM; - goto out; + goto out_put; } } #ifdef CONFIG_DLM_DEPRECATED_API @@ -5831,8 +5751,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; kfree(ua); - __put_lkb(ls, lkb); - goto out; + goto out_put; } /* After ua is attached to lkb it will be freed by dlm_free_lkb(). 
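The dlm_user_request() rework here, continued in the next hunk, funnels every exit path through a single out_put label guarded by a do_put flag, so the new trace_dlm_lock_end() tracepoint fires exactly once whether the request fails early or hands its lkb reference to the per-process lock list. A minimal, compilable user-space sketch of that cleanup idiom follows; all names (create_lkb, trace_start, trace_end, put_lkb, do_request) are illustrative stand-ins, not the kernel's functions:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct lkb { int id; };

/* hypothetical stand-ins for create_lkb(), trace_dlm_lock_start/end()
 * and __put_lkb() */
static int create_lkb(struct lkb **out)
{
	*out = calloc(1, sizeof(**out));
	return *out ? 0 : -ENOMEM;
}
static void trace_start(struct lkb *lkb) { printf("start %d\n", lkb->id); }
static void trace_end(struct lkb *lkb, int error) { printf("end %d error %d\n", lkb->id, error); }
static void put_lkb(struct lkb *lkb) { free(lkb); }
static int do_request(struct lkb *lkb) { (void)lkb; return 0; }

static int request_with_trace(void)
{
	struct lkb *lkb;
	bool do_put = true;
	int error;

	error = create_lkb(&lkb);
	if (error)
		goto out;	/* nothing traced yet, nothing to put */

	trace_start(lkb);

	error = do_request(lkb);
	if (error)
		goto out_put;

	/* success: a longer-lived list takes over the reference */
	do_put = false;
out_put:
	trace_end(lkb, error);	/* fires on every path after trace_start() */
	if (do_put)
		put_lkb(lkb);
out:
	return error;
}

int main(void) { return request_with_trace(); }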
@@ -5851,8 +5770,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, error = 0; fallthrough; default: - __put_lkb(ls, lkb); - goto out; + goto out_put; } /* add this new lkb to the per-process list of locks */ @@ -5860,6 +5778,11 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, hold_lkb(lkb); list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); spin_unlock(&ua->proc->locks_spin); + do_put = false; + out_put: + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false); + if (do_put) + __put_lkb(ls, lkb); out: dlm_unlock_recovery(ls); return error; @@ -5885,6 +5808,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_lock_start(ls, lkb, NULL, 0, mode, flags); + /* user can change the params on its lock when it converts it, or add an lvb that didn't exist before */ @@ -5922,6 +5847,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK) error = 0; out_put: + trace_dlm_lock_end(ls, lkb, NULL, 0, mode, flags, error, false); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6014,6 +5940,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; if (lvb_in && ua->lksb.sb_lvbptr) @@ -6042,6 +5970,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6063,6 +5992,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; if (ua_tmp->castparam) ua->castparam = ua_tmp->castparam; @@ -6080,6 +6011,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error == -EBUSY) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6101,6 +6033,8 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; error = set_unlock_args(flags, ua, &args); @@ -6129,6 +6063,7 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) if (error == -EBUSY) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6184,7 +6119,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, { struct dlm_lkb *lkb = NULL; - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); if (list_empty(&proc->locks)) goto out; @@ -6196,7 +6131,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, else lkb->lkb_flags |= DLM_IFL_DEAD; out: - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); return lkb; } @@ -6233,7 +6168,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); /* in-progress unlocks */ list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { @@ -6243,13 +6178,12 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - memset(&lkb->lkb_callbacks, 0, - sizeof(struct dlm_callback) * 
DLM_CALLBACKS_SIZE); + dlm_purge_lkb_callbacks(lkb); list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); dlm_unlock_recovery(ls); } @@ -6285,8 +6219,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) spin_lock(&proc->asts_spin); list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - memset(&lkb->lkb_callbacks, 0, - sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + dlm_purge_lkb_callbacks(lkb); list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } @@ -6317,13 +6250,13 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid) int error; error = _create_message(ls, sizeof(struct dlm_message), nodeid, - DLM_MSG_PURGE, &ms, &mh); + DLM_MSG_PURGE, &ms, &mh, GFP_NOFS); if (error) return error; ms->m_nodeid = cpu_to_le32(nodeid); ms->m_pid = cpu_to_le32(pid); - return send_message(mh, ms); + return send_message(mh, ms, NULL, 0); } int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index a7b6474f009d..40c76b5544da 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -36,7 +36,7 @@ static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); -int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, +int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret); void dlm_recover_purge(struct dlm_ls *ls); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 3972f4d86c75..d0b4e2181a5f 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -17,7 +17,6 @@ #include "recoverd.h" #include "dir.h" #include "midcomms.h" -#include "lowcomms.h" #include "config.h" #include "memory.h" #include "lock.h" @@ -391,7 +390,7 @@ static int threads_start(void) /* Thread for sending/receiving messages for all lockspace's */ error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm lowcomms %d", error); + log_print("cannot start dlm midcomms %d", error); goto scand_fail; } @@ -416,7 +415,7 @@ static int new_lockspace(const char *name, const char *cluster, if (namelen > DLM_LOCKSPACE_LEN || namelen == 0) return -EINVAL; - if (!lvblen || (lvblen % 8)) + if (lvblen % 8) return -EINVAL; if (!try_module_get(THIS_MODULE)) @@ -473,7 +472,7 @@ static int new_lockspace(const char *name, const char *cluster, error = -ENOMEM; - ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); + ls = kzalloc(sizeof(*ls), GFP_NOFS); if (!ls) goto out; memcpy(ls->ls_name, name, namelen); @@ -524,9 +523,6 @@ static int new_lockspace(const char *name, const char *cluster, spin_lock_init(&ls->ls_rsbtbl[i].lock); } - spin_lock_init(&ls->ls_remove_spin); - init_waitqueue_head(&ls->ls_remove_wait); - for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, GFP_KERNEL); @@ -567,7 +563,7 @@ static int new_lockspace(const char *name, const char *cluster, init_completion(&ls->ls_recovery_done); ls->ls_recovery_result = -1; - mutex_init(&ls->ls_cb_mutex); + spin_lock_init(&ls->ls_cb_lock); INIT_LIST_HEAD(&ls->ls_cb_delay); ls->ls_recoverd_task = NULL; @@ -584,7 +580,7 @@ static int new_lockspace(const char *name, const char *cluster, atomic_set(&ls->ls_requestqueue_cnt, 0); init_waitqueue_head(&ls->ls_requestqueue_wait); mutex_init(&ls->ls_requestqueue_mutex); - mutex_init(&ls->ls_clear_proc_locks); + 
spin_lock_init(&ls->ls_clear_proc_locks);

 	/* Due backwards compatibility with 3.1 we need to use maximum
 	 * possible dlm message size to be sure the message will fit and
@@ -703,10 +699,11 @@ static int new_lockspace(const char *name, const char *cluster,
 	return error;
 }

-int dlm_new_lockspace(const char *name, const char *cluster,
-		      uint32_t flags, int lvblen,
-		      const struct dlm_lockspace_ops *ops, void *ops_arg,
-		      int *ops_result, dlm_lockspace_t **lockspace)
+static int __dlm_new_lockspace(const char *name, const char *cluster,
+			       uint32_t flags, int lvblen,
+			       const struct dlm_lockspace_ops *ops,
+			       void *ops_arg, int *ops_result,
+			       dlm_lockspace_t **lockspace)
 {
 	int error = 0;
@@ -725,13 +722,32 @@ int dlm_new_lockspace(const char *name, const char *cluster,
 		if (!ls_count) {
 			dlm_scand_stop();
 			dlm_midcomms_shutdown();
-			dlm_lowcomms_stop();
+			dlm_midcomms_stop();
 		}
  out:
 	mutex_unlock(&ls_lock);
 	return error;
 }

+int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags,
+		      int lvblen, const struct dlm_lockspace_ops *ops,
+		      void *ops_arg, int *ops_result,
+		      dlm_lockspace_t **lockspace)
+{
+	return __dlm_new_lockspace(name, cluster, flags | DLM_LSFL_FS, lvblen,
+				   ops, ops_arg, ops_result, lockspace);
+}
+
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+			   uint32_t flags, int lvblen,
+			   const struct dlm_lockspace_ops *ops,
+			   void *ops_arg, int *ops_result,
+			   dlm_lockspace_t **lockspace)
+{
+	return __dlm_new_lockspace(name, cluster, flags, lvblen, ops,
+				   ops_arg, ops_result, lockspace);
+}
+
 static int lkb_idr_is_local(int id, void *p, void *data)
 {
 	struct dlm_lkb *lkb = p;
@@ -909,7 +925,7 @@ int dlm_release_lockspace(void *lockspace, int force)
 	if (!error)
 		ls_count--;
 	if (!ls_count)
-		dlm_lowcomms_stop();
+		dlm_midcomms_stop();
 	mutex_unlock(&ls_lock);

 	return error;
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 306fc4f4ea15..03f4a4a3a871 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -12,6 +12,14 @@
 #ifndef __LOCKSPACE_DOT_H__
 #define __LOCKSPACE_DOT_H__

+/* DLM_LSFL_FS
+ * The lockspace user is in the kernel (i.e. filesystem). Enables
+ * direct bast/cast callbacks.
+ *
+ * internal lockspace flag - will be removed in future
+ */
+#define DLM_LSFL_FS	0x00000004
+
 int dlm_lockspace_init(void);
 void dlm_lockspace_exit(void);
 struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
@@ -20,6 +28,11 @@ struct dlm_ls *dlm_find_lockspace_device(int minor);
 void dlm_put_lockspace(struct dlm_ls *ls);
 void dlm_stop_lockspaces(void);
 void dlm_stop_lockspaces_check(void);
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+			   uint32_t flags, int lvblen,
+			   const struct dlm_lockspace_ops *ops,
+			   void *ops_arg, int *ops_result,
+			   dlm_lockspace_t **lockspace);

 #endif				/* __LOCKSPACE_DOT_H__ */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a4e84e8d94c8..4450721ec83c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,41 +63,49 @@
 #define NEEDED_RMEM (4*1024*1024)

-/* Number of messages to send before rescheduling */
-#define MAX_SEND_MSG_COUNT 25
-#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
-
 struct connection {
 	struct socket *sock;	/* NULL if not connected */
 	uint32_t nodeid;	/* So we know who we are in the list */
-	struct mutex sock_mutex;
+	/* this semaphore is used to allow parallel recv/send in read
+	 * lock mode. When we release a sock we need to hold the write lock.
+	 *
+	 * However, this is locking code and not nice. When we remove the
+	 * othercon handling we can look into another mechanism to synchronize
+	 * io handling and call sock_release() at the right time.
+	 */
+	struct rw_semaphore sock_lock;
 	unsigned long flags;
-#define CF_READ_PENDING 1
-#define CF_WRITE_PENDING 2
-#define CF_INIT_PENDING 4
+#define CF_APP_LIMITED 0
+#define CF_RECV_PENDING 1
+#define CF_SEND_PENDING 2
+#define CF_RECV_INTR 3
+#define CF_IO_STOP 4
 #define CF_IS_OTHERCON 5
-#define CF_CLOSE 6
-#define CF_APP_LIMITED 7
-#define CF_CLOSING 8
-#define CF_SHUTDOWN 9
-#define CF_CONNECTED 10
-#define CF_RECONNECT 11
-#define CF_DELAY_CONNECT 12
-#define CF_EOF 13
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
-	atomic_t writequeue_cnt;
 	int retries;
-#define MAX_CONNECT_RETRIES 3
 	struct hlist_node list;
+	/* due to some connect()/accept() races we can currently end up with
+	 * a second, cross-over connection attempt for one node.
+	 *
+	 * There is a way to avoid this race by introducing a connect
+	 * rule, e.g. only our_nodeid > nodeid_to_connect is allowed to
+	 * connect. The other side can still connect, but that is then only
+	 * taken as a signal that the other side wants a reconnect.
+	 *
+	 * However, changing to this behaviour would break backwards
+	 * compatibility. In a DLM protocol major version upgrade we should
+	 * remove this!
+	 */
 	struct connection *othercon;
-	struct connection *sendcon;
-	struct work_struct rwork; /* Receive workqueue */
-	struct work_struct swork; /* Send workqueue */
-	wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
-	unsigned char *rx_buf;
-	int rx_buflen;
+	struct work_struct rwork; /* receive worker */
+	struct work_struct swork; /* send worker */
+	unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE];
 	int rx_leftover;
+	int mark;
+	int addr_count;
+	int curr_addr_index;
+	struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT];
+	spinlock_t addrs_lock;
 	struct rcu_head rcu;
 };
 #define sock2con(x) ((struct connection *)(x)->sk_user_data)
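The sock_lock comment above describes the scheme that replaces the old per-connection mutex: the receive and send workers take the semaphore in read mode so they can run in parallel, and only the path that releases the socket must take it in write mode. A small, compilable user-space sketch of that read/write idiom using a pthreads rwlock (struct conn and the helpers are illustrative, not the kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
	pthread_rwlock_t sock_lock;	/* read: parallel send/recv; write: socket release */
	int *sock;			/* stands in for struct socket *sock */
};

/* send/recv workers: shared (read) mode, may run concurrently */
static void io_worker(struct conn *c)
{
	pthread_rwlock_rdlock(&c->sock_lock);
	if (c->sock)
		printf("io on socket %d\n", *c->sock);
	pthread_rwlock_unlock(&c->sock_lock);
}

/* releasing the socket: exclusive (write) mode, waits for all readers */
static void release_sock_sketch(struct conn *c)
{
	pthread_rwlock_wrlock(&c->sock_lock);
	free(c->sock);
	c->sock = NULL;
	pthread_rwlock_unlock(&c->sock_lock);
}

int main(void)
{
	struct conn c = { .sock = malloc(sizeof(int)) };

	pthread_rwlock_init(&c.sock_lock, NULL);
	if (c.sock)
		*c.sock = 42;
	io_worker(&c);
	release_sock_sketch(&c);
	io_worker(&c);		/* sees NULL sock and skips the io */
	return 0;
}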
@@ -136,13 +144,12 @@ struct dlm_msg {
 	struct kref ref;
 };

-struct dlm_node_addr {
-	struct list_head list;
+struct processqueue_entry {
+	unsigned char *buf;
 	int nodeid;
-	int mark;
-	int addr_count;
-	int curr_addr_index;
-	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+	int buflen;
+
+	struct list_head list;
 };

 struct dlm_proto_ops {
@@ -157,10 +164,6 @@
 	int (*listen_validate)(void);
 	void (*listen_sockopts)(struct socket *sock);
 	int (*listen_bind)(struct socket *sock);
-	/* What to do to shutdown */
-	void (*shutdown_action)(struct connection *con);
-	/* What to do to eof check */
-	bool (*eof_condition)(struct connection *con);
 };

 static struct listen_sock_callbacks {
@@ -170,17 +173,13 @@
 	void (*sk_write_space)(struct sock *);
 } listen_sock;

-static LIST_HEAD(dlm_node_addrs);
-static DEFINE_SPINLOCK(dlm_node_addrs_spin);
-
 static struct listen_connection listen_con;
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];
 static int dlm_local_count;
-int dlm_allow_conn;

 /* Work queues */
-static struct workqueue_struct *recv_workqueue;
-static struct workqueue_struct *send_workqueue;
+static struct workqueue_struct *io_workqueue;
+static struct workqueue_struct *process_workqueue;

 static struct hlist_head connection_hash[CONN_HASH_SIZE];
 static DEFINE_SPINLOCK(connections_lock);
@@ -188,8 +187,45 @@ DEFINE_STATIC_SRCU(connections_srcu);

 static const struct dlm_proto_ops
*dlm_proto_ops; +#define DLM_IO_SUCCESS 0 +#define DLM_IO_END 1 +#define DLM_IO_EOF 2 +#define DLM_IO_RESCHED 3 + static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +static void process_dlm_messages(struct work_struct *work); + +static DECLARE_WORK(process_work, process_dlm_messages); +static DEFINE_SPINLOCK(processqueue_lock); +static bool process_dlm_messages_pending; +static LIST_HEAD(processqueue); + +bool dlm_lowcomms_is_running(void) +{ + return !!listen_con.sock; +} + +static void lowcomms_queue_swork(struct connection *con) +{ + assert_spin_locked(&con->writequeue_lock); + + if (!test_bit(CF_IO_STOP, &con->flags) && + !test_bit(CF_APP_LIMITED, &con->flags) && + !test_and_set_bit(CF_SEND_PENDING, &con->flags)) + queue_work(io_workqueue, &con->swork); +} + +static void lowcomms_queue_rwork(struct connection *con) +{ +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk)); +#endif + + if (!test_bit(CF_IO_STOP, &con->flags) && + !test_and_set_bit(CF_RECV_PENDING, &con->flags)) + queue_work(io_workqueue, &con->rwork); +} static void writequeue_entry_ctor(void *data) { @@ -214,15 +250,12 @@ static struct writequeue_entry *con_next_wq(struct connection *con) { struct writequeue_entry *e; - if (list_empty(&con->writequeue)) - return NULL; - - e = list_first_entry(&con->writequeue, struct writequeue_entry, - list); + e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry, + list); /* if len is zero nothing is to send, if there are users filling * buffers we wait until the users are done so we can send more. */ - if (e->users || e->len == 0) + if (!e || e->users || e->len == 0) return NULL; return e; @@ -240,28 +273,15 @@ static struct connection *__find_con(int nodeid, int r) return NULL; } -static bool tcp_eof_condition(struct connection *con) -{ - return atomic_read(&con->writequeue_cnt); -} - -static int dlm_con_init(struct connection *con, int nodeid) +static void dlm_con_init(struct connection *con, int nodeid) { - con->rx_buflen = dlm_config.ci_buffer_size; - con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); - if (!con->rx_buf) - return -ENOMEM; - con->nodeid = nodeid; - mutex_init(&con->sock_mutex); + init_rwsem(&con->sock_lock); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); - atomic_set(&con->writequeue_cnt, 0); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); - init_waitqueue_head(&con->shutdown_wait); - - return 0; + spin_lock_init(&con->addrs_lock); } /* @@ -271,7 +291,7 @@ static int dlm_con_init(struct connection *con, int nodeid) static struct connection *nodeid2con(int nodeid, gfp_t alloc) { struct connection *con, *tmp; - int r, ret; + int r; r = nodeid_hash(nodeid); con = __find_con(nodeid, r); @@ -282,11 +302,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) if (!con) return NULL; - ret = dlm_con_init(con, nodeid); - if (ret) { - kfree(con); - return NULL; - } + dlm_con_init(con, nodeid); spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can @@ -298,7 +314,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); - kfree(con->rx_buf); kfree(con); return tmp; } @@ -309,29 +324,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return con; } -/* Loop round all connections */ -static void foreach_conn(void (*conn_func)(struct connection *c)) -{ - int 
i; - struct connection *con; - - for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(con, &connection_hash[i], list) - conn_func(con); - } -} - -static struct dlm_node_addr *find_node_addr(int nodeid) -{ - struct dlm_node_addr *na; - - list_for_each_entry(na, &dlm_node_addrs, list) { - if (na->nodeid == nodeid) - return na; - } - return NULL; -} - static int addr_compare(const struct sockaddr_storage *x, const struct sockaddr_storage *y) { @@ -365,40 +357,47 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, unsigned int *mark) { struct sockaddr_storage sas; - struct dlm_node_addr *na; + struct connection *con; + int idx; if (!dlm_local_count) return -1; - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (na && na->addr_count) { - memcpy(&sas, na->addr[na->curr_addr_index], - sizeof(struct sockaddr_storage)); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } - if (try_new_addr) { - na->curr_addr_index++; - if (na->curr_addr_index == na->addr_count) - na->curr_addr_index = 0; - } + spin_lock(&con->addrs_lock); + if (!con->addr_count) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; } - spin_unlock(&dlm_node_addrs_spin); - if (!na) - return -EEXIST; + memcpy(&sas, &con->addr[con->curr_addr_index], + sizeof(struct sockaddr_storage)); - if (!na->addr_count) - return -ENOENT; + if (try_new_addr) { + con->curr_addr_index++; + if (con->curr_addr_index == con->addr_count) + con->curr_addr_index = 0; + } - *mark = na->mark; + *mark = con->mark; + spin_unlock(&con->addrs_lock); if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); - if (!sa_out) + if (!sa_out) { + srcu_read_unlock(&connections_srcu, idx); return 0; + } - if (dlm_local_addr[0]->ss_family == AF_INET) { + if (dlm_local_addr[0].ss_family == AF_INET) { struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; @@ -408,43 +407,46 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, ret6->sin6_addr = in6->sin6_addr; } + srcu_read_unlock(&connections_srcu, idx); return 0; } static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, unsigned int *mark) { - struct dlm_node_addr *na; - int rv = -EEXIST; - int addr_i; - - spin_lock(&dlm_node_addrs_spin); - list_for_each_entry(na, &dlm_node_addrs, list) { - if (!na->addr_count) - continue; - - for (addr_i = 0; addr_i < na->addr_count; addr_i++) { - if (addr_compare(na->addr[addr_i], addr)) { - *nodeid = na->nodeid; - *mark = na->mark; - rv = 0; - goto unlock; + struct connection *con; + int i, idx, addr_i; + + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + WARN_ON_ONCE(!con->addr_count); + + spin_lock(&con->addrs_lock); + for (addr_i = 0; addr_i < con->addr_count; addr_i++) { + if (addr_compare(&con->addr[addr_i], addr)) { + *nodeid = con->nodeid; + *mark = con->mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return 0; + } } + spin_unlock(&con->addrs_lock); } } -unlock: - spin_unlock(&dlm_node_addrs_spin); - return rv; + srcu_read_unlock(&connections_srcu, idx); + + return -ENOENT; } -/* caller need to held dlm_node_addrs_spin lock */ -static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na, - const 
struct sockaddr_storage *addr)
+static bool dlm_lowcomms_con_has_addr(const struct connection *con,
+				      const struct sockaddr_storage *addr)
 {
 	int i;

-	for (i = 0; i < na->addr_count; i++) {
-		if (addr_compare(na->addr[i], addr))
+	for (i = 0; i < con->addr_count; i++) {
+		if (addr_compare(&con->addr[i], addr))
 			return true;
 	}

@@ -453,118 +455,82 @@ static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,

 int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
 {
-	struct sockaddr_storage *new_addr;
-	struct dlm_node_addr *new_node, *na;
-	bool ret;
-
-	new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
-	if (!new_node)
-		return -ENOMEM;
+	struct connection *con;
+	bool ret;
+	int idx;

-	new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
-	if (!new_addr) {
-		kfree(new_node);
+	idx = srcu_read_lock(&connections_srcu);
+	con = nodeid2con(nodeid, GFP_NOFS);
+	if (!con) {
+		srcu_read_unlock(&connections_srcu, idx);
 		return -ENOMEM;
 	}

-	memcpy(new_addr, addr, len);
-
-	spin_lock(&dlm_node_addrs_spin);
-	na = find_node_addr(nodeid);
-	if (!na) {
-		new_node->nodeid = nodeid;
-		new_node->addr[0] = new_addr;
-		new_node->addr_count = 1;
-		new_node->mark = dlm_config.ci_mark;
-		list_add(&new_node->list, &dlm_node_addrs);
-		spin_unlock(&dlm_node_addrs_spin);
+	spin_lock(&con->addrs_lock);
+	if (!con->addr_count) {
+		memcpy(&con->addr[0], addr, sizeof(*addr));
+		con->addr_count = 1;
+		con->mark = dlm_config.ci_mark;
+		spin_unlock(&con->addrs_lock);
+		srcu_read_unlock(&connections_srcu, idx);
 		return 0;
 	}

-	ret = dlm_lowcomms_na_has_addr(na, addr);
+	ret = dlm_lowcomms_con_has_addr(con, addr);
 	if (ret) {
-		spin_unlock(&dlm_node_addrs_spin);
-		kfree(new_addr);
-		kfree(new_node);
+		spin_unlock(&con->addrs_lock);
+		srcu_read_unlock(&connections_srcu, idx);
 		return -EEXIST;
 	}

-	if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
-		spin_unlock(&dlm_node_addrs_spin);
-		kfree(new_addr);
-		kfree(new_node);
+	if (con->addr_count >= DLM_MAX_ADDR_COUNT) {
+		spin_unlock(&con->addrs_lock);
+		srcu_read_unlock(&connections_srcu, idx);
 		return -ENOSPC;
 	}

-	na->addr[na->addr_count++] = new_addr;
-	spin_unlock(&dlm_node_addrs_spin);
-	kfree(new_node);
+	memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr));
+	srcu_read_unlock(&connections_srcu, idx);
+	spin_unlock(&con->addrs_lock);
 	return 0;
 }

 /* Data available on socket or listen socket received a connect */
 static void lowcomms_data_ready(struct sock *sk)
 {
-	struct connection *con;
-
-	con = sock2con(sk);
-	if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
-		queue_work(recv_workqueue, &con->rwork);
-}
-
-static void lowcomms_listen_data_ready(struct sock *sk)
-{
-	if (!dlm_allow_conn)
-		return;
+	struct connection *con = sock2con(sk);

-	queue_work(recv_workqueue, &listen_con.rwork);
+	set_bit(CF_RECV_INTR, &con->flags);
+	lowcomms_queue_rwork(con);
 }

 static void lowcomms_write_space(struct sock *sk)
 {
-	struct connection *con;
-
-	con = sock2con(sk);
-	if (!con)
-		return;
-
-	if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
-		log_print("connected to node %d", con->nodeid);
-		queue_work(send_workqueue, &con->swork);
-		return;
-	}
+	struct connection *con = sock2con(sk);

 	clear_bit(SOCK_NOSPACE, &con->sock->flags);

+	spin_lock_bh(&con->writequeue_lock);
 	if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
 		con->sock->sk->sk_write_pending--;
 		clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
 	}

-	queue_work(send_workqueue, &con->swork);
-}
-
-static inline void lowcomms_connect_sock(struct connection *con)
-{
-	if
(test_bit(CF_CLOSE, &con->flags)) - return; - queue_work(send_workqueue, &con->swork); - cond_resched(); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); } static void lowcomms_state_change(struct sock *sk) { /* SCTP layer is not calling sk_data_ready when the connection - * is done, so we catch the signal through here. Also, it - * doesn't switch socket state when entering shutdown, so we - * skip the write in that case. + * is done, so we catch the signal through here. */ - if (sk->sk_shutdown) { - if (sk->sk_shutdown == RCV_SHUTDOWN) - lowcomms_data_ready(sk); - } else if (sk->sk_state == TCP_ESTABLISHED) { - lowcomms_write_space(sk); - } + if (sk->sk_shutdown == RCV_SHUTDOWN) + lowcomms_data_ready(sk); +} + +static void lowcomms_listen_data_ready(struct sock *sk) +{ + queue_work(io_workqueue, &listen_con.rwork); } int dlm_lowcomms_connect_node(int nodeid) @@ -576,47 +542,49 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, GFP_NOFS); - if (!con) { + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); - return -ENOMEM; + return -ENOENT; } - lowcomms_connect_sock(con); + down_read(&con->sock_lock); + if (!con->sock) { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + up_read(&con->sock_lock); srcu_read_unlock(&connections_srcu, idx); + cond_resched(); return 0; } int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) { - struct dlm_node_addr *na; + struct connection *con; + int idx; - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (!na) { - spin_unlock(&dlm_node_addrs_spin); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } - na->mark = mark; - spin_unlock(&dlm_node_addrs_spin); - + spin_lock(&con->addrs_lock); + con->mark = mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return 0; } static void lowcomms_error_report(struct sock *sk) { - struct connection *con; - void (*orig_report)(struct sock *) = NULL; + struct connection *con = sock2con(sk); struct inet_sock *inet; - con = sock2con(sk); - if (con == NULL) - goto out; - - orig_report = listen_sock.sk_error_report; - inet = inet_sk(sk); switch (sk->sk_family) { case AF_INET: @@ -642,66 +610,25 @@ static void lowcomms_error_report(struct sock *sk) "invalid socket family %d set, " "sk_err=%d/%d\n", dlm_our_nodeid(), sk->sk_family, sk->sk_err, sk->sk_err_soft); - goto out; - } - - /* below sendcon only handling */ - if (test_bit(CF_IS_OTHERCON, &con->flags)) - con = con->sendcon; - - switch (sk->sk_err) { - case ECONNREFUSED: - set_bit(CF_DELAY_CONNECT, &con->flags); - break; - default: break; } - if (!test_and_set_bit(CF_RECONNECT, &con->flags)) - queue_work(send_workqueue, &con->swork); + dlm_midcomms_unack_msg_resend(con->nodeid); -out: - if (orig_report) - orig_report(sk); + listen_sock.sk_error_report(sk); } -/* Note: sk_callback_lock must be locked before calling this function. 
*/ -static void save_listen_callbacks(struct socket *sock) +static void restore_callbacks(struct sock *sk) { - struct sock *sk = sock->sk; - - listen_sock.sk_data_ready = sk->sk_data_ready; - listen_sock.sk_state_change = sk->sk_state_change; - listen_sock.sk_write_space = sk->sk_write_space; - listen_sock.sk_error_report = sk->sk_error_report; -} - -static void restore_callbacks(struct socket *sock) -{ - struct sock *sk = sock->sk; +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(!lockdep_sock_is_held(sk)); +#endif - lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = listen_sock.sk_state_change; sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; - release_sock(sk); -} - -static void add_listen_sock(struct socket *sock, struct listen_connection *con) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - save_listen_callbacks(sock); - con->sock = sock; - - sk->sk_user_data = con; - sk->sk_allocation = GFP_NOFS; - /* Install a data_ready callback */ - sk->sk_data_ready = lowcomms_listen_data_ready; - release_sock(sk); } /* Make a socket active */ @@ -713,11 +640,12 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock = sock; sk->sk_user_data = con; - /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; - sk->sk_state_change = lowcomms_state_change; + if (dlm_config.ci_protocol == DLM_PROTO_SCTP) + sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; + sk->sk_use_task_frag = false; sk->sk_error_report = lowcomms_error_report; release_sock(sk); } @@ -727,7 +655,7 @@ static void add_sock(struct socket *sock, struct connection *con) static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, int *addr_len) { - saddr->ss_family = dlm_local_addr[0]->ss_family; + saddr->ss_family = dlm_local_addr[0].ss_family; if (saddr->ss_family == AF_INET) { struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; in4_addr->sin_port = cpu_to_be16(port); @@ -773,43 +701,67 @@ static void free_entry(struct writequeue_entry *e) } list_del(&e->list); - atomic_dec(&e->con->writequeue_cnt); kref_put(&e->ref, dlm_page_release); } static void dlm_close_sock(struct socket **sock) { - if (*sock) { - restore_callbacks(*sock); - sock_release(*sock); - *sock = NULL; + lock_sock((*sock)->sk); + restore_callbacks((*sock)->sk); + release_sock((*sock)->sk); + + sock_release(*sock); + *sock = NULL; +} + +static void allow_connection_io(struct connection *con) +{ + if (con->othercon) + clear_bit(CF_IO_STOP, &con->othercon->flags); + clear_bit(CF_IO_STOP, &con->flags); +} + +static void stop_connection_io(struct connection *con) +{ + if (con->othercon) + stop_connection_io(con->othercon); + + down_write(&con->sock_lock); + if (con->sock) { + lock_sock(con->sock->sk); + restore_callbacks(con->sock->sk); + + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + } else { + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); } + up_write(&con->sock_lock); + + cancel_work_sync(&con->swork); + cancel_work_sync(&con->rwork); } /* Close a remote connection and tidy up */ -static void close_connection(struct connection *con, bool and_other, - bool tx, bool rx) +static void close_connection(struct connection *con, bool and_other) { - bool closing = 
test_and_set_bit(CF_CLOSING, &con->flags); struct writequeue_entry *e; - if (tx && !closing && cancel_work_sync(&con->swork)) { - log_print("canceled swork for node %d", con->nodeid); - clear_bit(CF_WRITE_PENDING, &con->flags); - } - if (rx && !closing && cancel_work_sync(&con->rwork)) { - log_print("canceled rwork for node %d", con->nodeid); - clear_bit(CF_READ_PENDING, &con->flags); + if (con->othercon && and_other) + close_connection(con->othercon, false); + + down_write(&con->sock_lock); + if (!con->sock) { + up_write(&con->sock_lock); + return; } - mutex_lock(&con->sock_mutex); dlm_close_sock(&con->sock); - if (con->othercon && and_other) { - /* Will only re-enter once. */ - close_connection(con->othercon, false, tx, rx); - } - /* if we send a writequeue entry only a half way, we drop the * whole entry because reconnection and that we not start of the * middle of a msg which will confuse the other end. @@ -821,200 +773,209 @@ static void close_connection(struct connection *con, bool and_other, * our policy is to start on a clean state when disconnects, we don't * know what's send/received on transport layer in this case. */ - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_first_entry(&con->writequeue, struct writequeue_entry, list); if (e->dirty) free_entry(e); } - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); con->rx_leftover = 0; con->retries = 0; clear_bit(CF_APP_LIMITED, &con->flags); - clear_bit(CF_CONNECTED, &con->flags); - clear_bit(CF_DELAY_CONNECT, &con->flags); - clear_bit(CF_RECONNECT, &con->flags); - clear_bit(CF_EOF, &con->flags); - mutex_unlock(&con->sock_mutex); - clear_bit(CF_CLOSING, &con->flags); + clear_bit(CF_RECV_PENDING, &con->flags); + clear_bit(CF_SEND_PENDING, &con->flags); + up_write(&con->sock_lock); } -static void shutdown_connection(struct connection *con) +static struct processqueue_entry *new_processqueue_entry(int nodeid, + int buflen) { - int ret; - - flush_work(&con->swork); + struct processqueue_entry *pentry; - mutex_lock(&con->sock_mutex); - /* nothing to shutdown */ - if (!con->sock) { - mutex_unlock(&con->sock_mutex); - return; - } + pentry = kmalloc(sizeof(*pentry), GFP_NOFS); + if (!pentry) + return NULL; - set_bit(CF_SHUTDOWN, &con->flags); - ret = kernel_sock_shutdown(con->sock, SHUT_WR); - mutex_unlock(&con->sock_mutex); - if (ret) { - log_print("Connection %p failed to shutdown: %d will force close", - con, ret); - goto force_close; - } else { - ret = wait_event_timeout(con->shutdown_wait, - !test_bit(CF_SHUTDOWN, &con->flags), - DLM_SHUTDOWN_WAIT_TIMEOUT); - if (ret == 0) { - log_print("Connection %p shutdown timed out, will force close", - con); - goto force_close; - } + pentry->buf = kmalloc(buflen, GFP_NOFS); + if (!pentry->buf) { + kfree(pentry); + return NULL; } - return; + pentry->nodeid = nodeid; + return pentry; +} -force_close: - clear_bit(CF_SHUTDOWN, &con->flags); - close_connection(con, false, true, true); +static void free_processqueue_entry(struct processqueue_entry *pentry) +{ + kfree(pentry->buf); + kfree(pentry); } -static void dlm_tcp_shutdown(struct connection *con) +struct dlm_processed_nodes { + int nodeid; + + struct list_head list; +}; + +static void add_processed_node(int nodeid, struct list_head *processed_nodes) { - if (con->othercon) - shutdown_connection(con->othercon); - shutdown_connection(con); + struct dlm_processed_nodes *n; + + list_for_each_entry(n, processed_nodes, list) { + /* we already remembered 
this node */
+		if (n->nodeid == nodeid)
+			return;
+	}
+
+	/* if it fails, in the worst case we simply don't send an ack back.
+	 * We try again next time.
+	 */
+	n = kmalloc(sizeof(*n), GFP_NOFS);
+	if (!n)
+		return;
+
+	n->nodeid = nodeid;
+	list_add(&n->list, processed_nodes);
 }
 
-static int con_realloc_receive_buf(struct connection *con, int newlen)
+static void process_dlm_messages(struct work_struct *work)
 {
-	unsigned char *newbuf;
+	struct dlm_processed_nodes *n, *n_tmp;
+	struct processqueue_entry *pentry;
+	LIST_HEAD(processed_nodes);
 
-	newbuf = kmalloc(newlen, GFP_NOFS);
-	if (!newbuf)
-		return -ENOMEM;
+	spin_lock(&processqueue_lock);
+	pentry = list_first_entry_or_null(&processqueue,
+					  struct processqueue_entry, list);
+	if (WARN_ON_ONCE(!pentry)) {
+		spin_unlock(&processqueue_lock);
+		return;
+	}
+
+	list_del(&pentry->list);
+	spin_unlock(&processqueue_lock);
 
-	/* copy any leftover from last receive */
-	if (con->rx_leftover)
-		memmove(newbuf, con->rx_buf, con->rx_leftover);
+	for (;;) {
+		dlm_process_incoming_buffer(pentry->nodeid, pentry->buf,
+					    pentry->buflen);
+		add_processed_node(pentry->nodeid, &processed_nodes);
+		free_processqueue_entry(pentry);
+
+		spin_lock(&processqueue_lock);
+		pentry = list_first_entry_or_null(&processqueue,
+						  struct processqueue_entry, list);
+		if (!pentry) {
+			process_dlm_messages_pending = false;
+			spin_unlock(&processqueue_lock);
+			break;
+		}
 
-	/* swap to new buffer space */
-	kfree(con->rx_buf);
-	con->rx_buflen = newlen;
-	con->rx_buf = newbuf;
+		list_del(&pentry->list);
+		spin_unlock(&processqueue_lock);
+	}
 
-	return 0;
+	/* send ack back after we processed a couple of messages */
+	list_for_each_entry_safe(n, n_tmp, &processed_nodes, list) {
+		list_del(&n->list);
+		dlm_midcomms_receive_done(n->nodeid);
+		kfree(n);
+	}
 }
 
 /* Data received from remote end */
-static int receive_from_sock(struct connection *con)
+static int receive_from_sock(struct connection *con, int buflen)
 {
+	struct processqueue_entry *pentry;
+	int ret, buflen_real;
 	struct msghdr msg;
 	struct kvec iov;
-	int ret, buflen;
-
-	mutex_lock(&con->sock_mutex);
-	if (con->sock == NULL) {
-		ret = -EAGAIN;
-		goto out_close;
-	}
 
+	pentry = new_processqueue_entry(con->nodeid, buflen);
+	if (!pentry)
+		return DLM_IO_RESCHED;
 
-	/* realloc if we get new buffer size to read out */
-	buflen = dlm_config.ci_buffer_size;
-	if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
-		ret = con_realloc_receive_buf(con, buflen);
-		if (ret < 0)
-			goto out_resched;
-	}
+	memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover);
 
-	for (;;) {
-		/* calculate new buffer parameter regarding last receive and
-		 * possible leftover bytes
-		 */
-		iov.iov_base = con->rx_buf + con->rx_leftover;
-		iov.iov_len = con->rx_buflen - con->rx_leftover;
-
-		memset(&msg, 0, sizeof(msg));
-		msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-		ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
-				     msg.msg_flags);
-		trace_dlm_recv(con->nodeid, ret);
-		if (ret == -EAGAIN)
-			break;
-		else if (ret <= 0)
-			goto out_close;
-
-		/* new buflen according readed bytes and leftover from last receive */
-		buflen = ret + con->rx_leftover;
-		ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
-		if (ret < 0)
-			goto out_close;
-
-		/* calculate leftover bytes from process and put it into begin of
-		 * the receive buffer, so next receive we have the full message
-		 * at the start address of the receive buffer.
-		 */
-		con->rx_leftover = buflen - ret;
-		if (con->rx_leftover) {
-			memmove(con->rx_buf, con->rx_buf + ret,
-				con->rx_leftover);
+	/* calculate new buffer parameter regarding last receive and
+	 * possible leftover bytes
+	 */
+	iov.iov_base = pentry->buf + con->rx_leftover;
+	iov.iov_len = buflen - con->rx_leftover;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+	clear_bit(CF_RECV_INTR, &con->flags);
+again:
+	ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+			     msg.msg_flags);
+	trace_dlm_recv(con->nodeid, ret);
+	if (ret == -EAGAIN) {
+		lock_sock(con->sock->sk);
+		if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) {
+			release_sock(con->sock->sk);
+			goto again;
 		}
+
+		clear_bit(CF_RECV_PENDING, &con->flags);
+		release_sock(con->sock->sk);
+		free_processqueue_entry(pentry);
+		return DLM_IO_END;
+	} else if (ret == 0) {
+		/* close will clear CF_RECV_PENDING */
+		free_processqueue_entry(pentry);
+		return DLM_IO_EOF;
+	} else if (ret < 0) {
+		free_processqueue_entry(pentry);
+		return ret;
 	}
 
-	dlm_midcomms_receive_done(con->nodeid);
-	mutex_unlock(&con->sock_mutex);
-	return 0;
+	/* new buflen according to read bytes and leftover from last receive */
+	buflen_real = ret + con->rx_leftover;
+	ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf,
+					   buflen_real);
+	if (ret < 0) {
+		free_processqueue_entry(pentry);
+		return ret;
+	}
 
-out_resched:
-	if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
-		queue_work(recv_workqueue, &con->rwork);
-	mutex_unlock(&con->sock_mutex);
-	return -EAGAIN;
-
-out_close:
-	if (ret == 0) {
-		log_print("connection %p got EOF from %d",
-			  con, con->nodeid);
-
-		if (dlm_proto_ops->eof_condition &&
-		    dlm_proto_ops->eof_condition(con)) {
-			set_bit(CF_EOF, &con->flags);
-			mutex_unlock(&con->sock_mutex);
-		} else {
-			mutex_unlock(&con->sock_mutex);
-			close_connection(con, false, true, false);
+	pentry->buflen = ret;
 
-			/* handling for tcp shutdown */
-			clear_bit(CF_SHUTDOWN, &con->flags);
-			wake_up(&con->shutdown_wait);
-		}
+	/* calculate leftover bytes from processing and put them at the
+	 * beginning of the receive buffer, so on the next receive we have
+	 * the full message at the start address of the receive buffer.
+	 */
+	con->rx_leftover = buflen_real - ret;
+	memmove(con->rx_leftover_buf, pentry->buf + ret,
+		con->rx_leftover);
 
-		/* signal to breaking receive worker */
-		ret = -1;
-	} else {
-		mutex_unlock(&con->sock_mutex);
+	spin_lock(&processqueue_lock);
+	list_add_tail(&pentry->list, &processqueue);
+	if (!process_dlm_messages_pending) {
+		process_dlm_messages_pending = true;
+		queue_work(process_workqueue, &process_work);
 	}
-	return ret;
+	spin_unlock(&processqueue_lock);
+
+	return DLM_IO_SUCCESS;
 }
 
 /* Listening socket is busy, accept a connection */
-static int accept_from_sock(struct listen_connection *con)
+static int accept_from_sock(void)
 {
-	int result;
 	struct sockaddr_storage peeraddr;
-	struct socket *newsock;
-	int len, idx;
-	int nodeid;
+	int len, idx, result, nodeid;
 	struct connection *newcon;
-	struct connection *addcon;
+	struct socket *newsock;
 	unsigned int mark;
 
-	if (!con->sock)
-		return -ENOTCONN;
-
-	result = kernel_accept(con->sock, &newsock, O_NONBLOCK);
-	if (result < 0)
+	result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK);
+	if (result == -EAGAIN)
+		return DLM_IO_END;
+	else if (result < 0)
 		goto accept_err;
 
 	/* Get the connected socket's peer */
@@ -1062,16 +1023,16 @@ static int accept_from_sock(struct listen_connection *con)
 	 * In this case we store the incoming one in "othercon"
 	 */
 	idx = srcu_read_lock(&connections_srcu);
-	newcon = nodeid2con(nodeid, GFP_NOFS);
-	if (!newcon) {
+	newcon = nodeid2con(nodeid, 0);
+	if (WARN_ON_ONCE(!newcon)) {
 		srcu_read_unlock(&connections_srcu, idx);
-		result = -ENOMEM;
+		result = -ENOENT;
 		goto accept_err;
 	}
 
 	sock_set_mark(newsock->sk, mark);
 
-	mutex_lock(&newcon->sock_mutex);
+	down_write(&newcon->sock_lock);
 	if (newcon->sock) {
 		struct connection *othercon = newcon->othercon;
 
@@ -1079,63 +1040,50 @@ static int accept_from_sock(struct listen_connection *con)
 			othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
 			if (!othercon) {
 				log_print("failed to allocate incoming socket");
-				mutex_unlock(&newcon->sock_mutex);
+				up_write(&newcon->sock_lock);
 				srcu_read_unlock(&connections_srcu, idx);
 				result = -ENOMEM;
 				goto accept_err;
 			}
 
-			result = dlm_con_init(othercon, nodeid);
-			if (result < 0) {
-				kfree(othercon);
-				mutex_unlock(&newcon->sock_mutex);
-				srcu_read_unlock(&connections_srcu, idx);
-				goto accept_err;
-			}
-
-			lockdep_set_subclass(&othercon->sock_mutex, 1);
-			set_bit(CF_IS_OTHERCON, &othercon->flags);
+			dlm_con_init(othercon, nodeid);
+			lockdep_set_subclass(&othercon->sock_lock, 1);
 			newcon->othercon = othercon;
-			othercon->sendcon = newcon;
+			set_bit(CF_IS_OTHERCON, &othercon->flags);
 		} else {
 			/* close other sock con if we have something new */
-			close_connection(othercon, false, true, false);
+			close_connection(othercon, false);
 		}
 
-		mutex_lock(&othercon->sock_mutex);
+		down_write(&othercon->sock_lock);
 		add_sock(newsock, othercon);
-		addcon = othercon;
-		mutex_unlock(&othercon->sock_mutex);
+
+		/* check if we received something while adding */
+		lock_sock(othercon->sock->sk);
+		lowcomms_queue_rwork(othercon);
+		release_sock(othercon->sock->sk);
+		up_write(&othercon->sock_lock);
 	} else {
 		/* accept copies the sk after we've saved the callbacks,
 		   so we don't want to save them a second time or comm
 		   errors will result in calling sk_error_report recursively.
		 */
 		add_sock(newsock, newcon);
-		addcon = newcon;
-	}
-
-	set_bit(CF_CONNECTED, &addcon->flags);
-	mutex_unlock(&newcon->sock_mutex);
-
-	/*
-	 * Add it to the active queue in case we got data
-	 * between processing the accept adding the socket
-	 * to the read_sockets list
-	 */
-	if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
-		queue_work(recv_workqueue, &addcon->rwork);
+		/* check if we received something while adding */
+		lock_sock(newcon->sock->sk);
+		lowcomms_queue_rwork(newcon);
+		release_sock(newcon->sock->sk);
+	}
+	up_write(&newcon->sock_lock);
 
 	srcu_read_unlock(&connections_srcu, idx);
-	return 0;
+	return DLM_IO_SUCCESS;
 
 accept_err:
 	if (newsock)
 		sock_release(newsock);
 
-	if (result != -EAGAIN)
-		log_print("error accepting connection from node: %d", result);
-
 	return result;
 }
 
@@ -1167,7 +1115,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
 	int i, addr_len, result = 0;
 
 	for (i = 0; i < dlm_local_count; i++) {
-		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+		memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr));
 		make_sockaddr(&localaddr, port, &addr_len);
 
 		if (!i)
@@ -1187,7 +1135,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
 /* Get local addresses */
 static void init_local(void)
 {
-	struct sockaddr_storage sas, *addr;
+	struct sockaddr_storage sas;
 	int i;
 
 	dlm_local_count = 0;
@@ -1195,21 +1143,10 @@ static void init_local(void)
 		if (dlm_our_addr(&sas, i))
 			break;
 
-		addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS);
-		if (!addr)
-			break;
-		dlm_local_addr[dlm_local_count++] = addr;
+		memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas));
 	}
 }
 
-static void deinit_local(void)
-{
-	int i;
-
-	for (i = 0; i < dlm_local_count; i++)
-		kfree(dlm_local_addr[i]);
-}
-
 static struct writequeue_entry *new_writequeue_entry(struct connection *con)
 {
 	struct writequeue_entry *entry;
@@ -1240,7 +1177,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
 {
 	struct writequeue_entry *e;
 
-	spin_lock(&con->writequeue_lock);
+	spin_lock_bh(&con->writequeue_lock);
 	if (!list_empty(&con->writequeue)) {
 		e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
 		if (DLM_WQ_REMAIN_BYTES(e) >= len) {
@@ -1263,14 +1200,13 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
 	kref_get(&e->ref);
 	*ppc = page_address(e->page);
 	e->end += len;
-	atomic_inc(&con->writequeue_cnt);
 	if (cb)
 		cb(data);
 	list_add_tail(&e->list, &con->writequeue);
 
 out:
-	spin_unlock(&con->writequeue_lock);
+	spin_unlock_bh(&con->writequeue_lock);
 	return e;
 };
 
@@ -1319,13 +1255,13 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
 	    len < sizeof(struct dlm_header)) {
 		BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
 		log_print("failed to allocate a buffer of size %d", len);
-		WARN_ON(1);
+		WARN_ON_ONCE(1);
 		return NULL;
 	}
 
 	idx = srcu_read_lock(&connections_srcu);
-	con = nodeid2con(nodeid, allocation);
-	if (!con) {
+	con = nodeid2con(nodeid, 0);
+	if (WARN_ON_ONCE(!con)) {
 		srcu_read_unlock(&connections_srcu, idx);
 		return NULL;
 	}
@@ -1336,6 +1272,8 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
 		return NULL;
 	}
 
+	/* for dlm_lowcomms_commit_msg() */
+	kref_get(&msg->ref);
 	/* we assume if successful commit must called */
 	msg->idx = idx;
 	return msg;
@@ -1348,7 +1286,7 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
 	struct connection *con = e->con;
 	int users;
 
-	spin_lock(&con->writequeue_lock);
+	spin_lock_bh(&con->writequeue_lock);
 	kref_get(&msg->ref);
	list_add(&msg->list, &e->msgs);
 
@@ -1357,13 +1295,11 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
 		goto out;
 
 	e->len = DLM_WQ_LENGTH_BYTES(e);
-	spin_unlock(&con->writequeue_lock);
 
-	queue_work(send_workqueue, &con->swork);
-	return;
+	lowcomms_queue_swork(con);
 
 out:
-	spin_unlock(&con->writequeue_lock);
+	spin_unlock_bh(&con->writequeue_lock);
 	return;
 }
 
@@ -1375,6 +1311,8 @@ void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
 {
 	_dlm_lowcomms_commit_msg(msg);
 	srcu_read_unlock(&connections_srcu, msg->idx);
+	/* because dlm_lowcomms_new_msg() */
+	kref_put(&msg->ref, dlm_msg_release);
 }
 #endif
 
@@ -1383,7 +1321,7 @@ void dlm_lowcomms_put_msg(struct dlm_msg *msg)
 	kref_put(&msg->ref, dlm_msg_release);
 }
 
-/* does not held connections_srcu, usage workqueue only */
+/* does not hold connections_srcu, used by lowcomms_error_report only */
 int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
 {
 	struct dlm_msg *msg_resend;
@@ -1409,90 +1347,79 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
 }
 
 /* Send a message */
-static void send_to_sock(struct connection *con)
+static int send_to_sock(struct connection *con)
 {
 	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
 	struct writequeue_entry *e;
 	int len, offset, ret;
-	int count = 0;
 
-	mutex_lock(&con->sock_mutex);
-	if (con->sock == NULL)
-		goto out_connect;
+	spin_lock_bh(&con->writequeue_lock);
+	e = con_next_wq(con);
+	if (!e) {
+		clear_bit(CF_SEND_PENDING, &con->flags);
+		spin_unlock_bh(&con->writequeue_lock);
+		return DLM_IO_END;
+	}
 
-	spin_lock(&con->writequeue_lock);
-	for (;;) {
-		e = con_next_wq(con);
-		if (!e)
-			break;
+	len = e->len;
+	offset = e->offset;
+	WARN_ON_ONCE(len == 0 && e->users == 0);
+	spin_unlock_bh(&con->writequeue_lock);
 
-		len = e->len;
-		offset = e->offset;
-		BUG_ON(len == 0 && e->users == 0);
-		spin_unlock(&con->writequeue_lock);
-
-		ret = kernel_sendpage(con->sock, e->page, offset, len,
-				      msg_flags);
-		trace_dlm_send(con->nodeid, ret);
-		if (ret == -EAGAIN || ret == 0) {
-			if (ret == -EAGAIN &&
-			    test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
-			    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
-				/* Notify TCP that we're limited by the
-				 * application window size.
-				 */
-				set_bit(SOCK_NOSPACE, &con->sock->flags);
-				con->sock->sk->sk_write_pending++;
-			}
-			cond_resched();
-			goto out;
-		} else if (ret < 0)
-			goto out;
-
-		/* Don't starve people filling buffers */
-		if (++count >= MAX_SEND_MSG_COUNT) {
-			cond_resched();
-			count = 0;
+	ret = kernel_sendpage(con->sock, e->page, offset, len,
+			      msg_flags);
+	trace_dlm_send(con->nodeid, ret);
+	if (ret == -EAGAIN || ret == 0) {
+		lock_sock(con->sock->sk);
+		spin_lock_bh(&con->writequeue_lock);
+		if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
+		    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+			/* Notify TCP that we're limited by the
+			 * application window size.
+ */ + set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags); + con->sock->sk->sk_write_pending++; + + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + + /* wait for write_space() event */ + return DLM_IO_END; } + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); - spin_lock(&con->writequeue_lock); - writequeue_entry_complete(e, ret); - } - spin_unlock(&con->writequeue_lock); - - /* close if we got EOF */ - if (test_and_clear_bit(CF_EOF, &con->flags)) { - mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - - /* handling for tcp shutdown */ - clear_bit(CF_SHUTDOWN, &con->flags); - wake_up(&con->shutdown_wait); - } else { - mutex_unlock(&con->sock_mutex); + return DLM_IO_RESCHED; + } else if (ret < 0) { + return ret; } - return; - -out: - mutex_unlock(&con->sock_mutex); - return; + spin_lock_bh(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + spin_unlock_bh(&con->writequeue_lock); -out_connect: - mutex_unlock(&con->sock_mutex); - queue_work(send_workqueue, &con->swork); - cond_resched(); + return DLM_IO_SUCCESS; } static void clean_one_writequeue(struct connection *con) { struct writequeue_entry *e, *safe; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); list_for_each_entry_safe(e, safe, &con->writequeue, list) { free_entry(e); } - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); +} + +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); + + WARN_ON_ONCE(!list_empty(&con->writequeue)); + WARN_ON_ONCE(con->sock); + kfree(con); } /* Called from recovery when it knows that a node has @@ -1500,286 +1427,311 @@ static void clean_one_writequeue(struct connection *con) int dlm_lowcomms_close(int nodeid) { struct connection *con; - struct dlm_node_addr *na; int idx; log_print("closing connection to node %d", nodeid); + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); - if (con) { - set_bit(CF_CLOSE, &con->flags); - close_connection(con, true, true, true); - clean_one_writequeue(con); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } + + stop_connection_io(con); + log_print("io handling for node: %d stopped", nodeid); + close_connection(con, true); + + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + clean_one_writequeue(con); + call_srcu(&connections_srcu, &con->rcu, connection_release); + if (con->othercon) { + clean_one_writequeue(con->othercon); if (con->othercon) - clean_one_writequeue(con->othercon); + call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); } srcu_read_unlock(&connections_srcu, idx); - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (na) { - list_del(&na->list); - while (na->addr_count--) - kfree(na->addr[na->addr_count]); - kfree(na); - } - spin_unlock(&dlm_node_addrs_spin); + /* for debugging we print when we are done to compare with other + * messages in between. 
This function needs to be correctly synchronized
+	 * with io handling
+	 */
+	log_print("closing connection to node %d done", nodeid);
 
 	return 0;
 }
 
-/* Receive workqueue function */
+/* Receive worker function */
 static void process_recv_sockets(struct work_struct *work)
 {
 	struct connection *con = container_of(work, struct connection, rwork);
+	int ret, buflen;
+
+	down_read(&con->sock_lock);
+	if (!con->sock) {
+		up_read(&con->sock_lock);
+		return;
+	}
+
+	buflen = READ_ONCE(dlm_config.ci_buffer_size);
+	do {
+		ret = receive_from_sock(con, buflen);
+	} while (ret == DLM_IO_SUCCESS);
+	up_read(&con->sock_lock);
 
-	clear_bit(CF_READ_PENDING, &con->flags);
-	receive_from_sock(con);
+	switch (ret) {
+	case DLM_IO_END:
+		/* CF_RECV_PENDING cleared */
+		break;
+	case DLM_IO_EOF:
+		close_connection(con, false);
+		/* CF_RECV_PENDING cleared */
+		break;
+	case DLM_IO_RESCHED:
+		cond_resched();
+		queue_work(io_workqueue, &con->rwork);
+		/* CF_RECV_PENDING not cleared */
+		break;
+	default:
+		if (ret < 0) {
+			if (test_bit(CF_IS_OTHERCON, &con->flags)) {
+				close_connection(con, false);
+			} else {
+				spin_lock_bh(&con->writequeue_lock);
+				lowcomms_queue_swork(con);
+				spin_unlock_bh(&con->writequeue_lock);
			}
+
+			/* CF_RECV_PENDING cleared for othercon
+			 * we trigger send queue if not already done
+			 * and process_send_sockets will handle it
+			 */
+			break;
+		}
+
+		WARN_ON_ONCE(1);
+		break;
+	}
 }
 
 static void process_listen_recv_socket(struct work_struct *work)
 {
-	accept_from_sock(&listen_con);
+	int ret;
+
+	if (WARN_ON_ONCE(!listen_con.sock))
+		return;
+
+	do {
+		ret = accept_from_sock();
+	} while (ret == DLM_IO_SUCCESS);
+
+	if (ret < 0)
+		log_print("critical error accepting connection: %d", ret);
 }
 
-static void dlm_connect(struct connection *con)
+static int dlm_connect(struct connection *con)
 {
 	struct sockaddr_storage addr;
 	int result, addr_len;
 	struct socket *sock;
 	unsigned int mark;
 
-	/* Some odd races can cause double-connects, ignore them */
-	if (con->retries++ > MAX_CONNECT_RETRIES)
-		return;
-
-	if (con->sock) {
-		log_print("node %d already connected.", con->nodeid);
-		return;
-	}
-
 	memset(&addr, 0, sizeof(addr));
 	result = nodeid_to_addr(con->nodeid, &addr, NULL,
 				dlm_proto_ops->try_new_addr, &mark);
 	if (result < 0) {
 		log_print("no address for nodeid %d", con->nodeid);
-		return;
+		return result;
 	}
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+	result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
 				  SOCK_STREAM, dlm_proto_ops->proto, &sock);
 	if (result < 0)
-		goto socket_err;
+		return result;
 
 	sock_set_mark(sock->sk, mark);
 
 	dlm_proto_ops->sockopts(sock);
 
-	add_sock(sock, con);
-
 	result = dlm_proto_ops->bind(sock);
-	if (result < 0)
-		goto add_sock_err;
+	if (result < 0) {
+		sock_release(sock);
+		return result;
+	}
+
+	add_sock(sock, con);
 
 	log_print_ratelimited("connecting to %d", con->nodeid);
 	make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
 	result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr,
 					addr_len);
-	if (result < 0)
-		goto add_sock_err;
-
-	return;
-
-add_sock_err:
-	dlm_close_sock(&con->sock);
+	switch (result) {
+	case -EINPROGRESS:
+		/* not an error */
+		fallthrough;
+	case 0:
+		break;
+	default:
+		if (result < 0)
+			dlm_close_sock(&con->sock);
 
-socket_err:
-	/*
-	 * Some errors are fatal and this list might need adjusting. For other
-	 * errors we try again until the max number of retries is reached.
-	 */
-	if (result != -EHOSTUNREACH &&
-	    result != -ENETUNREACH &&
-	    result != -ENETDOWN &&
-	    result != -EINVAL &&
-	    result != -EPROTONOSUPPORT) {
-		log_print("connect %d try %d error %d", con->nodeid,
-			  con->retries, result);
-		msleep(1000);
-		lowcomms_connect_sock(con);
+		break;
 	}
+
+	return result;
 }
 
-/* Send workqueue function */
+/* Send worker function */
 static void process_send_sockets(struct work_struct *work)
 {
 	struct connection *con = container_of(work, struct connection, swork);
+	int ret;
 
-	WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));
-
-	clear_bit(CF_WRITE_PENDING, &con->flags);
+	WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags));
 
-	if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
-		close_connection(con, false, false, true);
-		dlm_midcomms_unack_msg_resend(con->nodeid);
+	down_read(&con->sock_lock);
+	if (!con->sock) {
+		up_read(&con->sock_lock);
+		down_write(&con->sock_lock);
+		if (!con->sock) {
+			ret = dlm_connect(con);
+			switch (ret) {
+			case 0:
+				break;
+			case -EINPROGRESS:
+				/* avoid spamming resched on connection
+				 * we might switch to a state_change
+				 * event based mechanism if established
+				 */
+				msleep(100);
+				break;
+			default:
+				/* CF_SEND_PENDING not cleared */
+				up_write(&con->sock_lock);
+				log_print("connect to node %d try %d error %d",
+					  con->nodeid, con->retries++, ret);
+				msleep(1000);
+				/* For now we try forever to reconnect. In the
+				 * future we should send an event to the cluster
+				 * manager to fence itself after a certain number
+				 * of retries.
+				 */
+				queue_work(io_workqueue, &con->swork);
+				return;
+			}
+		}
+		downgrade_write(&con->sock_lock);
 	}
 
-	if (con->sock == NULL) {
-		if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
-			msleep(1000);
+	do {
+		ret = send_to_sock(con);
+	} while (ret == DLM_IO_SUCCESS);
+	up_read(&con->sock_lock);
 
-		mutex_lock(&con->sock_mutex);
-		dlm_connect(con);
-		mutex_unlock(&con->sock_mutex);
-	}
+	switch (ret) {
+	case DLM_IO_END:
+		/* CF_SEND_PENDING cleared */
+		break;
+	case DLM_IO_RESCHED:
+		/* CF_SEND_PENDING not cleared */
+		cond_resched();
+		queue_work(io_workqueue, &con->swork);
+		break;
+	default:
+		if (ret < 0) {
+			close_connection(con, false);
+
+			/* CF_SEND_PENDING cleared */
+			spin_lock_bh(&con->writequeue_lock);
+			lowcomms_queue_swork(con);
+			spin_unlock_bh(&con->writequeue_lock);
+			break;
+		}
 
-	if (!list_empty(&con->writequeue))
-		send_to_sock(con);
+		WARN_ON_ONCE(1);
+		break;
+	}
 }
 
 static void work_stop(void)
 {
-	if (recv_workqueue) {
-		destroy_workqueue(recv_workqueue);
-		recv_workqueue = NULL;
+	if (io_workqueue) {
+		destroy_workqueue(io_workqueue);
+		io_workqueue = NULL;
 	}
 
-	if (send_workqueue) {
-		destroy_workqueue(send_workqueue);
-		send_workqueue = NULL;
+	if (process_workqueue) {
+		destroy_workqueue(process_workqueue);
+		process_workqueue = NULL;
 	}
 }
 
 static int work_start(void)
 {
-	recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM);
-	if (!recv_workqueue) {
-		log_print("can't start dlm_recv");
+	io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM,
+				       0);
+	if (!io_workqueue) {
+		log_print("can't start dlm_io");
 		return -ENOMEM;
 	}
 
-	send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM);
-	if (!send_workqueue) {
-		log_print("can't start dlm_send");
-		destroy_workqueue(recv_workqueue);
-		recv_workqueue = NULL;
+	/* ordered dlm message process queue,
+	 * should be converted to a tasklet
+	 */
+	process_workqueue = alloc_ordered_workqueue("dlm_process",
+						    WQ_HIGHPRI | WQ_MEM_RECLAIM);
+	if (!process_workqueue) {
+		log_print("can't start dlm_process");
+
destroy_workqueue(io_workqueue); + io_workqueue = NULL; return -ENOMEM; } return 0; } -static void shutdown_conn(struct connection *con) -{ - if (dlm_proto_ops->shutdown_action) - dlm_proto_ops->shutdown_action(con); -} - void dlm_lowcomms_shutdown(void) { - int idx; - - /* Set all the flags to prevent any - * socket activity. - */ - dlm_allow_conn = 0; - - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); + /* stop lowcomms_listen_data_ready calls */ + lock_sock(listen_con.sock->sk); + listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; + release_sock(listen_con.sock->sk); + cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); - idx = srcu_read_lock(&connections_srcu); - foreach_conn(shutdown_conn); - srcu_read_unlock(&connections_srcu, idx); -} - -static void _stop_conn(struct connection *con, bool and_other) -{ - mutex_lock(&con->sock_mutex); - set_bit(CF_CLOSE, &con->flags); - set_bit(CF_READ_PENDING, &con->flags); - set_bit(CF_WRITE_PENDING, &con->flags); - if (con->sock && con->sock->sk) { - lock_sock(con->sock->sk); - con->sock->sk->sk_user_data = NULL; - release_sock(con->sock->sk); - } - if (con->othercon && and_other) - _stop_conn(con->othercon, false); - mutex_unlock(&con->sock_mutex); -} - -static void stop_conn(struct connection *con) -{ - _stop_conn(con, true); + flush_workqueue(process_workqueue); } -static void connection_release(struct rcu_head *rcu) +void dlm_lowcomms_shutdown_node(int nodeid, bool force) { - struct connection *con = container_of(rcu, struct connection, rcu); - - kfree(con->rx_buf); - kfree(con); -} + struct connection *con; + int idx; -static void free_conn(struct connection *con) -{ - close_connection(con, true, true, true); - spin_lock(&connections_lock); - hlist_del_rcu(&con->list); - spin_unlock(&connections_lock); - if (con->othercon) { - clean_one_writequeue(con->othercon); - call_srcu(&connections_srcu, &con->othercon->rcu, - connection_release); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return; } - clean_one_writequeue(con); - call_srcu(&connections_srcu, &con->rcu, connection_release); -} -static void work_flush(void) -{ - int ok; - int i; - struct connection *con; - - do { - ok = 1; - foreach_conn(stop_conn); - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); - for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_rcu(con, &connection_hash[i], - list) { - ok &= test_bit(CF_READ_PENDING, &con->flags); - ok &= test_bit(CF_WRITE_PENDING, &con->flags); - if (con->othercon) { - ok &= test_bit(CF_READ_PENDING, - &con->othercon->flags); - ok &= test_bit(CF_WRITE_PENDING, - &con->othercon->flags); - } - } - } - } while (!ok); + flush_work(&con->swork); + stop_connection_io(con); + WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); + close_connection(con, true); + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + allow_connection_io(con); + srcu_read_unlock(&connections_srcu, idx); } void dlm_lowcomms_stop(void) { - int idx; - - idx = srcu_read_lock(&connections_srcu); - work_flush(); - foreach_conn(free_conn); - srcu_read_unlock(&connections_srcu, idx); work_stop(); - deinit_local(); - dlm_proto_ops = NULL; } @@ -1795,7 +1747,7 @@ static int dlm_listen_for_all(void) if (result < 0) return result; - result = sock_create_kern(&init_net, 
dlm_local_addr[0]->ss_family, + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { log_print("Can't create comms socket: %d", result); @@ -1809,14 +1761,23 @@ static int dlm_listen_for_all(void) if (result < 0) goto out; - save_listen_callbacks(sock); - add_listen_sock(sock, &listen_con); + lock_sock(sock->sk); + listen_sock.sk_data_ready = sock->sk->sk_data_ready; + listen_sock.sk_write_space = sock->sk->sk_write_space; + listen_sock.sk_error_report = sock->sk->sk_error_report; + listen_sock.sk_state_change = sock->sk->sk_state_change; + + listen_con.sock = sock; + + sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_use_task_frag = false; + sock->sk->sk_data_ready = lowcomms_listen_data_ready; + release_sock(sock->sk); - INIT_WORK(&listen_con.rwork, process_listen_recv_socket); result = sock->ops->listen(sock, 5); if (result < 0) { dlm_close_sock(&listen_con.sock); - goto out; + return result; } return 0; @@ -1834,7 +1795,7 @@ static int dlm_tcp_bind(struct socket *sock) /* Bind to our cluster-known address connecting to avoid * routing problems. */ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, @@ -1850,17 +1811,7 @@ static int dlm_tcp_bind(struct socket *sock) static int dlm_tcp_connect(struct connection *con, struct socket *sock, struct sockaddr *addr, int addr_len) { - int ret; - - ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); - switch (ret) { - case -EINPROGRESS: - fallthrough; - case 0: - return 0; - } - - return ret; + return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); } static int dlm_tcp_listen_validate(void) @@ -1891,8 +1842,8 @@ static int dlm_tcp_listen_bind(struct socket *sock) int addr_len; /* Bind to our port */ - make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); - return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0], + make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); + return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0], addr_len); } @@ -1905,8 +1856,6 @@ static const struct dlm_proto_ops dlm_tcp_ops = { .listen_validate = dlm_tcp_listen_validate, .listen_sockopts = dlm_tcp_listen_sockopts, .listen_bind = dlm_tcp_listen_bind, - .shutdown_action = dlm_tcp_shutdown, - .eof_condition = tcp_eof_condition, }; static int dlm_sctp_bind(struct socket *sock) @@ -1927,13 +1876,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, sock_set_sndtimeo(sock->sk, 5); ret = sock->ops->connect(sock, addr, addr_len, 0); sock_set_sndtimeo(sock->sk, 0); - if (ret < 0) - return ret; - - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("connected to node %d", con->nodeid); - - return 0; + return ret; } static int dlm_sctp_listen_validate(void) @@ -1973,11 +1916,7 @@ static const struct dlm_proto_ops dlm_sctp_ops = { int dlm_lowcomms_start(void) { - int error = -EINVAL; - int i; - - for (i = 0; i < CONN_HASH_SIZE; i++) - INIT_HLIST_HEAD(&connection_hash[i]); + int error; init_local(); if (!dlm_local_count) { @@ -1986,13 +1925,9 @@ int dlm_lowcomms_start(void) goto fail; } - INIT_WORK(&listen_con.rwork, process_listen_recv_socket); - error = work_start(); if (error) - goto fail_local; - - dlm_allow_conn = 1; + goto fail; /* Start listening */ switch (dlm_config.ci_protocol) { @@ -2018,25 +1953,38 @@ int dlm_lowcomms_start(void) 
fail_listen: dlm_proto_ops = NULL; fail_proto_ops: - dlm_allow_conn = 0; - dlm_close_sock(&listen_con.sock); work_stop(); -fail_local: - deinit_local(); fail: return error; } +void dlm_lowcomms_init(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&connection_hash[i]); + + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); +} + void dlm_lowcomms_exit(void) { - struct dlm_node_addr *na, *safe; + struct connection *con; + int i, idx; - spin_lock(&dlm_node_addrs_spin); - list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) { - list_del(&na->list); - while (na->addr_count--) - kfree(na->addr[na->addr_count]); - kfree(na); + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + if (con->othercon) + call_srcu(&connections_srcu, &con->othercon->rcu, + connection_release); + call_srcu(&connections_srcu, &con->rcu, connection_release); + } } - spin_unlock(&dlm_node_addrs_spin); + srcu_read_unlock(&connections_srcu, idx); } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 29369feea991..3e8dca66183b 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -29,12 +29,14 @@ static inline int nodeid_hash(int nodeid) return nodeid & (CONN_HASH_SIZE-1); } -/* switch to check if dlm is running */ -extern int dlm_allow_conn; +/* check if dlm is running */ +bool dlm_lowcomms_is_running(void); int dlm_lowcomms_start(void); void dlm_lowcomms_shutdown(void); +void dlm_lowcomms_shutdown_node(int nodeid, bool force); void dlm_lowcomms_stop(void); +void dlm_lowcomms_init(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 1c5be4b70ac1..a77338be3237 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -17,7 +17,7 @@ #include "user.h" #include "memory.h" #include "config.h" -#include "lowcomms.h" +#include "midcomms.h" #define CREATE_TRACE_POINTS #include <trace/events/dlm.h> @@ -30,6 +30,8 @@ static int __init init_dlm(void) if (error) goto out; + dlm_midcomms_init(); + error = dlm_lockspace_init(); if (error) goto out_mem; @@ -66,6 +68,7 @@ static int __init init_dlm(void) out_lockspace: dlm_lockspace_exit(); out_mem: + dlm_midcomms_exit(); dlm_memory_exit(); out: return error; @@ -79,7 +82,7 @@ static void __exit exit_dlm(void) dlm_config_exit(); dlm_memory_exit(); dlm_lockspace_exit(); - dlm_lowcomms_exit(); + dlm_midcomms_exit(); dlm_unregister_debugfs(); } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 2af2ccfe43a9..923c01a8a0aa 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -573,7 +573,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) node = &rv->nodes[i]; if (dlm_is_member(ls, node->nodeid)) continue; - dlm_add_member(ls, node); + error = dlm_add_member(ls, node); + if (error) + return error; + log_rinfo(ls, "add member %d", node->nodeid); } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index ce35c3c19aeb..eb7a08641fcf 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -14,12 +14,14 @@ #include "lowcomms.h" #include "config.h" #include "memory.h" +#include "ast.h" static struct kmem_cache *writequeue_cache; static struct kmem_cache *mhandle_cache; static struct kmem_cache *msg_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; +static struct kmem_cache 
*cb_cache;
 
 int __init dlm_memory_init(void)
@@ -46,8 +48,16 @@ int __init dlm_memory_init(void)
 	if (!rsb_cache)
 		goto rsb;
 
+	cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback),
+				     __alignof__(struct dlm_callback), 0,
+				     NULL);
+	if (!cb_cache)
+		goto cb;
+
 	return 0;
 
+cb:
+	kmem_cache_destroy(rsb_cache);
 rsb:
 	kmem_cache_destroy(msg_cache);
 msg:
@@ -67,6 +77,7 @@ void dlm_memory_exit(void)
 	kmem_cache_destroy(msg_cache);
 	kmem_cache_destroy(lkb_cache);
 	kmem_cache_destroy(rsb_cache);
+	kmem_cache_destroy(cb_cache);
 }
 
 char *dlm_allocate_lvb(struct dlm_ls *ls)
@@ -115,12 +126,17 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
 			kfree(ua);
 		}
 	}
+
+	/* drop references if they are set */
+	dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+	dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+
 	kmem_cache_free(lkb_cache, lkb);
 }
 
-struct dlm_mhandle *dlm_allocate_mhandle(void)
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation)
 {
-	return kmem_cache_alloc(mhandle_cache, GFP_NOFS);
+	return kmem_cache_alloc(mhandle_cache, allocation);
 }
 
 void dlm_free_mhandle(struct dlm_mhandle *mhandle)
@@ -147,3 +163,13 @@ void dlm_free_msg(struct dlm_msg *msg)
 {
 	kmem_cache_free(msg_cache, msg);
 }
+
+struct dlm_callback *dlm_allocate_cb(void)
+{
+	return kmem_cache_alloc(cb_cache, GFP_ATOMIC);
+}
+
+void dlm_free_cb(struct dlm_callback *cb)
+{
+	kmem_cache_free(cb_cache, cb);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 7bd3f1a391ca..6b29563d24f7 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -20,12 +20,14 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
 void dlm_free_lkb(struct dlm_lkb *l);
 char *dlm_allocate_lvb(struct dlm_ls *ls);
 void dlm_free_lvb(char *l);
-struct dlm_mhandle *dlm_allocate_mhandle(void);
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation);
 void dlm_free_mhandle(struct dlm_mhandle *mhandle);
 struct writequeue_entry *dlm_allocate_writequeue(void);
 void dlm_free_writequeue(struct writequeue_entry *writequeue);
 struct dlm_msg *dlm_allocate_msg(gfp_t allocation);
 void dlm_free_msg(struct dlm_msg *msg);
+struct dlm_callback *dlm_allocate_cb(void);
+void dlm_free_cb(struct dlm_callback *cb);
 
 #endif /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 6489bc22ad61..fc015a6abe17 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -132,6 +132,7 @@
  */
 #define DLM_DEBUG_FENCE_TERMINATION	0
 
+#include <trace/events/dlm.h>
 #include <net/tcp.h>
 
 #include "dlm_internal.h"
@@ -194,7 +195,7 @@ struct midcomms_node {
 };
 
 struct dlm_mhandle {
-	const struct dlm_header *inner_hd;
+	const union dlm_packet *inner_p;
 	struct midcomms_node *node;
 	struct dlm_opts *opts;
 	struct dlm_msg *msg;
@@ -305,11 +306,11 @@ static void dlm_send_queue_flush(struct midcomms_node *node)
 	pr_debug("flush midcomms send queue of node %d\n", node->nodeid);
 
 	rcu_read_lock();
-	spin_lock(&node->send_queue_lock);
+	spin_lock_bh(&node->send_queue_lock);
 	list_for_each_entry_rcu(mh, &node->send_queue, list) {
 		dlm_mhandle_delete(node, mh);
 	}
-	spin_unlock(&node->send_queue_lock);
+	spin_unlock_bh(&node->send_queue_lock);
 	rcu_read_unlock();
 }
 
@@ -415,7 +416,7 @@ static int dlm_send_fin(struct midcomms_node *node,
 	m_header->h_cmd = DLM_FIN;
 
 	pr_debug("sending fin msg to node %d\n", node->nodeid);
-	dlm_midcomms_commit_mhandle(mh);
+	dlm_midcomms_commit_mhandle(mh, NULL, 0);
 	set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
 
 	return 0;
@@ -436,7 +437,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
 		}
 	}
 
-	spin_lock(&node->send_queue_lock);
+
spin_lock_bh(&node->send_queue_lock); list_for_each_entry_rcu(mh, &node->send_queue, list) { if (before(mh->seq, seq)) { dlm_mhandle_delete(node, mh); @@ -445,7 +446,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) break; } } - spin_unlock(&node->send_queue_lock); + spin_unlock_bh(&node->send_queue_lock); rcu_read_unlock(); } @@ -468,12 +469,26 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); } +static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) +{ + switch (p->header.h_cmd) { + case DLM_MSG: + trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message); + break; + case DLM_RCOM: + trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom); + break; + default: + break; + } +} + static void dlm_midcomms_receive_buffer(union dlm_packet *p, struct midcomms_node *node, uint32_t seq) @@ -525,7 +540,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -533,7 +548,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer_3_2_trace(seq, p); dlm_receive_buffer(p, node->nodeid); set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); break; @@ -754,7 +770,7 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) goto out; } - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer(p, nodeid); break; case DLM_OPTS: @@ -874,12 +890,7 @@ static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) dlm_receive_buffer(p, nodeid); } -/* - * Called from the low-level comms layer to process a buffer of - * commands. - */ - -int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len) { const unsigned char *ptr = buf; const struct dlm_header *hd; @@ -914,6 +925,32 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; + ret += msglen; + len -= msglen; + ptr += msglen; + } + + return ret; +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. 
+ */ +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +{ + const unsigned char *ptr = buf; + const struct dlm_header *hd; + uint16_t msglen; + int ret = 0; + + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + msglen = le16_to_cpu(hd->h_length); + if (msglen > len) + break; + switch (hd->h_version) { case cpu_to_le32(DLM_VERSION_3_1): dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); @@ -1030,9 +1067,9 @@ static void midcomms_new_msg_cb(void *data) atomic_inc(&mh->node->send_queue_cnt); - spin_lock(&mh->node->send_queue_lock); + spin_lock_bh(&mh->node->send_queue_lock); list_add_tail_rcu(&mh->list, &mh->node->send_queue); - spin_unlock(&mh->node->send_queue_lock); + spin_unlock_bh(&mh->node->send_queue_lock); mh->seq = mh->node->seq_send++; } @@ -1055,7 +1092,7 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node dlm_fill_opts_header(opts, len, mh->seq); *ppc += sizeof(*opts); - mh->inner_hd = (const struct dlm_header *)*ppc; + mh->inner_p = (const union dlm_packet *)*ppc; return msg; } @@ -1079,9 +1116,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, } /* this is a bug, however we going on and hope it will be resolved */ - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); - mh = dlm_allocate_mhandle(); + mh = dlm_allocate_mhandle(allocation); if (!mh) goto err; @@ -1111,7 +1148,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, break; default: dlm_free_mhandle(mh); - WARN_ON(1); + WARN_ON_ONCE(1); goto err; } @@ -1130,11 +1167,32 @@ err: } #endif -static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) +static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, + const void *name, int namelen) +{ + switch (mh->inner_p->header.h_cmd) { + case DLM_MSG: + trace_dlm_send_message(mh->node->nodeid, mh->seq, + &mh->inner_p->message, + name, namelen); + break; + case DLM_RCOM: + trace_dlm_send_rcom(mh->node->nodeid, mh->seq, + &mh->inner_p->rcom); + break; + default: + /* nothing to trace */ + break; + } +} + +static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh, + const void *name, int namelen) { /* nexthdr chain for fast lookup */ - mh->opts->o_nextcmd = mh->inner_hd->h_cmd; + mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; mh->committed = true; + dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen); dlm_lowcomms_commit_msg(mh->msg); } @@ -1142,8 +1200,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) * dlm_midcomms_get_mhandle */ #ifndef __CHECKER__ -void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, + const void *name, int namelen) { + switch (mh->node->version) { case DLM_VERSION_3_1: srcu_read_unlock(&nodes_srcu, mh->idx); @@ -1154,12 +1214,12 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: - dlm_midcomms_commit_msg_3_2(mh); + dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); break; default: srcu_read_unlock(&nodes_srcu, mh->idx); - WARN_ON(1); + WARN_ON_ONCE(1); break; } } @@ -1167,12 +1227,27 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) int dlm_midcomms_start(void) { + return dlm_lowcomms_start(); +} + +void dlm_midcomms_stop(void) +{ + dlm_lowcomms_stop(); +} + +void dlm_midcomms_init(void) +{ int i; for (i = 0; i < CONN_HASH_SIZE; i++) 
INIT_HLIST_HEAD(&node_hash[i]); - return dlm_lowcomms_start(); + dlm_lowcomms_init(); +} + +void dlm_midcomms_exit(void) +{ + dlm_lowcomms_exit(); } static void dlm_act_fin_ack_rcv(struct midcomms_node *node) @@ -1201,7 +1276,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -1319,7 +1394,7 @@ static void midcomms_node_release(struct rcu_head *rcu) { struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); - WARN_ON(atomic_read(&node->send_queue_cnt)); + WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); kfree(node); } @@ -1372,11 +1447,13 @@ static void midcomms_shutdown(struct midcomms_node *node) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); midcomms_node_reset(node); + dlm_lowcomms_shutdown_node(node->nodeid, true); return; } pr_debug("active shutdown done for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); + dlm_lowcomms_shutdown_node(node->nodeid, false); } void dlm_midcomms_shutdown(void) @@ -1384,6 +1461,8 @@ void dlm_midcomms_shutdown(void) struct midcomms_node *node; int i, idx; + dlm_lowcomms_shutdown(); + mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { @@ -1401,8 +1480,6 @@ void dlm_midcomms_shutdown(void) } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); - - dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 82bcd9661922..bea1cee4279c 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -14,12 +14,17 @@ struct midcomms_node; +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len); int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); -void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh); +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, + int namelen); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); +void dlm_midcomms_stop(void); +void dlm_midcomms_init(void); +void dlm_midcomms_exit(void); void dlm_midcomms_shutdown(void); void dlm_midcomms_add_member(int nodeid); void dlm_midcomms_remove_member(int nodeid); diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 67f68d48d60c..4de4b8651c6c 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -75,6 +75,7 @@ static struct genl_family family __ro_after_init = { .version = DLM_GENL_VERSION, .small_ops = dlm_nl_ops, .n_small_ops = ARRAY_SIZE(dlm_nl_ops), + .resv_start_op = DLM_CMD_HELLO + 1, .module = THIS_MODULE, }; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f19860315043..b76d52e2f6bd 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -91,7 +91,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc) { - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); } static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc) @@ -516,7 +516,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rf = (struct rcom_config *) rc->rc_buf; rf->rf_lvblen = cpu_to_le32(~0U); - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); return 0; } diff --git 
a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 036a9a0078f6..8be2893ad15b 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -44,7 +44,8 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; e->nodeid = nodeid; - memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length)); + memcpy(&e->request, ms, sizeof(*ms)); + memcpy(&e->request.m_extra, ms->m_extra, length); atomic_inc(&ls->ls_requestqueue_cnt); mutex_lock(&ls->ls_requestqueue_mutex); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 99e8f0744513..35129505ddda 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -16,6 +16,8 @@ #include <linux/slab.h> #include <linux/sched/signal.h> +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lockspace.h" #include "lock.h" @@ -23,6 +25,7 @@ #include "user.h" #include "ast.h" #include "config.h" +#include "memory.h" static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; @@ -173,7 +176,7 @@ static int lkb_is_endoflife(int mode, int status) being removed and then remove that lkb from the orphans list and free it */ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq) + int status, uint32_t sbflags) { struct dlm_ls *ls; struct dlm_user_args *ua; @@ -184,7 +187,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, return; ls = lkb->lkb_resource->res_ls; - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed @@ -207,16 +210,22 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, spin_lock(&proc->asts_spin); - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); - if (rv < 0) { + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_FAILURE: spin_unlock(&proc->asts_spin); + WARN_ON_ONCE(1); goto out; - } - - if (list_empty(&lkb->lkb_cb_list)) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_cb_list, &proc->asts); wake_up_interruptible(&proc->wait); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; } spin_unlock(&proc->asts_spin); @@ -230,7 +239,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, spin_unlock(&proc->locks_spin); } out: - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); } static int device_user_lock(struct dlm_user_proc *proc, @@ -421,9 +430,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags, - DLM_USER_LVB_LEN, NULL, NULL, NULL, - &lockspace); + error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name, + params->flags, DLM_USER_LVB_LEN, NULL, + NULL, NULL, &lockspace); if (error) return error; @@ -798,8 +807,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, struct dlm_user_proc *proc = file->private_data; struct dlm_lkb *lkb; DECLARE_WAITQUEUE(wait, current); - struct dlm_callback cb; - int rv, resid, copy_lvb = 0; + struct dlm_callback *cb; + int rv, copy_lvb = 0; int old_mode, new_mode; if (count == sizeof(struct dlm_device_version)) { @@ -855,50 +864,58 @@ static ssize_t 
device_read(struct file *file, char __user *buf, size_t count, without removing lkb_cb_list; so empty lkb_cb_list is always consistent with empty lkb_callbacks */ - lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); + lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list); /* rem_lkb_callback sets a new lkb_last_cast */ - old_mode = lkb->lkb_last_cast.mode; + old_mode = lkb->lkb_last_cast->mode; - rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); - if (rv < 0) { + rv = dlm_dequeue_lkb_callback(lkb, &cb); + switch (rv) { + case DLM_DEQUEUE_CALLBACK_EMPTY: /* this shouldn't happen; lkb should have been removed from - list when resid was zero */ + * list when last item was dequeued + */ log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); list_del_init(&lkb->lkb_cb_list); spin_unlock(&proc->asts_spin); /* removes ref for proc->asts, may cause lkb to be freed */ dlm_put_lkb(lkb); + WARN_ON_ONCE(1); goto try_another; - } - if (!resid) + case DLM_DEQUEUE_CALLBACK_LAST: list_del_init(&lkb->lkb_cb_list); - spin_unlock(&proc->asts_spin); - - if (cb.flags & DLM_CB_SKIP) { - /* removes ref for proc->asts, may cause lkb to be freed */ - if (!resid) - dlm_put_lkb(lkb); - goto try_another; + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + break; + case DLM_DEQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; } + spin_unlock(&proc->asts_spin); - if (cb.flags & DLM_CB_CAST) { - new_mode = cb.mode; + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + new_mode = cb->mode; - if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && + if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr && dlm_lvb_operations[old_mode + 1][new_mode + 1]) copy_lvb = 1; - lkb->lkb_lksb->sb_status = cb.sb_status; - lkb->lkb_lksb->sb_flags = cb.sb_flags; + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; + trace_dlm_ast(lkb->lkb_resource->res_ls, lkb); } rv = copy_result_to_user(lkb->lkb_ua, test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), - cb.flags, cb.mode, copy_lvb, buf, count); + cb->flags, cb->mode, copy_lvb, buf, count); + + kref_put(&cb->ref, dlm_release_callback); /* removes ref for proc->asts, may cause lkb to be freed */ - if (!resid) + if (rv == DLM_DEQUEUE_CALLBACK_LAST) dlm_put_lkb(lkb); return rv; diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 6b9bce6b96e0..33059452d79e 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -7,7 +7,7 @@ #define __USER_DOT_H__ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq); + int status, uint32_t sbflags); int dlm_user_init(void); void dlm_user_exit(void); int dlm_device_deregister(struct dlm_ls *ls); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 5f2b49e13731..f2ed0c0266cb 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -506,7 +506,7 @@ ecryptfs_dentry_to_lower(struct dentry *dentry) return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; } -static inline struct path * +static inline const struct path * ecryptfs_dentry_to_lower_path(struct dentry *dentry) { return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 18d5b91cb573..268b74499c28 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -33,7 +33,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct iov_iter *to) { 
ssize_t rc; - struct path *path; + const struct path *path; struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); @@ -53,7 +53,7 @@ struct ecryptfs_getdents_callback { }; /* Inspired by generic filldir in fs/readdir.c */ -static int +static bool ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, int lower_namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -61,18 +61,19 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, container_of(ctx, struct ecryptfs_getdents_callback, ctx); size_t name_size; char *name; - int rc; + int err; + bool res; buf->filldir_called++; - rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, - buf->sb, lower_name, - lower_namelen); - if (rc) { - if (rc != -EINVAL) { + err = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->sb, lower_name, + lower_namelen); + if (err) { + if (err != -EINVAL) { ecryptfs_printk(KERN_DEBUG, "%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n", - __func__, lower_name, rc); - return rc; + __func__, lower_name, err); + return false; } /* Mask -EINVAL errors as these are most likely due a plaintext @@ -81,16 +82,15 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, * the "lost+found" dentry in the root directory of an Ext4 * filesystem. */ - return 0; + return true; } buf->caller->pos = buf->ctx.pos; - rc = !dir_emit(buf->caller, name, name_size, ino, d_type); + res = dir_emit(buf->caller, name, name_size, ino, d_type); kfree(name); - if (!rc) + if (res) buf->entries_written++; - - return rc; + return res; } /** @@ -111,14 +111,8 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) lower_file = ecryptfs_file_to_lower(file); rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; - if (rc < 0) - goto out; - if (buf.filldir_called && !buf.entries_written) - goto out; - if (rc >= 0) - fsstack_copy_attr_atime(inode, - file_inode(lower_file)); -out: + if (rc >= 0 && (buf.entries_written || !buf.filldir_called)) + fsstack_copy_attr_atime(inode, file_inode(lower_file)); return rc; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 16d50dface59..f3cd00fac9c3 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -18,6 +18,8 @@ #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include <linux/fileattr.h> #include <asm/unaligned.h> #include "ecryptfs_kernel.h" @@ -317,7 +319,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); struct inode *inode, *lower_inode; struct ecryptfs_dentry_info *dentry_info; int rc = 0; @@ -1120,6 +1122,28 @@ static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns, return rc; } +static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry), + posix_acl_xattr_name(type)); +} + +static int ecryptfs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct posix_acl *acl, + int type) +{ + int rc; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_inode = d_inode(lower_dentry); + + rc = vfs_set_acl(&init_user_ns, 
lower_dentry, + posix_acl_xattr_name(type), acl); + if (!rc) + fsstack_copy_attr_all(d_inode(dentry), lower_inode); + return rc; +} + const struct inode_operations ecryptfs_symlink_iops = { .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, @@ -1143,6 +1167,8 @@ const struct inode_operations ecryptfs_dir_iops = { .listxattr = ecryptfs_listxattr, .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, + .get_acl = ecryptfs_get_acl, + .set_acl = ecryptfs_set_acl, }; const struct inode_operations ecryptfs_main_iops = { @@ -1152,6 +1178,8 @@ const struct inode_operations ecryptfs_main_iops = { .listxattr = ecryptfs_listxattr, .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, + .get_acl = ecryptfs_get_acl, + .set_acl = ecryptfs_set_acl, }; static int ecryptfs_xattr_get(const struct xattr_handler *handler, @@ -1182,6 +1210,10 @@ static const struct xattr_handler ecryptfs_xattr_handler = { }; const struct xattr_handler *ecryptfs_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif &ecryptfs_xattr_handler, NULL }; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 2dd23a82e0de..2dc927ba067f 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -105,7 +105,7 @@ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); - struct path *path = ecryptfs_dentry_to_lower_path(dentry); + const struct path *path = ecryptfs_dentry_to_lower_path(dentry); int rc; rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 939e5e242b98..617f3ad2485e 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -91,6 +91,10 @@ static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir, err = guid_parse(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); if (err) goto out; + if (guid_equal(&var->var.VendorGuid, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) { + err = -EPERM; + goto out; + } if (efivar_variable_is_removable(var->var.VendorGuid, dentry->d_name.name, namelen)) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6780fc81cc11..07e82e246666 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -116,6 +116,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, int err = -ENOMEM; bool is_removable = false; + if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) + return 0; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return err; diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index a0ef63cfcecb..9e4f47808bd5 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -651,22 +651,6 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, if (err) return err; - /* - * Ensure that the available space hasn't shrunk below the safe level - */ - status = check_var_size(attributes, *size + ucs2_strsize(name, 1024)); - if (status != EFI_SUCCESS) { - if (status != EFI_UNSUPPORTED) { - err = efi_status_to_err(status); - goto out; - } - - if (*size > 65536) { - err = -ENOSPC; - goto out; - } - } - status = efivar_set_variable_locked(name, vendor, attributes, *size, data, false); if (status != EFI_SUCCESS) { diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fe8ac0e163f7..f57f921683d7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -13,9 +13,7 @@ void erofs_unmap_metabuf(struct erofs_buf *buf) { if (buf->kmap_type == 
EROFS_KMAP) - kunmap(buf->page); - else if (buf->kmap_type == EROFS_KMAP_ATOMIC) - kunmap_atomic(buf->base); + kunmap_local(buf->base); buf->base = NULL; buf->kmap_type = EROFS_NO_KMAP; } @@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, } if (buf->kmap_type == EROFS_NO_KMAP) { if (type == EROFS_KMAP) - buf->base = kmap(page); - else if (type == EROFS_KMAP_ATOMIC) - buf->base = kmap_atomic(page); + buf->base = kmap_local_page(page); buf->kmap_type = type; } else if (buf->kmap_type != type) { DBG_BUGON(1); @@ -403,6 +399,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2d55569f96ac..51b7ac7166d9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -317,52 +317,61 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, + struct page **pagepool) { - const unsigned int nrpages_out = + const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + const unsigned int outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int righthalf = min_t(unsigned int, rq->outputsize, PAGE_SIZE - rq->pageofs_out); const unsigned int lefthalf = rq->outputsize - righthalf; + const unsigned int interlaced_offset = + rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; unsigned char *src, *dst; - if (nrpages_out > 2) { + if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); - return -EIO; + return -EFSCORRUPTED; } if (rq->out[0] == *rq->in) { - DBG_BUGON(nrpages_out != 1); + DBG_BUGON(rq->pageofs_out); return 0; } - src = kmap_atomic(*rq->in) + rq->pageofs_in; + src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; if (rq->out[0]) { - dst = kmap_atomic(rq->out[0]); - memcpy(dst + rq->pageofs_out, src, righthalf); - kunmap_atomic(dst); + dst = kmap_local_page(rq->out[0]); + memcpy(dst + rq->pageofs_out, src + interlaced_offset, + righthalf); + kunmap_local(dst); } - if (nrpages_out == 2) { - DBG_BUGON(!rq->out[1]); - if (rq->out[1] == *rq->in) { + if (outpages > inpages) { + DBG_BUGON(!rq->out[outpages - 1]); + if (rq->out[outpages - 1] != rq->in[inpages - 1]) { + dst = kmap_local_page(rq->out[outpages - 1]); + memcpy(dst, interlaced_offset ? 
src : + (src + righthalf), lefthalf); + kunmap_local(dst); + } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); - } else { - dst = kmap_atomic(rq->out[1]); - memcpy(dst, src + righthalf, lefthalf); - kunmap_atomic(dst); } } - kunmap_atomic(src); + kunmap_local(src); return 0; } static struct z_erofs_decompressor decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { - .decompress = z_erofs_shifted_transform, + .decompress = z_erofs_transform_plain, .name = "shifted" }, + [Z_EROFS_COMPRESSION_INTERLACED] = { + .decompress = z_erofs_transform_plain, + .name = "interlaced" + }, [Z_EROFS_COMPRESSION_LZ4] = { .decompress = z_erofs_lz4_decompress, .name = "lz4" diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 5e59b3f523eb..091fd5adf818 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -217,6 +217,9 @@ again: strm->buf.out_size = min_t(u32, outlen, PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; + if (!rq->out[no] && rq->fillgaps) /* deduped */ + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); if (rq->out[no]) strm->buf.out = kmap(rq->out[no]) + pageofs; pageofs = 0; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 2b48373f690b..dbcd24371002 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -25,6 +25,8 @@ #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 +#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 +#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ @@ -32,7 +34,9 @@ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING) + EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ + EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ + EROFS_FEATURE_INCOMPAT_DEDUPE) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -71,7 +75,9 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved2[38]; + __u8 reserved[6]; + __le64 packed_nid; /* nid of the special packed inode */ + __u8 reserved2[24]; }; /* @@ -295,16 +301,27 @@ struct z_erofs_lzma_cfgs { * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) + * bit 4 : interlaced plain pcluster (0 - off; 1 - on) + * bit 5 : fragment pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 +#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 +#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +#define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { - __le16 h_reserved1; - /* indicates the encoded size of tailpacking data */ - __le16 h_idata_size; + union { + /* fragment data offset in the packed inode */ + __le32 h_fragmentoff; + struct { + __le16 h_reserved1; + /* indicates the encoded size of tailpacking data */ + __le16 h_idata_size; + }; + }; __le16 h_advise; /* * bit 0-3 : algorithm type of head 1 (logical cluster type 01); @@ -313,7 +330,8 @@ struct z_erofs_map_header { __u8 h_algorithmtype; /* * bit 0-2 : logical cluster bits - 12, e.g. 
0 for 4096; - * bit 3-7 : reserved. + * bit 3-6 : reserved; + * bit 7 : move the whole file into packed inode or not. */ __u8 h_clusterbits; }; @@ -355,6 +373,9 @@ enum { #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ +#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15) + /* * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the * compressed block count of a compressed extent (in logical clusters, aka. @@ -402,6 +423,10 @@ struct erofs_dirent { /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) { + const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) { + .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT + }; + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); @@ -419,6 +444,9 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); + /* exclude old compiler versions like gcc 7.5.0 */ + BUILD_BUG_ON(__builtin_constant_p(fmh) ? + fmh != cpu_to_le64(1ULL << 63) : 0); } #endif diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index b5fd9d71e67f..014e20962376 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -1,429 +1,440 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2022, Alibaba Cloud + * Copyright (C) 2022, Bytedance Inc. All rights reserved. */ #include <linux/fscache.h> #include "internal.h" -static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, +static DEFINE_MUTEX(erofs_domain_list_lock); +static DEFINE_MUTEX(erofs_domain_cookies_lock); +static LIST_HEAD(erofs_domain_list); +static struct vfsmount *erofs_pseudo_mnt; + +struct erofs_fscache_request { + struct erofs_fscache_request *primary; + struct netfs_cache_resources cache_resources; + struct address_space *mapping; /* The mapping being accessed */ + loff_t start; /* Start position */ + size_t len; /* Length of the request */ + size_t submitted; /* Length of submitted */ + short error; /* 0 or error that occurred */ + refcount_t ref; +}; + +static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping, loff_t start, size_t len) { - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); - if (!rreq) + req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL); + if (!req) return ERR_PTR(-ENOMEM); - rreq->start = start; - rreq->len = len; - rreq->mapping = mapping; - rreq->inode = mapping->host; - INIT_LIST_HEAD(&rreq->subrequests); - refcount_set(&rreq->ref, 1); - return rreq; -} + req->mapping = mapping; + req->start = start; + req->len = len; + refcount_set(&req->ref, 1); -static void erofs_fscache_put_request(struct netfs_io_request *rreq) -{ - if (!refcount_dec_and_test(&rreq->ref)) - return; - if (rreq->cache_resources.ops) - rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); + return req; } -static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq) +static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary, + size_t len) { - if (!refcount_dec_and_test(&subreq->ref)) - return; - erofs_fscache_put_request(subreq->rreq); - kfree(subreq); -} + 
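/*
 * Illustrative rendering of the erofs_check_ondisk_layout_definitions()
 * check above: setting Z_EROFS_FRAGMENT_INODE_BIT in h_clusterbits must
 * land in the top bit of the 8-byte on-disk header. This compiles and
 * runs in userspace; plain uintN_t types stand in for the kernel's
 * __u8/__le16/__le32, so only the byte layout is exercised here, not
 * endianness conversion:
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define Z_EROFS_FRAGMENT_INODE_BIT 7

struct map_header {
	union {
		uint32_t h_fragmentoff;	/* fragment offset in the packed inode */
		struct {
			uint16_t h_reserved1;
			uint16_t h_idata_size;	/* tail-packing inline size */
		};
	};
	uint16_t h_advise;
	uint8_t  h_algorithmtype;
	uint8_t  h_clusterbits;	/* bit 7: whole file moved into packed inode */
};

int main(void)
{
	struct map_header h = {
		.h_clusterbits = 1u << Z_EROFS_FRAGMENT_INODE_BIT
	};
	uint8_t raw[8];

	assert(sizeof(h) == 8);	/* the on-disk header must stay 8 bytes */
	memcpy(raw, &h, sizeof(raw));
	/* the fragment flag must occupy the top bit of the last byte */
	assert(raw[7] == 0x80);
	puts("z_erofs_map_header layout ok");
	return 0;
}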
struct erofs_fscache_request *req; -static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; + /* use primary request for the first submission */ + if (!primary->submitted) { + refcount_inc(&primary->ref); + return primary; + } - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - list_del(&subreq->rreq_link); - erofs_fscache_put_subrequest(subreq); + req = erofs_fscache_req_alloc(primary->mapping, + primary->start + primary->submitted, len); + if (!IS_ERR(req)) { + req->primary = primary; + refcount_inc(&primary->ref); } + return req; } -static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq) +static void erofs_fscache_req_complete(struct erofs_fscache_request *req) { - struct netfs_io_subrequest *subreq; struct folio *folio; - unsigned int iopos = 0; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - bool subreq_failed = false; - - XA_STATE(xas, &rreq->mapping->i_pages, start_page); + bool failed = req->error; + pgoff_t start_page = req->start / PAGE_SIZE; + pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1; - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - subreq_failed = (subreq->error < 0); + XA_STATE(xas, &req->mapping->i_pages, start_page); rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos = - (folio_index(folio) - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + folio_size(folio); - bool pg_failed = false; - - for (;;) { - if (!subreq) { - pg_failed = true; - break; - } - - pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) - break; - - iopos += subreq->len; - if (!list_is_last(&subreq->rreq_link, - &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - if (pgend == iopos) - break; - } - - if (!pg_failed) + if (xas_retry(&xas, folio)) + continue; + if (!failed) folio_mark_uptodate(folio); - folio_unlock(folio); } rcu_read_unlock(); } -static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq) +static void erofs_fscache_req_put(struct erofs_fscache_request *req) { - erofs_fscache_rreq_unlock_folios(rreq); - erofs_fscache_clear_subrequests(rreq); - erofs_fscache_put_request(rreq); + if (refcount_dec_and_test(&req->ref)) { + if (req->cache_resources.ops) + req->cache_resources.ops->end_operation(&req->cache_resources); + if (!req->primary) + erofs_fscache_req_complete(req); + else + erofs_fscache_req_put(req->primary); + kfree(req); + } } -static void erofc_fscache_subreq_complete(void *priv, +static void erofs_fscache_subreq_complete(void *priv, ssize_t transferred_or_error, bool was_async) { - struct netfs_io_subrequest *subreq = priv; - struct netfs_io_request *rreq = subreq->rreq; - - if (IS_ERR_VALUE(transferred_or_error)) - subreq->error = transferred_or_error; - - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); + struct erofs_fscache_request *req = priv; - erofs_fscache_put_subrequest(subreq); + if (IS_ERR_VALUE(transferred_or_error)) { + if (req->primary) + req->primary->error = transferred_or_error; + else + req->error = transferred_or_error; + } + erofs_fscache_req_put(req); } /* - * Read data from fscache and fill the read data into page cache described by - * @rreq, which shall be both aligned 
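/*
 * Illustrative model of the request chaining being introduced here: the
 * first slice reuses the primary request, later slices get a secondary
 * request that pins the primary, and the primary only completes on its
 * final put. Plain ints replace refcount_t/kref and all names are
 * hypothetical stand-ins for the kernel structures:
 */
#include <stdio.h>
#include <stdlib.h>

struct req {
	struct req *primary;	/* NULL for the primary request itself */
	size_t start, len, submitted;
	int error;
	int ref;
};

static struct req *req_alloc(size_t start, size_t len)
{
	struct req *r = calloc(1, sizeof(*r));

	if (r) {
		r->start = start;
		r->len = len;
		r->ref = 1;
	}
	return r;
}

static struct req *req_chain(struct req *primary, size_t len)
{
	struct req *r;

	if (!primary->submitted) {	/* first submission: reuse primary */
		primary->ref++;
		return primary;
	}
	r = req_alloc(primary->start + primary->submitted, len);
	if (r) {
		r->primary = primary;
		primary->ref++;		/* secondary pins the primary */
	}
	return r;
}

static void req_put(struct req *r)
{
	if (--r->ref)
		return;
	if (r->primary)
		req_put(r->primary);	/* may complete the primary */
	else
		printf("complete [%zu,+%zu) err=%d\n",
		       r->start, r->len, r->error);
	free(r);
}

int main(void)
{
	struct req *p = req_alloc(0, 3 << 12), *s;

	s = req_chain(p, 1 << 12);	/* slice 1 is p itself */
	p->submitted += 1 << 12;
	req_put(s);
	s = req_chain(p, 2 << 12);	/* slice 2 gets a secondary request */
	p->submitted += 2 << 12;
	req_put(s);			/* drops secondary and its pin on p */
	req_put(p);			/* last ref: the primary completes */
	return 0;
}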
with PAGE_SIZE. @pstart describes - * the start physical address in the cache file. + * Read data from fscache (cookie, pstart, len), and fill the read data into + * page cache described by (req->mapping, lstart, len). @pstart describes the + * start physical address in the cache file. */ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, - struct netfs_io_request *rreq, loff_t pstart) + struct erofs_fscache_request *req, loff_t pstart, size_t len) { enum netfs_io_source source; - struct super_block *sb = rreq->mapping->host->i_sb; - struct netfs_io_subrequest *subreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; + struct super_block *sb = req->mapping->host->i_sb; + struct netfs_cache_resources *cres = &req->cache_resources; struct iov_iter iter; - loff_t start = rreq->start; - size_t len = rreq->len; + loff_t lstart = req->start + req->submitted; size_t done = 0; int ret; - atomic_set(&rreq->nr_outstanding, 1); + DBG_BUGON(len > req->len - req->submitted); ret = fscache_begin_read_operation(cres, cookie); if (ret) - goto out; + return ret; while (done < len) { - subreq = kzalloc(sizeof(struct netfs_io_subrequest), - GFP_KERNEL); - if (subreq) { - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->ref, 2); - subreq->rreq = rreq; - refcount_inc(&rreq->ref); - } else { - ret = -ENOMEM; - goto out; - } + loff_t sstart = pstart + done; + size_t slen = len - done; + unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; - subreq->start = pstart + done; - subreq->len = len - done; - subreq->flags = 1 << NETFS_SREQ_ONDEMAND; - - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - source = cres->ops->prepare_read(subreq, LLONG_MAX); - if (WARN_ON(subreq->len == 0)) + source = cres->ops->prepare_ondemand_read(cres, + sstart, &slen, LLONG_MAX, &flags, 0); + if (WARN_ON(slen == 0)) source = NETFS_INVALID_READ; if (source != NETFS_READ_FROM_CACHE) { - erofs_err(sb, "failed to fscache prepare_read (source %d)", - source); - ret = -EIO; - subreq->error = ret; - erofs_fscache_put_subrequest(subreq); - goto out; + erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + return -EIO; } - atomic_inc(&rreq->nr_outstanding); + refcount_inc(&req->ref); + iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages, + lstart + done, slen); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, - start + done, subreq->len); - - ret = fscache_read(cres, subreq->start, &iter, - NETFS_READ_HOLE_FAIL, - erofc_fscache_subreq_complete, subreq); + ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, + erofs_fscache_subreq_complete, req); if (ret == -EIOCBQUEUED) ret = 0; if (ret) { erofs_err(sb, "failed to fscache_read (ret %d)", ret); - goto out; + return ret; } - done += subreq->len; + done += slen; } -out: - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); - - return ret; + DBG_BUGON(done != len); + return 0; } static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_dev mdev = { .m_deviceid = 0, .m_pa = folio_pos(folio), }; ret = erofs_map_dev(sb, &mdev); - if (ret) - goto out; + if (ret) { + folio_unlock(folio); + return ret; + } - rreq = erofs_fscache_alloc_request(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) { - ret = PTR_ERR(rreq); - goto
out; + if (IS_ERR(req)) { + folio_unlock(folio); + return PTR_ERR(req); } - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa); -out: - folio_unlock(folio); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa, folio_size(folio)); + if (ret) + req->error = ret; + + erofs_fscache_req_put(req); return ret; } -static int erofs_fscache_read_folio_inline(struct folio *folio, - struct erofs_map_blocks *map) +static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) { - struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - erofs_blk_t blknr; - size_t offset, len; - void *src, *dst; - - /* For tail packing layout, the offset may be non-zero. */ - offset = erofs_blkoff(map->m_pa); - blknr = erofs_blknr(map->m_pa); - len = map->m_llen; - - src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); - if (IS_ERR(src)) - return PTR_ERR(src); - - dst = kmap_local_folio(folio, 0); - memcpy(dst, src + offset, len); - memset(dst + len, 0, PAGE_SIZE - len); - kunmap_local(dst); - - erofs_put_metabuf(&buf); - return 0; -} - -static int erofs_fscache_read_folio(struct file *file, struct folio *folio) -{ - struct inode *inode = folio_mapping(folio)->host; + struct address_space *mapping = primary->mapping; + struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; + struct erofs_fscache_request *req; struct erofs_map_blocks map; struct erofs_map_dev mdev; - struct netfs_io_request *rreq; - erofs_off_t pos; - loff_t pstart; + struct iov_iter iter; + loff_t pos = primary->start + primary->submitted; + size_t count; int ret; - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); - - pos = folio_pos(folio); map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); if (ret) - goto out_unlock; + return ret; - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - folio_zero_range(folio, 0, folio_size(folio)); - goto out_uptodate; + if (map.m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_blk_t blknr; + size_t offset, size; + void *src; + + /* For tail packing layout, the offset may be non-zero. 
*/ + offset = erofs_blkoff(map.m_pa); + blknr = erofs_blknr(map.m_pa); + size = map.m_llen; + + src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + if (IS_ERR(src)) + return PTR_ERR(src); + + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE); + if (copy_to_iter(src + offset, size, &iter) != size) { + erofs_put_metabuf(&buf); + return -EFAULT; + } + iov_iter_zero(PAGE_SIZE - size, &iter); + erofs_put_metabuf(&buf); + primary->submitted += PAGE_SIZE; + return 0; } - if (map.m_flags & EROFS_MAP_META) { - ret = erofs_fscache_read_folio_inline(folio, &map); - goto out_uptodate; + count = primary->len - primary->submitted; + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count); + iov_iter_zero(count, &iter); + primary->submitted += count; + return 0; } + count = min_t(size_t, map.m_llen - (pos - map.m_la), count); + DBG_BUGON(!count || count % PAGE_SIZE); + mdev = (struct erofs_map_dev) { .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, }; - ret = erofs_map_dev(sb, &mdev); if (ret) - goto out_unlock; + return ret; + req = erofs_fscache_req_chain(primary, count); + if (IS_ERR(req)) + return PTR_ERR(req); - rreq = erofs_fscache_alloc_request(folio_mapping(folio), - folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) { - ret = PTR_ERR(rreq); - goto out_unlock; - } + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa + (pos - map.m_la), count); + erofs_fscache_req_put(req); + primary->submitted += count; + return ret; +} + +static int erofs_fscache_data_read(struct erofs_fscache_request *req) +{ + int ret; - pstart = mdev.m_pa + (pos - map.m_la); - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, pstart); + do { + ret = erofs_fscache_data_read_slice(req); + if (ret) + req->error = ret; + } while (!ret && req->submitted < req->len); -out_uptodate: - if (!ret) - folio_mark_uptodate(folio); -out_unlock: - folio_unlock(folio); return ret; } -static void erofs_fscache_advance_folios(struct readahead_control *rac, - size_t len, bool unlock) +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) { - while (len) { - struct folio *folio = readahead_folio(rac); - len -= folio_size(folio); - if (unlock) { - folio_mark_uptodate(folio); - folio_unlock(folio); - } + struct erofs_fscache_request *req; + int ret; + + req = erofs_fscache_req_alloc(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(req)) { + folio_unlock(folio); + return PTR_ERR(req); } + + ret = erofs_fscache_data_read(req); + erofs_fscache_req_put(req); + return ret; } static void erofs_fscache_readahead(struct readahead_control *rac) { - struct inode *inode = rac->mapping->host; - struct super_block *sb = inode->i_sb; - size_t len, count, done = 0; - erofs_off_t pos; - loff_t start, offset; - int ret; + struct erofs_fscache_request *req; if (!readahead_count(rac)) return; - start = readahead_pos(rac); - len = readahead_length(rac); + req = erofs_fscache_req_alloc(rac->mapping, + readahead_pos(rac), readahead_length(rac)); + if (IS_ERR(req)) + return; - do { - struct erofs_map_blocks map; - struct erofs_map_dev mdev; - struct netfs_io_request *rreq; + /* The request completion will drop refs on the folios. 
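/*
 * Illustrative model of the "submit in slices" loop above: each pass
 * maps one extent at (start + submitted), clamps the slice to both the
 * extent and the remaining request, and advances submitted until the
 * request is covered or a slice fails. The extent table below is a
 * made-up stand-in for erofs_map_blocks():
 */
#include <stdio.h>

struct req { long start, len, submitted; int error; };

/* hypothetical extent table: { la, llen, hole } */
static const struct { long la, llen; int hole; } ext[] = {
	{ 0, 8192, 0 }, { 8192, 4096, 1 }, { 12288, 8192, 0 },
};

static int read_slice(struct req *r)
{
	long pos = r->start + r->submitted, count;

	for (unsigned i = 0; i < sizeof(ext) / sizeof(ext[0]); i++) {
		if (pos < ext[i].la || pos >= ext[i].la + ext[i].llen)
			continue;
		/* clamp the slice to this extent and to the request */
		count = ext[i].llen - (pos - ext[i].la);
		if (count > r->len - r->submitted)
			count = r->len - r->submitted;
		printf("%s [%ld,+%ld)\n",
		       ext[i].hole ? "zero" : "read", pos, count);
		r->submitted += count;
		return 0;
	}
	return -5;	/* -EIO: no extent covers pos */
}

int main(void)
{
	struct req r = { 4096, 12288, 0, 0 };
	int ret;

	do {
		ret = read_slice(&r);
		if (ret)
			r.error = ret;
	} while (!ret && r.submitted < r.len);
	return r.error ? 1 : 0;
}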
*/ + while (readahead_folio(rac)) + ; - pos = start + done; - map.m_la = pos; + erofs_fscache_data_read(req); + erofs_fscache_req_put(req); +} - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); - if (ret) - return; +static const struct address_space_operations erofs_fscache_meta_aops = { + .read_folio = erofs_fscache_meta_read_folio, +}; - offset = start + done; - count = min_t(size_t, map.m_llen - (pos - map.m_la), - len - done); +const struct address_space_operations erofs_fscache_access_aops = { + .read_folio = erofs_fscache_read_folio, + .readahead = erofs_fscache_readahead, +}; - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - struct iov_iter iter; +static void erofs_fscache_domain_put(struct erofs_domain *domain) +{ + if (!domain) + return; + mutex_lock(&erofs_domain_list_lock); + if (refcount_dec_and_test(&domain->ref)) { + list_del(&domain->list); + if (list_empty(&erofs_domain_list)) { + kern_unmount(erofs_pseudo_mnt); + erofs_pseudo_mnt = NULL; + } + mutex_unlock(&erofs_domain_list_lock); + fscache_relinquish_volume(domain->volume, NULL, false); + kfree(domain->domain_id); + kfree(domain); + return; + } + mutex_unlock(&erofs_domain_list_lock); +} - iov_iter_xarray(&iter, READ, &rac->mapping->i_pages, - offset, count); - iov_iter_zero(count, &iter); +static int erofs_fscache_register_volume(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + char *domain_id = sbi->domain_id; + struct fscache_volume *volume; + char *name; + int ret = 0; - erofs_fscache_advance_folios(rac, count, true); - ret = count; - continue; - } + name = kasprintf(GFP_KERNEL, "erofs,%s", + domain_id ? domain_id : sbi->fsid); + if (!name) + return -ENOMEM; - if (map.m_flags & EROFS_MAP_META) { - struct folio *folio = readahead_folio(rac); + volume = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(volume)) { + erofs_err(sb, "failed to register volume for %s", name); + ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; + volume = NULL; + } - ret = erofs_fscache_read_folio_inline(folio, &map); - if (!ret) { - folio_mark_uptodate(folio); - ret = folio_size(folio); - } + sbi->volume = volume; + kfree(name); + return ret; +} - folio_unlock(folio); - continue; - } +static int erofs_fscache_init_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); - mdev = (struct erofs_map_dev) { - .m_deviceid = map.m_deviceid, - .m_pa = map.m_pa, - }; - ret = erofs_map_dev(sb, &mdev); - if (ret) - return; + domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; - rreq = erofs_fscache_alloc_request(rac->mapping, offset, count); - if (IS_ERR(rreq)) - return; - /* - * Drop the ref of folios here. Unlock them in - * rreq_unlock_folios() when rreq complete. 
- */ - erofs_fscache_advance_folios(rac, count, false); - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa + (pos - map.m_la)); - if (!ret) - ret = count; - } while (ret > 0 && ((done += ret) < len)); + domain->domain_id = kstrdup(sbi->domain_id, GFP_KERNEL); + if (!domain->domain_id) { + kfree(domain); + return -ENOMEM; + } + + err = erofs_fscache_register_volume(sb); + if (err) + goto out; + + if (!erofs_pseudo_mnt) { + erofs_pseudo_mnt = kern_mount(&erofs_fs_type); + if (IS_ERR(erofs_pseudo_mnt)) { + err = PTR_ERR(erofs_pseudo_mnt); + goto out; + } + } + + domain->volume = sbi->volume; + refcount_set(&domain->ref, 1); + list_add(&domain->list, &erofs_domain_list); + sbi->domain = domain; + return 0; +out: + kfree(domain->domain_id); + kfree(domain); + return err; } -static const struct address_space_operations erofs_fscache_meta_aops = { - .read_folio = erofs_fscache_meta_read_folio, -}; +static int erofs_fscache_register_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); -const struct address_space_operations erofs_fscache_access_aops = { - .read_folio = erofs_fscache_read_folio, - .readahead = erofs_fscache_readahead, -}; + mutex_lock(&erofs_domain_list_lock); + list_for_each_entry(domain, &erofs_domain_list, list) { + if (!strcmp(domain->domain_id, sbi->domain_id)) { + sbi->domain = domain; + sbi->volume = domain->volume; + refcount_inc(&domain->ref); + mutex_unlock(&erofs_domain_list_lock); + return 0; + } + } + err = erofs_fscache_init_domain(sb); + mutex_unlock(&erofs_domain_list_lock); + return err; +} -int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode) +static +struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, + unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -432,7 +443,7 @@ int erofs_fscache_register_cookie(struct super_block *sb, ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) - return -ENOMEM; + return ERR_PTR(-ENOMEM); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -445,7 +456,7 @@ int erofs_fscache_register_cookie(struct super_block *sb, fscache_use_cookie(cookie, false); ctx->cookie = cookie; - if (need_inode) { + if (flags & EROFS_REG_COOKIE_NEED_INODE) { struct inode *const inode = new_inode(sb); if (!inode) { @@ -462,63 +473,171 @@ int erofs_fscache_register_cookie(struct super_block *sb, ctx->inode = inode; } - *fscache = ctx; - return 0; + return ctx; err_cookie: fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); - ctx->cookie = NULL; err: kfree(ctx); - return ret; + return ERR_PTR(ret); } -void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) { - struct erofs_fscache *ctx = *fscache; - - if (!ctx) - return; - fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); - ctx->cookie = NULL; - iput(ctx->inode); - ctx->inode = NULL; - + kfree(ctx->name); kfree(ctx); - *fscache = NULL; } -int erofs_fscache_register_fs(struct super_block *sb) +static +struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, + char *name, + unsigned int flags) { - struct erofs_sb_info *sbi = EROFS_SB(sb); - struct fscache_volume *volume; - char *name; - int ret = 0; + int err; + 
struct inode *inode; + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; - name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid); - if (!name) - return -ENOMEM; + ctx = erofs_fscache_acquire_cookie(sb, name, flags); + if (IS_ERR(ctx)) + return ctx; - volume = fscache_acquire_volume(name, NULL, NULL, 0); - if (IS_ERR_OR_NULL(volume)) { - erofs_err(sb, "failed to register volume for %s", name); - ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; - volume = NULL; + ctx->name = kstrdup(name, GFP_KERNEL); + if (!ctx->name) { + err = -ENOMEM; + goto out; } - sbi->volume = volume; - kfree(name); - return ret; + inode = new_inode(erofs_pseudo_mnt->mnt_sb); + if (!inode) { + err = -ENOMEM; + goto out; + } + + ctx->domain = domain; + ctx->anon_inode = inode; + inode->i_private = ctx; + refcount_inc(&domain->ref); + return ctx; +out: + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(err); +} + +static +struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, + unsigned int flags) +{ + struct inode *inode; + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + + mutex_lock(&erofs_domain_cookies_lock); + spin_lock(&psb->s_inode_list_lock); + list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { + ctx = inode->i_private; + if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + continue; + if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { + igrab(inode); + } else { + erofs_err(sb, "%s already exists in domain %s", name, + domain->domain_id); + ctx = ERR_PTR(-EEXIST); + } + spin_unlock(&psb->s_inode_list_lock); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; + } + spin_unlock(&psb->s_inode_list_lock); + ctx = erofs_fscache_domain_init_cookie(sb, name, flags); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; +} + +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, + unsigned int flags) +{ + if (EROFS_SB(sb)->domain_id) + return erofs_domain_register_cookie(sb, name, flags); + return erofs_fscache_acquire_cookie(sb, name, flags); +} + +void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) +{ + bool drop; + struct erofs_domain *domain; + + if (!ctx) + return; + domain = ctx->domain; + if (domain) { + mutex_lock(&erofs_domain_cookies_lock); + drop = atomic_read(&ctx->anon_inode->i_count) == 1; + iput(ctx->anon_inode); + mutex_unlock(&erofs_domain_cookies_lock); + if (!drop) + return; + } + + erofs_fscache_relinquish_cookie(ctx); + erofs_fscache_domain_put(domain); +} + +int erofs_fscache_register_fs(struct super_block *sb) +{ + int ret; + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; + unsigned int flags; + + if (sbi->domain_id) + ret = erofs_fscache_register_domain(sb); + else + ret = erofs_fscache_register_volume(sb); + if (ret) + return ret; + + /* + * When shared domain is enabled, using NEED_NOEXIST to guarantee + * the primary data blob (aka fsid) is unique in the shared domain. + * + * For non-shared-domain case, fscache_acquire_volume() invoked by + * erofs_fscache_register_volume() has already guaranteed + * the uniqueness of primary data blob. + * + * Acquired domain/volume will be relinquished in kill_sb() on error. 
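/*
 * Illustrative model of the domain registration path above: it is a
 * find-or-create under a lock, reusing an existing refcounted domain
 * with a matching domain_id or initialising a new one onto a global
 * list, with unlink-and-free on the last put. Single-threaded sketch;
 * the mutexes that make the kernel version safe are deliberately
 * elided, and all names are stand-ins:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct domain {
	struct domain *next;
	char *id;
	int ref;
};

static struct domain *domains;

static struct domain *domain_register(const char *id)
{
	struct domain *d;

	for (d = domains; d; d = d->next)
		if (!strcmp(d->id, id)) {
			d->ref++;	/* share the existing domain */
			return d;
		}
	d = calloc(1, sizeof(*d));
	if (!d || !(d->id = strdup(id))) {
		free(d);
		return NULL;
	}
	d->ref = 1;
	d->next = domains;
	domains = d;
	return d;
}

static void domain_put(struct domain *d)
{
	if (--d->ref)
		return;
	/* unlink and free on the last put */
	for (struct domain **p = &domains; *p; p = &(*p)->next)
		if (*p == d) {
			*p = d->next;
			break;
		}
	free(d->id);
	free(d);
}

int main(void)
{
	struct domain *a = domain_register("global");
	struct domain *b = domain_register("global");	/* reused */

	printf("shared: %s, ref=%d\n", (a == b) ? "yes" : "no", a->ref);
	domain_put(b);
	domain_put(a);
	return 0;
}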
+ */ + flags = EROFS_REG_COOKIE_NEED_INODE; + if (sbi->domain_id) + flags |= EROFS_REG_COOKIE_NEED_NOEXIST; + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + + sbi->s_fscache = fscache; + return 0; } void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - fscache_relinquish_volume(sbi->volume, NULL, false); + erofs_fscache_unregister_cookie(sbi->s_fscache); + + if (sbi->domain) + erofs_fscache_domain_put(sbi->domain); + else + fscache_relinquish_volume(sbi->volume, NULL, false); + + sbi->s_fscache = NULL; sbi->volume = NULL; + sbi->domain = NULL; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 95a403720e8c..d3b8736fa124 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -214,7 +214,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= EROFS_BLKSIZ) { + inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -241,7 +241,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, return 0; } -static int erofs_fill_inode(struct inode *inode, int isdir) +static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; @@ -249,7 +249,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) unsigned int ofs; int err = 0; - trace_erofs_fill_inode(inode, isdir); + trace_erofs_fill_inode(inode); /* read inode base data from disk */ kaddr = erofs_read_inode(&buf, inode, &ofs); @@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) case S_IFDIR: inode->i_op = &erofs_dir_iops; inode->i_fop = &erofs_dir_fops; + inode_nohighmem(inode); break; case S_IFLNK: err = erofs_fill_symlink(inode, kaddr, ofs); @@ -295,6 +296,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; @@ -324,21 +326,13 @@ static int erofs_iget_set_actor(struct inode *inode, void *opaque) return 0; } -static inline struct inode *erofs_iget_locked(struct super_block *sb, - erofs_nid_t nid) +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { const unsigned long hashval = erofs_inode_hash(nid); + struct inode *inode; - return iget5_locked(sb, hashval, erofs_ilookup_test_actor, + inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, erofs_iget_set_actor, &nid); -} - -struct inode *erofs_iget(struct super_block *sb, - erofs_nid_t nid, - bool isdir) -{ - struct inode *inode = erofs_iget_locked(sb, nid); - if (!inode) return ERR_PTR(-ENOMEM); @@ -348,10 +342,10 @@ struct inode *erofs_iget(struct super_block *sb, vi->nid = nid; - err = erofs_fill_inode(inode, isdir); - if (!err) + err = erofs_fill_inode(inode); + if (!err) { unlock_new_inode(inode); - else { + } else { iget_failed(inode); inode = ERR_PTR(err); } @@ -379,7 +373,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; @@ -387,12 +381,12 @@ const struct 
inode_operations erofs_symlink_iops = { .get_link = page_get_link, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, }; const struct inode_operations erofs_fast_symlink_iops = { .get_link = simple_get_link, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, }; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index a01cc82795a2..bb8501c0ff5b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -75,7 +75,6 @@ struct erofs_mount_opts { unsigned int max_sync_decompress_pages; #endif unsigned int mount_opt; - char *fsid; }; struct erofs_dev_context { @@ -88,6 +87,8 @@ struct erofs_dev_context { struct erofs_fs_context { struct erofs_mount_opts opt; struct erofs_dev_context *devs; + char *fsid; + char *domain_id; }; /* all filesystem-wide lz4 configurations */ @@ -98,9 +99,19 @@ struct erofs_sb_lz4_info { u16 max_pclusterblks; }; +struct erofs_domain { + refcount_t ref; + struct list_head list; + struct fscache_volume *volume; + char *domain_id; +}; + struct erofs_fscache { struct fscache_cookie *cookie; struct inode *inode; + struct inode *anon_inode; + struct erofs_domain *domain; + char *name; }; struct erofs_sb_info { @@ -120,6 +131,7 @@ struct erofs_sb_info { struct inode *managed_cache; struct erofs_sb_lz4_info lz4; + struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ struct erofs_dev_context *devs; struct dax_device *dax_dev; @@ -157,6 +169,9 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; struct erofs_fscache *s_fscache; + struct erofs_domain *domain; + char *fsid; + char *domain_id; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -183,7 +198,6 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -#ifdef CONFIG_EROFS_FS_ZIP #define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) /* basic unit of the workstation of a super_block */ @@ -223,7 +237,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) return atomic_cond_read_relaxed(&grp->refcount, VAL != EROFS_LOCKED_MAGIC); } -#endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ #define LOG_BLOCK_SIZE PAGE_SHIFT @@ -242,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap() to map the buffer */ - EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */ + EROFS_KMAP, /* use kmap_local_page() to map the buffer */ }; struct erofs_buf { @@ -277,6 +289,8 @@ EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) +EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) +EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -312,8 +326,13 @@ struct erofs_inode { unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; unsigned long z_tailextent_headlcn; - erofs_off_t z_idataoff; - unsigned short z_idata_size; + union { + struct { + erofs_off_t z_idataoff; + unsigned short z_idata_size; + }; + erofs_off_t z_fragmentoff; + }; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -364,6 +383,7 @@ struct page *erofs_grab_cache_page_nowait(struct address_space 
*mapping, } extern const struct super_operations erofs_sops; +extern struct file_system_type erofs_fs_type; extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; @@ -371,6 +391,8 @@ extern const struct address_space_operations z_erofs_aops; enum { BH_Encoded = BH_PrivateStart, BH_FullMapped, + BH_Fragment, + BH_Partialref, }; /* Has a disk mapping */ @@ -381,6 +403,10 @@ enum { #define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +/* Located in the special packed inode */ +#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +/* The extent refers to partial decompressed data */ +#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) struct erofs_map_blocks { struct erofs_buf buf; @@ -402,11 +428,12 @@ struct erofs_map_blocks { #define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* Used to map the whole extent if non-negligible data is requested for LZMA */ #define EROFS_GET_BLOCKS_READMORE 0x0004 -/* Used to map tail extent for tailpacking inline pcluster */ +/* Used to map tail extent for tailpacking inline or fragment pcluster */ #define EROFS_GET_BLOCKS_FINDTAIL 0x0008 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_INTERLACED, Z_EROFS_COMPRESSION_RUNTIME_MAX }; @@ -466,7 +493,7 @@ extern const struct inode_operations erofs_generic_iops; extern const struct inode_operations erofs_symlink_iops; extern const struct inode_operations erofs_fast_symlink_iops; -struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); @@ -576,32 +603,37 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_NEED_INODE 1 +#define EROFS_REG_COOKIE_NEED_NOEXIST 2 + /* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); -int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode); -void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache); +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, + unsigned int flags); +void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); extern const struct address_space_operations erofs_fscache_access_aops; #else static inline int erofs_fscache_register_fs(struct super_block *sb) { - return 0; + return -EOPNOTSUPP; } static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} -static inline int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode) +static inline +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, + unsigned int flags) { - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); } -static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) { } #endif diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index fd75506799c4..b64a108fac92 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -185,7 +185,6 @@ int 
erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, if (IS_ERR(de)) return PTR_ERR(de); - /* the target page has been mapped */ if (ndirents) de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents); @@ -197,9 +196,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, return PTR_ERR_OR_ZERO(de); } -/* NOTE: i_mutex is already held by vfs */ -static struct dentry *erofs_lookup(struct inode *dir, - struct dentry *dentry, +static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int err; @@ -207,17 +204,11 @@ static struct dentry *erofs_lookup(struct inode *dir, unsigned int d_type; struct inode *inode; - DBG_BUGON(!d_really_is_negative(dentry)); - /* dentry must be unhashed in lookup, no need to worry about */ - DBG_BUGON(!d_unhashed(dentry)); - trace_erofs_lookup(dir, dentry, flags); - /* file name exceeds fs limit */ if (dentry->d_name.len > EROFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - /* false uninitialized warnings on gcc 4.8.x */ err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); if (err == -ENOENT) { @@ -228,7 +219,7 @@ static struct dentry *erofs_lookup(struct inode *dir, } else { erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__, dentry, nid, d_type); - inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); + inode = erofs_iget(dir->i_sb, nid); } return d_splice_alias(inode, dentry); } @@ -237,6 +228,6 @@ const struct inode_operations erofs_dir_iops = { .lookup = erofs_lookup, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 3173debeaa5a..481788c24a68 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -224,10 +224,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_device_info *dif, erofs_off_t *pos) { struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; struct erofs_deviceslot *dis; struct block_device *bdev; void *ptr; - int ret; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); if (IS_ERR(ptr)) @@ -245,10 +245,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, } if (erofs_is_fscache_mode(sb)) { - ret = erofs_fscache_register_cookie(sb, &dif->fscache, - dif->path, false); - if (ret) - return ret; + fscache = erofs_fscache_register_cookie(sb, dif->path, 0); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + dif->fscache = fscache; } else { bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, sb->s_type); @@ -381,6 +381,17 @@ static int erofs_read_superblock(struct super_block *sb) #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); +#ifdef CONFIG_EROFS_FS_ZIP + sbi->packed_inode = NULL; + if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) { + sbi->packed_inode = + erofs_iget(sb, le64_to_cpu(dsb->packed_nid)); + if (IS_ERR(sbi->packed_inode)) { + ret = PTR_ERR(sbi->packed_inode); + goto out; + } + } +#endif sbi->inos = le64_to_cpu(dsb->inos); sbi->build_time = le64_to_cpu(dsb->build_time); @@ -411,6 +422,10 @@ static int erofs_read_superblock(struct super_block *sb) erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. 
Use at your own risk!"); + if (erofs_sb_has_fragments(sbi)) + erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); + if (erofs_sb_has_dedupe(sbi)) + erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; @@ -440,6 +455,7 @@ enum { Opt_dax_enum, Opt_device, Opt_fsid, + Opt_domain_id, Opt_err }; @@ -465,6 +481,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), + fsparam_string("domain_id", Opt_domain_id), {} }; @@ -562,14 +579,24 @@ static int erofs_fc_parse_param(struct fs_context *fc, break; case Opt_fsid: #ifdef CONFIG_EROFS_FS_ONDEMAND - kfree(ctx->opt.fsid); - ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL); - if (!ctx->opt.fsid) + kfree(ctx->fsid); + ctx->fsid = kstrdup(param->string, GFP_KERNEL); + if (!ctx->fsid) return -ENOMEM; #else errorfc(fc, "fsid option not supported"); #endif break; + case Opt_domain_id: +#ifdef CONFIG_EROFS_FS_ONDEMAND + kfree(ctx->domain_id); + ctx->domain_id = kstrdup(param->string, GFP_KERNEL); + if (!ctx->domain_id) + return -ENOMEM; +#else + errorfc(fc, "domain_id option not supported"); +#endif + break; default: return -ENOPARAM; } @@ -641,7 +668,7 @@ static int erofs_init_managed_cache(struct super_block *sb) { return 0; } static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { - return erofs_iget(sb, ino, false); + return erofs_iget(sb, ino); } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, @@ -667,7 +694,7 @@ static struct dentry *erofs_get_parent(struct dentry *child) err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); if (err) return ERR_PTR(err); - return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR)); + return d_obtain_alias(erofs_iget(child->d_sb, nid)); } static const struct export_operations erofs_export_ops = { @@ -676,6 +703,13 @@ static const struct export_operations erofs_export_ops = { .get_parent = erofs_get_parent, }; +static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc) +{ + static const struct tree_descr empty_descr = {""}; + + return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr); +} + static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; @@ -694,9 +728,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; - ctx->opt.fsid = NULL; sbi->devs = ctx->devs; ctx->devs = NULL; + sbi->fsid = ctx->fsid; + ctx->fsid = NULL; + sbi->domain_id = ctx->domain_id; + ctx->domain_id = NULL; if (erofs_is_fscache_mode(sb)) { sb->s_blocksize = EROFS_BLKSIZ; @@ -706,11 +743,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - err = erofs_fscache_register_cookie(sb, &sbi->s_fscache, - sbi->opt.fsid, true); - if (err) - return err; - err = super_setup_bdi(sb); if (err) return err; @@ -752,7 +784,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) #endif /* get the root inode */ - inode = erofs_iget(sb, ROOT_NID(sbi), true); + inode = erofs_iget(sb, ROOT_NID(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -781,11 +813,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return 0; } +static int erofs_fc_anon_get_tree(struct fs_context *fc) +{ + 
return get_tree_nodev(fc, erofs_fc_fill_pseudo_super); +} + static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_fs_context *ctx = fc->fs_private; - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid) + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); return get_tree_bdev(fc, erofs_fc_fill_super); @@ -799,6 +836,9 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); + if (ctx->fsid || ctx->domain_id) + erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else @@ -817,7 +857,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); - erofs_fscache_unregister_cookie(&dif->fscache); + erofs_fscache_unregister_cookie(dif->fscache); + dif->fscache = NULL; kfree(dif->path); kfree(dif); return 0; @@ -837,7 +878,8 @@ static void erofs_fc_free(struct fs_context *fc) struct erofs_fs_context *ctx = fc->fs_private; erofs_free_dev_context(ctx->devs); - kfree(ctx->opt.fsid); + kfree(ctx->fsid); + kfree(ctx->domain_id); kfree(ctx); } @@ -848,10 +890,21 @@ static const struct fs_context_operations erofs_context_ops = { .free = erofs_fc_free, }; +static const struct fs_context_operations erofs_anon_context_ops = { + .get_tree = erofs_fc_anon_get_tree, +}; + static int erofs_init_fs_context(struct fs_context *fc) { - struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + struct erofs_fs_context *ctx; + + /* pseudo mount for anon inodes */ + if (fc->sb_flags & SB_KERNMOUNT) { + fc->ops = &erofs_anon_context_ops; + return 0; + } + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); @@ -878,8 +931,14 @@ static void erofs_kill_sb(struct super_block *sb) WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + /* pseudo mount for anon inodes */ + if (sb->s_flags & SB_KERNMOUNT) { + kill_anon_super(sb); + return; + } + if (erofs_is_fscache_mode(sb)) - generic_shutdown_super(sb); + kill_anon_super(sb); else kill_block_super(sb); @@ -889,9 +948,9 @@ static void erofs_kill_sb(struct super_block *sb) erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); - erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); - kfree(sbi->opt.fsid); + kfree(sbi->fsid); + kfree(sbi->domain_id); kfree(sbi); sb->s_fs_info = NULL; } @@ -908,11 +967,13 @@ static void erofs_put_super(struct super_block *sb) #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; + iput(sbi->packed_inode); + sbi->packed_inode = NULL; #endif - erofs_fscache_unregister_cookie(&sbi->s_fscache); + erofs_fscache_unregister_fs(sb); } -static struct file_system_type erofs_fs_type = { +struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, @@ -1042,8 +1103,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); #ifdef CONFIG_EROFS_FS_ONDEMAND - if (opt->fsid) - seq_printf(seq, ",fsid=%s", opt->fsid); + if (sbi->fsid) + seq_printf(seq, ",fsid=%s", sbi->fsid); + if (sbi->domain_id) + seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif return 0; } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index c1383e508bbe..fd476961f742 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c 
@@ -76,6 +76,8 @@ EROFS_ATTR_FEATURE(device_table); EROFS_ATTR_FEATURE(compr_head2); EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); +EROFS_ATTR_FEATURE(fragments); +EROFS_ATTR_FEATURE(dedupe); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -86,6 +88,8 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(compr_head2), ATTR_LIST(sb_chksum), ATTR_LIST(ztailpacking), + ATTR_LIST(fragments), + ATTR_LIST(dedupe), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); @@ -201,12 +205,27 @@ static struct kobject erofs_feat = { int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); + char *name; + char *str = NULL; int err; + if (erofs_is_fscache_mode(sb)) { + if (sbi->domain_id) { + str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id, + sbi->fsid); + if (!str) + return -ENOMEM; + name = str; + } else { + name = sbi->fsid; + } + } else { + name = sb->s_id; + } sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", - erofs_is_fscache_mode(sb) ? sbi->opt.fsid : sb->s_id); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); + kfree(str); if (err) goto put_sb_kobj; return 0; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 8106bcb5a38d..a62fb8a3318a 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) it->blkaddr += erofs_blknr(it->ofs); it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); it->ofs = erofs_blkoff(it->ofs); @@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); return vi->xattr_isize - xattr_header_sz; @@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; @@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 332462c59f11..0a43c9ee9f8f 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -39,9 +39,7 @@ static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; -#ifdef CONFIG_EROFS_FS_SECURITY extern const struct xattr_handler erofs_xattr_security_handler; -#endif static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5792ca9e0d5e..ccf7c55d477f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -7,6 +7,7 @@ #include "zdata.h" #include "compress.h" #include <linux/prefetch.h> +#include <linux/psi.h> #include <trace/events/erofs.h> @@ -174,16 
+175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* how to allocate cached pages for a pcluster */ -enum z_erofs_cache_alloctype { - DONTALLOC, /* don't allocate any cached pages */ - /* - * try to use cached I/O if page allocation succeeds or fallback - * to in-place I/O instead to avoid any direct reclaim. - */ - TRYALLOC, -}; - /* * tagged pointer with 1-bit tag for all compressed pages * tag 0 - the page is just found with an extra page reference @@ -291,12 +282,29 @@ struct z_erofs_decompress_frontend { .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } +static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +{ + unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; + + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + fe->map.m_la < fe->headoffset) + return true; + + return false; +} + static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - enum z_erofs_cache_alloctype type, struct page **pagepool) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* * optimistic allocation without direct reclaim since inplace I/O @@ -325,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } else { /* I/O is needed, not possible to decompress directly */ standalone = false; - switch (type) { - case TRYALLOC: - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) - continue; - set_page_private(newpage, - Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); - break; - default: /* DONTALLOC */ + if (!shouldalloc) continue; - } + + /* + * try to use cached I/O if page allocation + * succeeds or fallback to in-place I/O instead + * to avoid any direct reclaim.
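The hunk above replaces the DONTALLOC/TRYALLOC enum with a single predicate, z_erofs_should_alloc_cache(). A minimal userspace sketch of that decision follows; the struct fields and strategy constants only mirror the kernel's EROFS_ZIP_CACHE_* values and are illustrative, not the kernel API:

    #include <stdbool.h>
    #include <stdio.h>

    enum cache_strategy { CACHE_DISABLED, CACHE_READAHEAD, CACHE_READAROUND };

    struct frontend {
        enum cache_strategy strategy;
        bool backmost;        /* decompressing the tail extent of the request */
        long long m_la;       /* logical offset of the current extent */
        long long headoffset; /* start offset of the whole read request */
    };

    static bool should_alloc_cache(const struct frontend *fe)
    {
        if (fe->strategy <= CACHE_DISABLED)
            return false;
        if (fe->backmost)
            return true;
        /* readaround also caches extents before the request head */
        return fe->strategy >= CACHE_READAROUND && fe->m_la < fe->headoffset;
    }

    int main(void)
    {
        struct frontend fe = { CACHE_READAROUND, false, 4096, 8192 };
        printf("%d\n", should_alloc_cache(&fe)); /* prints 1 */
        return 0;
    }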
+ */ + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); } if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, @@ -487,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) struct erofs_workgroup *grp; int err; - if (!(map->m_flags & EROFS_MAP_ENCODED)) { + if (!(map->m_flags & EROFS_MAP_ENCODED) || + (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -636,30 +646,45 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, - unsigned int cachestrategy, - erofs_off_t la) +static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, + struct page *page, unsigned int pageofs, + unsigned int len) { - if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) - return false; + struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + u8 *src, *dst; + unsigned int i, cnt; - if (fe->backmost) - return true; + if (!packed_inode) + return -EFSCORRUPTED; - return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && - la < fe->headoffset; + pos += EROFS_I(inode)->z_fragmentoff; + for (i = 0; i < len; i += cnt) { + cnt = min_t(unsigned int, len - i, + EROFS_BLKSIZ - erofs_blkoff(pos)); + src = erofs_bread(&buf, packed_inode, + erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(src)) { + erofs_put_metabuf(&buf); + return PTR_ERR(src); + } + + dst = kmap_local_page(page); + memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt); + kunmap_local(dst); + pos += cnt; + } + erofs_put_metabuf(&buf); + return 0; } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - - enum z_erofs_cache_alloctype cache_strategy; unsigned int cur, end, spiltted; int err = 0; @@ -688,7 +713,8 @@ repeat: /* didn't get a valid pcluster previously (very rare) */ } - if (!(map->m_flags & EROFS_MAP_MAPPED)) + if (!(map->m_flags & EROFS_MAP_MAPPED) || + map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; err = z_erofs_collector_begin(fe); @@ -712,13 +738,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, - map->m_la)) - cache_strategy = TRYALLOC; - else - cache_strategy = DONTALLOC; - - z_erofs_bind_cache(fe, cache_strategy, pagepool); + z_erofs_bind_cache(fe, pagepool); } hitted: /* @@ -735,6 +755,24 @@ hitted: zero_user_segment(page, cur, end); goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { + unsigned int pageofs, skip, len; + + if (offset > map->m_la) { + pageofs = 0; + skip = offset - map->m_la; + } else { + pageofs = map->m_la & ~PAGE_MASK; + skip = 0; + } + len = min_t(unsigned int, map->m_llen - skip, end - cur); + err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + if (err) + goto out; + ++spiltted; + tight = false; + goto next_part; + } exclusive = (!cur && (!spiltted || tight)); if (cur) @@ -764,14 +802,14 @@ retry: ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; - - if ((map->m_flags & 
EROFS_MAP_FULL_MAPPED) && - fe->pcl->length == map->m_llen) - fe->pcl->partial = false; if (fe->pcl->length < offset + end - map->m_la) { fe->pcl->length = offset + end - map->m_la; fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; } + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; next_part: /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; @@ -838,15 +876,13 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { unsigned int pgnr; - struct page *oldpage; pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pgnr >= be->nr_pages); - oldpage = be->decompressed_pages[pgnr]; - be->decompressed_pages[pgnr] = bvec->page; - - if (!oldpage) + if (!be->decompressed_pages[pgnr]) { + be->decompressed_pages[pgnr] = bvec->page; return; + } } /* (cold path) one pcluster is requested multiple times */ @@ -1365,6 +1401,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; + unsigned long pflags; + int memstall = 0; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1415,9 +1453,18 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); + if (memstall) { + psi_memstall_leave(&pflags); + memstall = 0; + } bio = NULL; } + if (unlikely(PageWorkingset(page)) && !memstall) { + psi_memstall_enter(&pflags); + memstall = 1; + } + if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); @@ -1445,8 +1492,11 @@ submit_bio_retry: move_to_bypass_jobqueue(pcl, qtail, owned_head); } while (owned_head != Z_EROFS_PCLUSTER_TAIL); - if (bio) + if (bio) { submit_bio(bio); + if (memstall) + psi_memstall_leave(&pflags); + } /* * although background is preferred, no one is pending for submission. 
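The zdata.c hunks above bracket compressed-bio submission with psi_memstall_enter()/psi_memstall_leave(), so re-reading pages that were already in the workingset is charged as memory stall time. A stand-alone sketch of the bracketing pattern; the stub functions only model the two psi calls and submit_bio(), they are not the kernel implementation:

    #include <stdbool.h>
    #include <stdio.h>

    static void memstall_enter(unsigned long *pflags) { (void)pflags; puts("enter memstall"); }
    static void memstall_leave(unsigned long *pflags) { (void)pflags; puts("leave memstall"); }
    static void submit_bio_stub(void) { puts("submit bio"); }

    int main(void)
    {
        bool page_is_workingset[] = { false, true, true, false };
        unsigned long pflags;
        int memstall = 0;

        for (int i = 0; i < 4; i++) {
            /* enter once, when the first thrashing page joins the bio */
            if (page_is_workingset[i] && !memstall) {
                memstall_enter(&pflags);
                memstall = 1;
            }
        }
        submit_bio_stub();
        /* leave only after the bio carrying those pages is submitted */
        if (memstall)
            memstall_leave(&pflags);
        return 0;
    }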
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index e7f04c4fbb81..d98c95212985 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -126,10 +126,10 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) } /* - * bit 31: I/O error occurred on this page - * bit 0 - 30: remaining parts to complete this page + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page */ -#define Z_EROFS_PAGE_EIO (1 << 31) +#define Z_EROFS_PAGE_EIO (1 << 30) static inline void z_erofs_onlinepage_init(struct page *page) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index d58549ca1df9..0150570c33aa 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -17,7 +17,7 @@ int z_erofs_fill_inode(struct inode *inode) struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!erofs_sb_has_big_pcluster(sbi) && - !erofs_sb_has_ztailpacking(sbi) && + !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; @@ -55,20 +55,25 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; - DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && - !erofs_sb_has_ztailpacking(EROFS_SB(sb)) && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), - EROFS_KMAP_ATOMIC); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out_unlock; } h = kaddr + erofs_blkoff(pos); + /* + * if the highest bit of the 8-byte map header is set, the whole file + * is stored in the packed inode. The remaining bits keep z_fragmentoff.
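A decode sketch for the rule the comment above describes, assuming only what it states: the most significant bit of the 8-byte map header flags a whole-file fragment inode, and the remaining 63 bits carry z_fragmentoff (the XOR below simply clears the flag bit). The sample value is arbitrary and Z_EROFS_FRAGMENT_INODE_BIT is abstracted away:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t header = (1ULL << 63) | 0x1234; /* example raw header */
        bool whole_file_in_packed_inode = header >> 63;
        uint64_t fragmentoff = header ^ (1ULL << 63); /* clear the flag bit */

        if (whole_file_in_packed_inode)
            printf("fragment offset %llu in the packed inode\n",
                   (unsigned long long)fragmentoff);
        return 0;
    }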
+ */ + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); + vi->z_tailextent_headlcn = 0; + goto done; + } vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; @@ -79,7 +84,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; - goto unmap_done; + goto out_put_metabuf; } vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); @@ -89,7 +94,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto unmap_done; + goto out_put_metabuf; } if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ @@ -97,12 +102,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto unmap_done; + goto out_put_metabuf; } -unmap_done: - erofs_put_metabuf(&buf); - if (err) - goto out_unlock; if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { struct erofs_map_blocks map = { @@ -121,11 +122,28 @@ unmap_done: err = -EFSCORRUPTED; } if (err < 0) - goto out_unlock; + goto out_put_metabuf; } + + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && + !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + if (err < 0) + goto out_put_metabuf; + } +done: /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +out_put_metabuf: + erofs_put_metabuf(&buf); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; @@ -143,20 +161,9 @@ struct z_erofs_maprecorder { u16 delta[2]; erofs_blk_t pblk, compressedblks; erofs_off_t nextpackoff; + bool partialref; }; -static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, - erofs_blk_t eblk) -{ - struct super_block *const sb = m->inode->i_sb; - - m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk, - EROFS_KMAP_ATOMIC); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - return 0; -} - static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned long lcn) { @@ -169,11 +176,11 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, lcn * sizeof(struct z_erofs_vle_decompressed_index); struct z_erofs_vle_decompressed_index *di; unsigned int advise, type; - int err; - err = z_erofs_reload_indexes(m, erofs_blknr(pos)); - if (err) - return err; + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); m->lcn = lcn; @@ -201,6 +208,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + if (advise & Z_EROFS_VLE_DI_PARTIAL_REF) + m->partialref = true; m->clusterofs = le16_to_cpu(di->di_clusterofs); 
m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -370,7 +379,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; erofs_off_t pos; - int err; if (lclusterbits != 12) return -EOPNOTSUPP; @@ -407,9 +415,10 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); - err = z_erofs_reload_indexes(m, erofs_blknr(pos)); - if (err) - return err; + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } @@ -598,6 +607,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, { struct erofs_inode *const vi = EROFS_I(inode); bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; struct z_erofs_maprecorder m = { .inode = inode, .map = map, @@ -663,28 +673,47 @@ static int z_erofs_do_map_blocks(struct inode *inode, err = -EOPNOTSUPP; goto unmap_out; } - + if (m.partialref) + map->m_flags |= EROFS_MAP_PARTIAL_REF; map->m_llen = end - map->m_la; - if (flags & EROFS_GET_BLOCKS_FINDTAIL) + if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; + /* for non-compact indexes, fragmentoff is 64 bits */ + if (fragment && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + vi->z_fragmentoff |= (u64)m.pblk << 32; + } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; map->m_pa = vi->z_idataoff; map->m_plen = vi->z_idata_size; + } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { + map->m_flags |= EROFS_MAP_FRAGMENT; } else { map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) - goto out; + goto unmap_out; } - if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) - map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (map->m_llen > map->m_plen) { + DBG_BUGON(1); + err = -EFSCORRUPTED; + goto unmap_out; + } + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = + Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { map->m_algorithmformat = vi->z_algorithmtype[1]; - else + } else { map->m_algorithmformat = vi->z_algorithmtype[0]; + } if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && @@ -694,21 +723,19 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; } + unmap_out: erofs_unmap_metabuf(&m.map->buf); - -out: erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", __func__, map->m_la, map->m_pa, map->m_llen, map->m_plen, map->m_flags); - return err; } -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { + struct erofs_inode *const vi = EROFS_I(inode); int err = 0; trace_z_erofs_map_blocks_iter_enter(inode, map, flags); @@ -725,6 +752,15 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto out; + if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + 
map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | + EROFS_MAP_FRAGMENT; + goto out; + } + err = z_erofs_do_map_blocks(inode, map, flags); out: trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); @@ -751,7 +787,8 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? + IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; diff --git a/fs/eventfd.c b/fs/eventfd.c index 3627dd7d25db..249ca6c0b784 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -43,21 +43,7 @@ struct eventfd_ctx { int id; }; -/** - * eventfd_signal - Adds @n to the eventfd counter. - * @ctx: [in] Pointer to the eventfd context. - * @n: [in] Value of the counter to be added to the eventfd internal counter. - * The value cannot be negative. - * - * This function is supposed to be called by the kernel in paths that do not - * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returning a EPOLLERR - * to poll(2). - * - * Returns the amount by which the counter was incremented. This will be less - * than @n if the counter has overflowed. - */ -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask) { unsigned long flags; @@ -69,21 +55,40 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ - if (WARN_ON_ONCE(current->in_eventfd_signal)) + if (WARN_ON_ONCE(current->in_eventfd)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); - current->in_eventfd_signal = 1; + current->in_eventfd = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) - wake_up_locked_poll(&ctx->wqh, EPOLLIN); - current->in_eventfd_signal = 0; + wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); + current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; } + +/** + * eventfd_signal - Adds @n to the eventfd counter. + * @ctx: [in] Pointer to the eventfd context. + * @n: [in] Value of the counter to be added to the eventfd internal counter. + * The value cannot be negative. + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returning a EPOLLERR + * to poll(2). + * + * Returns the amount by which the counter was incremented. This will be less + * than @n if the counter has overflowed. 
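A userspace sketch of the saturating add that the new eventfd_signal_mask() keeps from the old eventfd_signal(): the increment is clamped so the counter never wraps past ULLONG_MAX, and the clamped amount is what the caller sees returned:

    #include <limits.h>
    #include <stdio.h>

    static unsigned long long signal_add(unsigned long long *count,
                                         unsigned long long n)
    {
        if (ULLONG_MAX - *count < n)
            n = ULLONG_MAX - *count; /* saturate instead of wrapping */
        *count += n;
        return n;                    /* may be less than requested */
    }

    int main(void)
    {
        unsigned long long count = ULLONG_MAX - 1;
        printf("added %llu\n", signal_add(&count, 5)); /* added 1 */
        return 0;
    }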
+ */ +__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +{ + return eventfd_signal_mask(ctx, n, 0); +} EXPORT_SYMBOL_GPL(eventfd_signal); static void eventfd_free_ctx(struct eventfd_ctx *ctx) @@ -253,8 +258,10 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) __set_current_state(TASK_RUNNING); } eventfd_ctx_do_read(ctx, &ucnt); + current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); + current->in_eventfd = 0; spin_unlock_irq(&ctx->wqh.lock); if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt))) return -EFAULT; @@ -301,8 +308,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c } if (likely(res > 0)) { ctx->count += ucnt; + current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); + current->in_eventfd = 0; } spin_unlock_irq(&ctx->wqh.lock); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b56b94e2f56..64659b110973 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -491,7 +491,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, + unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; @@ -522,16 +523,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; - wake_up_locked_poll(&ep->poll_wait, EPOLLIN); + wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else -static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, + unsigned pollflags) { - wake_up_poll(&ep->poll_wait, EPOLLIN); + wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif @@ -742,7 +744,7 @@ static void ep_free(struct eventpoll *ep) /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0); /* * We need to lock this because we could be hit by @@ -1065,7 +1067,7 @@ static inline bool list_add_tail_lockless(struct list_head *new, * added to the list from another CPU: the winner observes * new->next == new. 
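The eventpoll hunk below turns cmpxchg(&new->next, new, head) != new into !try_cmpxchg(&new->next, &new, head). A sketch of the same conversion using the GCC/Clang __atomic builtin rather than the kernel primitive: try_cmpxchg() returns a boolean and rewrites the expected value in place on failure, so no separate comparison is needed:

    #include <stdbool.h>
    #include <stdio.h>

    static bool try_cmpxchg_ptr(void **ptr, void **expected, void *desired)
    {
        return __atomic_compare_exchange_n(ptr, expected, desired, false,
                                           __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }

    int main(void)
    {
        int head, node;
        void *next = &node;     /* self-pointer: the entry is unlinked */
        void *expected = &node;

        if (!try_cmpxchg_ptr(&next, &expected, &head))
            puts("lost the race"); /* another CPU linked it first */
        else
            puts("linked");        /* prints "linked" */
        return 0;
    }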
*/ - if (cmpxchg(&new->next, new, head) != new) + if (!try_cmpxchg(&new->next, &new, head)) return false; /* @@ -1208,7 +1210,7 @@ out_unlock: /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, epi); + ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; @@ -1553,7 +1555,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0); return 0; } @@ -1629,7 +1631,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0); return 0; } diff --git a/fs/exec.c b/fs/exec.c index d046dbb9cbd0..ab913243a367 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -28,7 +28,6 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> -#include <linux/vmacache.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> @@ -65,6 +64,7 @@ #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> +#include <linux/time_namespace.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -172,7 +172,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) exit: fput(file); out: - return error; + return error; } #endif /* #ifdef CONFIG_USELIB */ @@ -200,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; - unsigned int gup_flags = FOLL_FORCE; + unsigned int gup_flags = 0; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -683,6 +683,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + struct vm_area_struct *next; struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -691,7 +693,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * ensure there are no vmas between where we want to go * and where we are */ - if (vma != find_vma(mm, new_start)) + if (vma != vma_next(&vmi)) return -EFAULT; /* @@ -710,12 +712,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) lru_add_drain(); tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -724,7 +727,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); @@ -840,16 +843,13 @@ int setup_arg_pages(struct linux_binprm *bprm, * will align it up. 
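In the setup_arg_pages() hunk below, the two per-arch if/else pairs collapse into a single min() clamp on the expansion. A numeric sketch of the grows-down case, which is equivalent because stack_size == vm_end - vm_start, so vm_start - expand == vm_end - (stack_size + expand); the sizes here are arbitrary examples:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        unsigned long rlim_stack = 8UL << 20;     /* page-aligned RLIMIT_STACK */
        unsigned long stack_size = 64UL << 10;    /* bytes already in use */
        unsigned long stack_expand = 128UL << 10; /* extra headroom wanted */
        unsigned long vm_start = 0x7f0000000000UL;
        unsigned long vm_end = vm_start + stack_size;

        stack_expand = MIN(rlim_stack, stack_size + stack_expand);

        /* grows-down layout: the base is vm_end minus the clamped expansion */
        printf("stack_base = %#lx\n", vm_end - stack_expand);
        return 0;
    }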
*/ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; + + stack_expand = min(rlim_stack, stack_size + stack_expand); + #ifdef CONFIG_STACK_GROWSUP - if (stack_size + stack_expand > rlim_stack) - stack_base = vma->vm_start + rlim_stack; - else - stack_base = vma->vm_end + stack_expand; + stack_base = vma->vm_start + stack_expand; #else - if (stack_size + stack_expand > rlim_stack) - stack_base = vma->vm_end - rlim_stack; - else - stack_base = vma->vm_start - stack_expand; + stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); @@ -957,8 +957,7 @@ struct file *open_exec(const char *name) } EXPORT_SYMBOL(open_exec); -#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ - defined(CONFIG_BINFMT_ELF_FDPIC) +#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); @@ -1023,9 +1022,9 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; - vmacache_flush(tsk); + lru_gen_add_mm(mm); task_unlock(tsk); + lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); @@ -1196,11 +1195,11 @@ static int unshare_sighand(struct task_struct *me) return -ENOMEM; refcount_set(&newsighand->count, 1); - memcpy(newsighand->action, oldsighand->action, - sizeof(newsighand->action)); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); + memcpy(newsighand->action, oldsighand->action, + sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); @@ -1296,6 +1295,10 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; + retval = exec_task_namespaces(); + if (retval) + goto out_unlock; + #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); @@ -1567,6 +1570,12 @@ static void check_unsafe_exec(struct linux_binprm *bprm) if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + /* + * If another task is sharing our fs, we cannot safely + * suid exec because the differently privileged task + * will be able to manipulate the current directory, etc. + * It would be nice to force an unshare instead... 
+ */ t = p; n_fs = 1; spin_lock(&p->fs->lock); @@ -1588,10 +1597,10 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct user_namespace *mnt_userns; - struct inode *inode; + struct inode *inode = file_inode(file); unsigned int mode; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (!mnt_may_suid(file->f_path.mnt)) return; @@ -1599,7 +1608,6 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) if (task_no_new_privs(current)) return; - inode = file->f_path.dentry->d_inode; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; @@ -1611,23 +1619,23 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - uid = i_uid_into_mnt(mnt_userns, inode); - gid = i_gid_into_mnt(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ - if (!kuid_has_mapping(bprm->cred->user_ns, uid) || - !kgid_has_mapping(bprm->cred->user_ns, gid)) + if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || + !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->euid = uid; + bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->egid = gid; + bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } @@ -1748,6 +1756,7 @@ static int search_binary_handler(struct linux_binprm *bprm) return retval; } +/* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; @@ -1806,6 +1815,11 @@ static int bprm_execve(struct linux_binprm *bprm, if (retval) return retval; + /* + * Check for unsafe execution states before exec_binprm(), which + * will call back into begin_new_exec(), into bprm_creds_from_file(), + * where setuid-ness is evaluated. + */ check_unsafe_exec(bprm); current->in_execve = 1; @@ -1881,7 +1895,7 @@ static int do_execveat_common(int fd, struct filename *filename, * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && - is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { + is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index a27b55ec060a..1dfa67f307f1 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -33,10 +33,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned short *uniname) { int i; - struct exfat_entry_set_cache *es; + struct exfat_entry_set_cache es; - es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES)) return; /* @@ -45,8 +44,8 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, * Third entry : first file-name entry * So, the index of first file-name dentry should start from 2. 
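The exfat changes here and below index a cached dentry set as one file entry (ES_IDX_FILE == 0), one stream entry (ES_IDX_STREAM == 1), and name entries starting at ES_IDX_FIRST_FILENAME == 2, each name entry holding 15 UTF-16 characters. A sketch of the resulting entry-count arithmetic, matching the "19 entries" comment that appears in exfat_fs.h further down:

    #include <stdio.h>

    #define EXFAT_FILE_NAME_LEN 15
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
    #define ES_ENTRY_NUM(len) (2 + DIV_ROUND_UP(len, EXFAT_FILE_NAME_LEN))

    int main(void)
    {
        /* a 255-character name needs 17 name entries, 19 entries total */
        printf("%d\n", ES_ENTRY_NUM(255)); /* prints 19 */
        return 0;
    }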
*/ - for (i = 2; i < es->num_entries; i++) { - struct exfat_dentry *ep = exfat_get_dentry_cached(es, i); + for (i = ES_IDX_FIRST_FILENAME; i < es.num_entries; i++) { + struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i); /* end of name entry */ if (exfat_get_entry_type(ep) != TYPE_EXTEND) @@ -56,13 +55,13 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, uniname += EXFAT_FILE_NAME_LEN; } - exfat_free_dentry_set(es, false); + exfat_put_dentry_set(&es, false); } /* read a directory entry from the opened directory */ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { - int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext; + int i, dentries_per_clu, num_ext; unsigned int type, clu_offset, max_dentries; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; @@ -84,11 +83,10 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); dentries_per_clu = sbi->dentries_per_clu; - dentries_per_clu_bits = ilog2(dentries_per_clu); max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES, - (u64)sbi->num_clusters << dentries_per_clu_bits); + (u64)EXFAT_CLU_TO_DEN(sbi->num_clusters, sbi)); - clu_offset = dentry >> dentries_per_clu_bits; + clu_offset = EXFAT_DEN_TO_CLU(dentry, sbi); exfat_chain_dup(&clu, &dir); if (clu.flags == ALLOC_NO_FAT_CHAIN) { @@ -163,7 +161,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent dir_entry->entry = dentry; brelse(bh); - ei->hint_bmap.off = dentry >> dentries_per_clu_bits; + ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi); ei->hint_bmap.clu = clu.dir; *cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext); @@ -212,9 +210,9 @@ static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb) /* skip iterating emit_dots when dir is empty */ #define ITER_POS_FILLED_DOTS (2) -static int exfat_iterate(struct file *filp, struct dir_context *ctx) +static int exfat_iterate(struct file *file, struct dir_context *ctx) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct inode *tmp; struct exfat_dir_entry de; @@ -228,7 +226,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx) mutex_lock(&EXFAT_SB(sb)->s_lock); cpos = ctx->pos; - if (!dir_emit_dots(filp, ctx)) + if (!dir_emit_dots(file, ctx)) goto unlock; if (ctx->pos == ITER_POS_FILLED_DOTS) { @@ -337,7 +335,7 @@ int exfat_calc_num_entries(struct exfat_uni_name *p_uniname) return -EINVAL; /* 1 file entry + 1 stream entry + name entries */ - return ((len - 1) / EXFAT_FILE_NAME_LEN + 3); + return ES_ENTRY_NUM(len); } unsigned int exfat_get_entry_type(struct exfat_dentry *ep) @@ -592,18 +590,18 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es) unsigned short chksum = 0; struct exfat_dentry *ep; - for (i = 0; i < es->num_entries; i++) { + for (i = ES_IDX_FILE; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum, chksum_type); chksum_type = CS_DEFAULT; } - ep = exfat_get_dentry_cached(es, 0); + ep = exfat_get_dentry_cached(es, ES_IDX_FILE); ep->dentry.file.checksum = cpu_to_le16(chksum); es->modified = true; } -int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync) +int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync) { int i, err = 0; @@ -615,7 +613,10 @@ int exfat_free_dentry_set(struct exfat_entry_set_cache *es, 
int sync) bforget(es->bh[i]); else brelse(es->bh[i]); - kfree(es); + + if (IS_DYNAMIC_ES(es)) + kfree(es->bh); + return err; } @@ -812,14 +813,14 @@ struct exfat_dentry *exfat_get_dentry_cached( * pointer of entry set on success, * NULL on failure. */ -struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, - struct exfat_chain *p_dir, int entry, unsigned int type) +int exfat_get_dentry_set(struct exfat_entry_set_cache *es, + struct super_block *sb, struct exfat_chain *p_dir, int entry, + unsigned int type) { int ret, i, num_bh; - unsigned int off, byte_offset, clu = 0; + unsigned int off; sector_t sec; struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct exfat_entry_set_cache *es; struct exfat_dentry *ep; int num_entries; enum exfat_validate_dentry_mode mode = ES_MODE_STARTED; @@ -827,52 +828,51 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, if (p_dir->dir == DIR_DELETED) { exfat_err(sb, "access to deleted dentry"); - return NULL; + return -EIO; } - byte_offset = EXFAT_DEN_TO_B(entry); - ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu); + ret = exfat_find_location(sb, p_dir, entry, &sec, &off); if (ret) - return NULL; + return ret; - es = kzalloc(sizeof(*es), GFP_KERNEL); - if (!es) - return NULL; + memset(es, 0, sizeof(*es)); es->sb = sb; es->modified = false; - - /* byte offset in cluster */ - byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi); - - /* byte offset in sector */ - off = EXFAT_BLK_OFFSET(byte_offset, sb); es->start_off = off; - - /* sector offset in cluster */ - sec = EXFAT_B_TO_BLK(byte_offset, sb); - sec += exfat_cluster_to_sector(sbi, clu); + es->bh = es->__bh; bh = sb_bread(sb, sec); if (!bh) - goto free_es; + return -EIO; es->bh[es->num_bh++] = bh; - ep = exfat_get_dentry_cached(es, 0); + ep = exfat_get_dentry_cached(es, ES_IDX_FILE); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) - goto free_es; + goto put_es; num_entries = type == ES_ALL_ENTRIES ? 
ep->dentry.file.num_ext + 1 : type; es->num_entries = num_entries; num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb); + if (num_bh > ARRAY_SIZE(es->__bh)) { + es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL); + if (!es->bh) { + brelse(bh); + return -ENOMEM; + } + es->bh[0] = bh; + } + for (i = 1; i < num_bh; i++) { /* get the next sector */ if (exfat_is_last_sector_in_cluster(sbi, sec)) { + unsigned int clu = exfat_sector_to_cluster(sbi, sec); + if (p_dir->flags == ALLOC_NO_FAT_CHAIN) clu++; else if (exfat_get_next_cluster(sb, &clu)) - goto free_es; + goto put_es; sec = exfat_cluster_to_sector(sbi, clu); } else { sec++; @@ -880,21 +880,51 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, bh = sb_bread(sb, sec); if (!bh) - goto free_es; + goto put_es; es->bh[es->num_bh++] = bh; } /* validate cached dentries */ - for (i = 1; i < num_entries; i++) { + for (i = ES_IDX_STREAM; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) - goto free_es; + goto put_es; } - return es; + return 0; + +put_es: + exfat_put_dentry_set(es, false); + return -EIO; +} -free_es: - exfat_free_dentry_set(es, false); - return NULL; +static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) +{ + hint_femp->eidx = EXFAT_HINT_NONE; + hint_femp->count = 0; +} + +static inline void exfat_set_empty_hint(struct exfat_inode_info *ei, + struct exfat_hint_femp *candi_empty, struct exfat_chain *clu, + int dentry, int num_entries, int entry_type) +{ + if (ei->hint_femp.eidx == EXFAT_HINT_NONE || + ei->hint_femp.eidx > dentry) { + int total_entries = EXFAT_B_TO_DEN(i_size_read(&ei->vfs_inode)); + + if (candi_empty->count == 0) { + candi_empty->cur = *clu; + candi_empty->eidx = dentry; + } + + if (entry_type == TYPE_UNUSED) + candi_empty->count += total_entries - dentry; + else + candi_empty->count++; + + if (candi_empty->count == num_entries || + candi_empty->count + candi_empty->eidx == total_entries) + ei->hint_femp = *candi_empty; + } } enum { @@ -917,17 +947,21 @@ enum { */ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, - int num_entries, unsigned int type, struct exfat_hint *hint_opt) + struct exfat_hint *hint_opt) { int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; int order, step, name_len = 0; - int dentries_per_clu, num_empty = 0; + int dentries_per_clu; unsigned int entry_type; unsigned short *uniname = NULL; struct exfat_chain clu; struct exfat_hint *hint_stat = &ei->hint_stat; struct exfat_hint_femp candi_empty; struct exfat_sb_info *sbi = EXFAT_SB(sb); + int num_entries = exfat_calc_num_entries(p_uniname); + + if (num_entries < 0) + return num_entries; dentries_per_clu = sbi->dentries_per_clu; @@ -939,10 +973,13 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, end_eidx = dentry; } - candi_empty.eidx = EXFAT_HINT_NONE; + exfat_reset_empty_hint(&ei->hint_femp); + rewind: order = 0; step = DIRENT_STEP_FILE; + exfat_reset_empty_hint(&candi_empty); + while (clu.dir != EXFAT_EOF_CLUSTER) { i = dentry & (dentries_per_clu - 1); for (; i < dentries_per_clu; i++, dentry++) { @@ -962,26 +999,9 @@ rewind: entry_type == TYPE_DELETED) { step = DIRENT_STEP_FILE; - num_empty++; - if (candi_empty.eidx == EXFAT_HINT_NONE && - num_empty == 1) { - exfat_chain_set(&candi_empty.cur, - clu.dir, clu.size, clu.flags); - } - - if (candi_empty.eidx == EXFAT_HINT_NONE 
&& - num_empty >= num_entries) { - candi_empty.eidx = - dentry - (num_empty - 1); - WARN_ON(candi_empty.eidx < 0); - candi_empty.count = num_empty; - - if (ei->hint_femp.eidx == - EXFAT_HINT_NONE || - candi_empty.eidx <= - ei->hint_femp.eidx) - ei->hint_femp = candi_empty; - } + exfat_set_empty_hint(ei, &candi_empty, &clu, + dentry, num_entries, + entry_type); brelse(bh); if (entry_type == TYPE_UNUSED) @@ -989,17 +1009,14 @@ rewind: continue; } - num_empty = 0; - candi_empty.eidx = EXFAT_HINT_NONE; + exfat_reset_empty_hint(&candi_empty); if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) { step = DIRENT_STEP_FILE; hint_opt->clu = clu.dir; hint_opt->eidx = i; - if (type == TYPE_ALL || type == entry_type) { - num_ext = ep->dentry.file.num_ext; - step = DIRENT_STEP_STRM; - } + num_ext = ep->dentry.file.num_ext; + step = DIRENT_STEP_STRM; brelse(bh); continue; } @@ -1090,12 +1107,19 @@ not_found: rewind = 1; dentry = 0; clu.dir = p_dir->dir; - /* reset empty hint */ - num_empty = 0; - candi_empty.eidx = EXFAT_HINT_NONE; goto rewind; } + /* + * set the EXFAT_EOF_CLUSTER flag to avoid search + * from the beginning again when a new cluster is allocated + */ + if (ei->hint_femp.eidx == EXFAT_HINT_NONE) { + ei->hint_femp.cur.dir = EXFAT_EOF_CLUSTER; + ei->hint_femp.eidx = p_dir->size * dentries_per_clu; + ei->hint_femp.count = 0; + } + /* initialize hint_stat */ hint_stat->clu = p_dir->dir; hint_stat->eidx = 0; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index a8f8eee4937c..bc6d21d7c5ad 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/ratelimit.h> #include <linux/nls.h> +#include <linux/blkdev.h> #define EXFAT_ROOT_INO 1 @@ -41,6 +42,14 @@ enum { #define ES_2_ENTRIES 2 #define ES_ALL_ENTRIES 0 +#define ES_IDX_FILE 0 +#define ES_IDX_STREAM 1 +#define ES_IDX_FIRST_FILENAME 2 +#define EXFAT_FILENAME_ENTRY_NUM(name_len) \ + DIV_ROUND_UP(name_len, EXFAT_FILE_NAME_LEN) +#define ES_IDX_LAST_FILENAME(name_len) \ + (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1) + #define DIR_DELETED 0xFFFF0321 /* type values */ @@ -62,15 +71,11 @@ enum { #define TYPE_PADDING 0x0402 #define TYPE_ACLTAB 0x0403 #define TYPE_BENIGN_SEC 0x0800 -#define TYPE_ALL 0x0FFF #define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */ #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ #define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE) -/* Enough size to hold 256 dentry (even 512 Byte sector) */ -#define DIR_CACHE_SIZE (256*sizeof(struct exfat_dentry)/512+1) - #define EXFAT_HINT_NONE -1 #define EXFAT_MIN_SUBDIR 2 @@ -95,12 +100,18 @@ enum { /* * helpers for block size to dentry size conversion. */ -#define EXFAT_B_TO_DEN_IDX(b, sbi) \ - ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) #define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS) #define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS) /* + * helpers for cluster size to dentry size conversion. + */ +#define EXFAT_CLU_TO_DEN(clu, sbi) \ + ((clu) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) +#define EXFAT_DEN_TO_CLU(dentry, sbi) \ + ((dentry) >> ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) + +/* * helpers for fat entry.
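The new EXFAT_CLU_TO_DEN()/EXFAT_DEN_TO_CLU() helpers above just shift by (cluster_size_bits - DENTRY_SIZE_BITS). A sketch with 32-byte entries (DENTRY_SIZE_BITS == 5) and an assumed 4 KiB cluster, so each cluster holds 128 entries:

    #include <stdio.h>

    #define DENTRY_SIZE_BITS 5

    static unsigned int clu_to_den(unsigned int clu, unsigned int cluster_size_bits)
    {
        return clu << (cluster_size_bits - DENTRY_SIZE_BITS);
    }

    static unsigned int den_to_clu(unsigned int dentry, unsigned int cluster_size_bits)
    {
        return dentry >> (cluster_size_bits - DENTRY_SIZE_BITS);
    }

    int main(void)
    {
        /* cluster 3 starts at entry 384; entry 300 lives in cluster 2 */
        printf("%u %u\n", clu_to_den(3, 12), den_to_clu(300, 12));
        return 0;
    }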
*/ #define FAT_ENT_SIZE (4) @@ -125,6 +136,17 @@ enum { #define BITS_PER_BYTE_MASK 0x7 #define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1) +#define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1) +/* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */ +#define ES_MAX_ENTRY_NUM ES_ENTRY_NUM(MAX_NAME_LENGTH) + +/* + * 19 entries x 32 bytes/entry = 608 bytes. + * The 608 bytes are in 3 sectors at most (even 512 Byte sector). + */ +#define DIR_CACHE_SIZE \ + (DIV_ROUND_UP(EXFAT_DEN_TO_B(ES_MAX_ENTRY_NUM), SECTOR_SIZE) + 1) + struct exfat_dentry_namebuf { char *lfn; int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */ @@ -166,13 +188,16 @@ struct exfat_hint { struct exfat_entry_set_cache { struct super_block *sb; - bool modified; unsigned int start_off; int num_bh; - struct buffer_head *bh[DIR_CACHE_SIZE]; + struct buffer_head *__bh[DIR_CACHE_SIZE]; + struct buffer_head **bh; unsigned int num_entries; + bool modified; }; +#define IS_DYNAMIC_ES(es) ((es)->__bh != (es)->bh) + struct exfat_dir_entry { struct exfat_chain dir; int entry; @@ -375,7 +400,7 @@ static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi, sbi->data_start_sector; } -static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi, +static inline unsigned int exfat_sector_to_cluster(struct exfat_sb_info *sbi, sector_t sec) { return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) + @@ -423,8 +448,8 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); /* file.c */ extern const struct file_operations exfat_file_operations; -int __exfat_truncate(struct inode *inode, loff_t new_size); -void exfat_truncate(struct inode *inode, loff_t size); +int __exfat_truncate(struct inode *inode); +void exfat_truncate(struct inode *inode); int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr); int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, @@ -464,15 +489,16 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es); int exfat_calc_num_entries(struct exfat_uni_name *p_uniname); int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, - int num_entries, unsigned int type, struct exfat_hint *hint_opt); + struct exfat_hint *hint_opt); int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh); struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int num); -struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, - struct exfat_chain *p_dir, int entry, unsigned int type); -int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync); +int exfat_get_dentry_set(struct exfat_entry_set_cache *es, + struct super_block *sb, struct exfat_chain *p_dir, int entry, + unsigned int type); +int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); /* inode.c */ diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 4e0793f35e8f..f5b29072775d 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -93,7 +93,7 @@ static int exfat_sanitize_mode(const struct exfat_sb_info *sbi, } /* resize the file length */ -int __exfat_truncate(struct inode *inode, loff_t new_size) +int __exfat_truncate(struct inode *inode) { unsigned 
int num_clusters_new, num_clusters_phys; unsigned int last_clu = EXFAT_FREE_CLUSTER; @@ -113,7 +113,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); - if (new_size > 0) { + if (i_size_read(inode) > 0) { /* * Truncate FAT chain num_clusters after the first cluster * num_clusters = min(new, phys); @@ -143,8 +143,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) ei->start_clu = EXFAT_EOF_CLUSTER; } - i_size_write(inode, new_size); - if (ei->type == TYPE_FILE) ei->attr |= ATTR_ARCHIVE; @@ -189,7 +187,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) return 0; } -void exfat_truncate(struct inode *inode, loff_t size) +void exfat_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -207,7 +205,7 @@ void exfat_truncate(struct inode *inode, loff_t size) goto write_size; } - err = __exfat_truncate(inode, i_size_read(inode)); + err = __exfat_truncate(inode); if (err) goto write_size; @@ -310,7 +308,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * __exfat_write_inode() is called from exfat_truncate(), inode * is already written by it, so mark_inode_dirty() is unneeded. */ - exfat_truncate(inode, attr->ia_size); + exfat_truncate(inode); up_write(&EXFAT_I(inode)->truncate_lock); } else mark_inode_dirty(inode); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a795437b86d0..5b644cb057fa 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -21,7 +21,7 @@ int __exfat_write_inode(struct inode *inode, int sync) { unsigned long long on_disk_size; struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es = NULL; + struct exfat_entry_set_cache es; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); @@ -42,11 +42,10 @@ int __exfat_write_inode(struct inode *inode, int sync) exfat_set_volume_dirty(sb); /* get the directory entry of given file or directory */ - es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(es, 0); - ep2 = exfat_get_dentry_cached(es, 1); + ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); + ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM); ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); @@ -83,8 +82,8 @@ int __exfat_write_inode(struct inode *inode, int sync) ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; } - exfat_update_dir_chksum_with_entry_set(es); - return exfat_free_dentry_set(es, sync); + exfat_update_dir_chksum_with_entry_set(&es); + return exfat_put_dentry_set(&es, sync); } int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -345,11 +344,6 @@ static void exfat_readahead(struct readahead_control *rac) mpage_readahead(rac, exfat_get_block); } -static int exfat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, exfat_get_block, wbc); -} - static int exfat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -363,7 +357,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); inode->i_mtime = inode->i_ctime = current_time(inode); - exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned); + exfat_truncate(inode); } } @@ -473,12 +467,12 @@ static 
const struct address_space_operations exfat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = exfat_read_folio, .readahead = exfat_readahead, - .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, .write_end = exfat_write_end, .direct_IO = exfat_direct_IO, - .bmap = exfat_aop_bmap + .bmap = exfat_aop_bmap, + .migrate_folio = buffer_migrate_folio, }; static inline unsigned long exfat_hash(loff_t i_pos) @@ -552,7 +546,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if (info->attr & ATTR_SUBDIR) { /* directory */ inode->i_generation &= ~1; @@ -627,7 +621,7 @@ void exfat_evict_inode(struct inode *inode) if (!inode->i_nlink) { i_size_write(inode, 0); mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); - __exfat_truncate(inode, 0); + __exfat_truncate(inode); mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index b617bebc3d0f..5f995eba5dbb 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -224,11 +224,18 @@ static int exfat_search_empty_slot(struct super_block *sb, if (hint_femp->eidx != EXFAT_HINT_NONE) { dentry = hint_femp->eidx; - if (num_entries <= hint_femp->count) { - hint_femp->eidx = EXFAT_HINT_NONE; - return dentry; - } + /* + * If hint_femp->count is enough, we still need to check + * whether the entries are actually empty. + * Otherwise, if "dentry + hint_femp->count" also equals + * "p_dir->size * dentries_per_clu", it means ENOSPC. + */ + if (dentry + hint_femp->count == p_dir->size * dentries_per_clu && + num_entries > hint_femp->count) + return -ENOSPC; + + hint_femp->eidx = EXFAT_HINT_NONE; exfat_chain_dup(&clu, &hint_femp->cur); } else { exfat_chain_dup(&clu, p_dir); @@ -293,6 +300,12 @@ static int exfat_search_empty_slot(struct super_block *sb, } } + hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty; + hint_femp->count = num_empty; + if (num_empty == 0) + exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0, + clu.flags); + return -ENOSPC; } @@ -369,15 +382,11 @@ static int exfat_find_empty_entry(struct inode *inode, if (exfat_ent_set(sb, last_clu, clu.dir)) return -EIO; - if (hint_femp.eidx == EXFAT_HINT_NONE) { - /* the special case that new dentry - * should be allocated from the start of new cluster - */ - hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi); - hint_femp.count = sbi->dentries_per_clu; - + if (hint_femp.cur.dir == EXFAT_EOF_CLUSTER) exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags); - } + + hint_femp.count += sbi->dentries_per_clu; + hint_femp.cur.size++; p_dir->size++; size = EXFAT_CLU_TO_B(p_dir->size, sbi); @@ -588,14 +597,14 @@ unlock: static int exfat_find(struct inode *dir, struct qstr *qname, struct exfat_dir_entry *info) { - int ret, dentry, num_entries, count; + int ret, dentry, count; struct exfat_chain cdir; struct exfat_uni_name uni_name; struct super_block *sb = dir->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(dir); struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es; + struct exfat_entry_set_cache es; /* for optimized dir & entry to prevent long traverse of cluster chain */ struct exfat_hint hint_opt; @@ -607,10 +616,6 @@ static int exfat_find(struct inode *dir, struct qstr *qname, if (ret) return ret; - num_entries = exfat_calc_num_entries(&uni_name); -
if (num_entries < 0) - return num_entries; - /* check the validation of hint_stat and initialize it if required */ if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) { ei->hint_stat.clu = cdir.dir; @@ -620,9 +625,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, } /* search the file name for directories */ - dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, - num_entries, TYPE_ALL, &hint_opt); - + dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, &hint_opt); if (dentry < 0) return dentry; /* -error value */ @@ -635,11 +638,10 @@ static int exfat_find(struct inode *dir, struct qstr *qname, if (cdir.flags & ALLOC_NO_FAT_CHAIN) cdir.size -= dentry / sbi->dentries_per_clu; dentry = hint_opt.eidx; - es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(es, 0); - ep2 = exfat_get_dentry_cached(es, 1); + ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); + ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM); info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); @@ -668,7 +670,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, ep->dentry.file.access_time, ep->dentry.file.access_date, 0); - exfat_free_dentry_set(es, false); + exfat_put_dentry_set(&es, false); if (ei->start_clu == EXFAT_FREE_CLUSTER) { exfat_fs_error(sb, @@ -1167,7 +1169,7 @@ static int __exfat_rename(struct inode *old_parent_inode, struct exfat_inode_info *new_ei = NULL; unsigned int new_entry_type = TYPE_UNUSED; int new_entry = 0; - struct buffer_head *old_bh, *new_bh = NULL; + struct buffer_head *new_bh = NULL; /* check the validity of pointer parameters */ if (new_path == NULL || strlen(new_path) == 0) @@ -1183,13 +1185,6 @@ static int __exfat_rename(struct inode *old_parent_inode, EXFAT_I(old_parent_inode)->flags); dentry = ei->entry; - ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); - if (!ep) { - ret = -EIO; - goto out; - } - brelse(old_bh); - /* check whether new dir is existing directory and empty */ if (new_inode) { ret = -EIO; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3ef80d000e13..3204bd33e4e8 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -18,7 +18,7 @@ #include <linux/sched.h> #include <linux/cred.h> -#define dprintk(fmt, args...) do{}while(0) +#define dprintk(fmt, args...) pr_debug(fmt, ##args) static int get_name(const struct path *path, char *name, struct dentry *child); @@ -132,8 +132,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { - dprintk("%s: get_parent of %ld failed, err %d\n", - __func__, dentry->d_inode->i_ino, PTR_ERR(parent)); + dprintk("get_parent of %lu failed, err %ld\n", + dentry->d_inode->i_ino, PTR_ERR(parent)); return parent; } @@ -147,7 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, dprintk("%s: found name: %s\n", __func__, nbuf); tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { - dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); + dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); goto out_err; } @@ -248,21 +248,20 @@ struct getdents_callback { * A rather strange filldir function to capture * the name matching the specified inode number. 
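The filldir_one() change below follows the VFS-wide switch of dir_context actors from int to bool: return true to keep iterating, false to stop. A minimal sketch of the new convention, with simplified stand-in types rather than the kernel's struct dir_context:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct name_search {
        uint64_t ino;   /* inode number being looked up */
        char name[256];
        bool found;
    };

    static bool fill_one(struct name_search *buf, const char *name, int len,
                         uint64_t ino)
    {
        if (buf->ino == ino && len < (int)sizeof(buf->name)) {
            memcpy(buf->name, name, len);
            buf->name[len] = '\0';
            buf->found = true;
            return false; /* match found, no more entries needed */
        }
        return true;      /* keep iterating */
    }

    int main(void)
    {
        struct name_search s = { .ino = 42 };
        const char *names[] = { "a", "target", "b" };
        uint64_t inos[] = { 7, 42, 9 };

        for (int i = 0; i < 3; i++)
            if (!fill_one(&s, names[i], (int)strlen(names[i]), inos[i]))
                break;
        printf("%s\n", s.found ? s.name : "not found"); /* target */
        return 0;
    }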
*/ -static int filldir_one(struct dir_context *ctx, const char *name, int len, +static bool filldir_one(struct dir_context *ctx, const char *name, int len, loff_t pos, u64 ino, unsigned int d_type) { struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); - int result = 0; buf->sequence++; if (buf->ino == ino && len <= NAME_MAX) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; - result = -1; + return false; // no more } - return result; + return true; } /** diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index bf298967c5b8..440d5f1e9d47 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -219,11 +219,12 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 925ab6287d35..3841becb94ff 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); -extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index c17ccc19b938..eca60b747c6b 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -126,6 +126,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) struct ext2_group_desc * desc; struct buffer_head * bh = NULL; ext2_fsblk_t bitmap_blk; + int ret; desc = ext2_get_group_desc(sb, block_group, NULL); if (!desc) @@ -139,10 +140,10 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) block_group, le32_to_cpu(desc->bg_block_bitmap)); return NULL; } - if (likely(bh_uptodate_or_lock(bh))) + ret = bh_read(bh, 0); + if (ret > 0) return bh; - - if (bh_submit_read(bh) < 0) { + if (ret < 0) { brelse(bh); ext2_error(sb, __func__, "Cannot read block bitmap - " @@ -666,7 +667,7 @@ ext2_try_to_allocate(struct super_block *sb, int group, { ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group); ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group); - ext2_grpblk_t start, end; + ext2_grpblk_t start, end; unsigned long num = 0; start = 0; @@ -1480,11 +1481,11 @@ unsigned long ext2_count_free_blocks (struct super_block * sb) desc_count, bitmap_count); return bitmap_count; #else - for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { - desc = ext2_get_group_desc (sb, i, NULL); - if (!desc) - continue; - desc_count += le16_to_cpu(desc->bg_free_blocks_count); + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc(sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); } return desc_count; #endif diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 8f597753ac12..e5cbc27ba459 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -81,11 +81,10 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ext2_commit_chunk(struct page *page, loff_t pos, 
unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); @@ -94,16 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) i_size_write(dir, pos+len); mark_inode_dirty(dir); } - - if (IS_DIRSYNC(dir)) { - err = write_one_page(page); - if (!err) - err = sync_inode_metadata(dir, 1); - } else { - unlock_page(page); - } - - return err; + unlock_page(page); } static bool ext2_check_page(struct page *page, int quiet, char *kaddr) @@ -413,7 +403,7 @@ found: return de; } -/** +/* * Return the '..' directory entry and the page in which the entry was found * (as a parameter - p). * @@ -460,6 +450,17 @@ static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ext2_get_block); } + +static int ext2_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); + return err; +} + void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, struct page *page, void *page_addr, struct inode *inode, int update_times) @@ -474,11 +475,12 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, BUG_ON(err); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); - err = ext2_commit_chunk(page, pos, len); + ext2_commit_chunk(page, pos, len); if (update_times) dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); + ext2_handle_dirsync(dir); } /* @@ -566,10 +568,11 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); - err = ext2_commit_chunk(page, pos, rec_len); + ext2_commit_chunk(page, pos, rec_len); dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); + err = ext2_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: ext2_put_page(page, page_addr); @@ -615,10 +618,11 @@ int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page, if (pde) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; - err = ext2_commit_chunk(page, pos, to - from); + ext2_commit_chunk(page, pos, to - from); inode->i_ctime = inode->i_mtime = current_time(inode); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); + err = ext2_handle_dirsync(inode); out: return err; } @@ -658,7 +662,8 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) memcpy (de->name, "..\0", 4); ext2_set_de_type (de, inode); kunmap_atomic(kaddr); - err = ext2_commit_chunk(page, 0, chunk_size); + ext2_commit_chunk(page, 0, chunk_size); + err = ext2_handle_dirsync(inode); fail: put_page(page); return err; @@ -679,7 +684,7 @@ int ext2_empty_dir (struct inode * inode) page = ext2_get_page(inode, i, 0, &page_addr); if (IS_ERR(page)) - goto not_empty; + return 0; kaddr = page_addr; de = (ext2_dirent *)kaddr; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index eb97aa3d700e..6b4bebe982ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -200,7 +200,7 @@ const struct inode_operations ext2_file_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, .fileattr_get = ext2_fileattr_get, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 998dd2ac8008..78b8686d9a4a 100644 
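/*
 * Sketch, not from the patch above: the ext2 dir.c hunks separate
 * "commit the chunk" from "honour DIRSYNC". ext2_commit_chunk() no
 * longer syncs per page; instead callers run ext2_handle_dirsync()
 * once after the whole directory update (writeback of the mapping,
 * then the inode metadata). The userspace analogue is one durability
 * point per logical operation rather than per write; the helper name
 * below is hypothetical and short writes are ignored for brevity:
 */
#include <sys/uio.h>
#include <unistd.h>

/* Apply a batch of buffers to fd, then make the batch durable once. */
static int write_batch_then_sync(int fd, const struct iovec *iov, int cnt)
{
        if (writev(fd, iov, cnt) < 0)
                return -1;
        return fsync(fd);       /* single sync for the whole update */
}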
--- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -277,8 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - group = prandom_u32(); - parent_group = (unsigned)group % ngroups; + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext2_get_group_desc (sb, group, NULL); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 918ab2f9e4c0..69aed9e2359e 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -869,11 +869,6 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -static int ext2_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, ext2_get_block, wbc); -} - static int ext2_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, ext2_get_block); @@ -948,7 +943,6 @@ const struct address_space_operations ext2_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = ext2_read_folio, .readahead = ext2_readahead, - .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, .bmap = ext2_bmap, @@ -1652,7 +1646,7 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } setattr_copy(&init_user_ns, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5fd9a22d2b70..c056957221a2 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -120,7 +120,7 @@ static int ext2_create (struct user_namespace * mnt_userns, } static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); if (IS_ERR(inode)) @@ -128,9 +128,9 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, ext2_set_file_ops(inode); mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); - return 0; + return finish_open_simple(file, 0); } static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, @@ -427,7 +427,7 @@ const struct inode_operations ext2_dir_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, .fileattr_get = ext2_fileattr_get, @@ -438,6 +438,6 @@ const struct inode_operations ext2_special_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, }; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 252c742379cf..69c88facfe90 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -163,7 +163,7 @@ static void ext2_put_super (struct super_block * sb) db_count = sbi->s_gdb_count; for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); kfree(sbi->s_debts); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -1052,6 +1052,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group); goto failed_mount; } + /* At least inode 
table, bitmaps, and sb have to fit in one group */ + if (sbi->s_blocks_per_group <= sbi->s_itb_per_group + 3) { + ext2_msg(sb, KERN_ERR, + "error: #blocks per group smaller than metadata size: %lu <= %lu", + sbi->s_blocks_per_group, sbi->s_inodes_per_group + 3); + goto failed_mount; + } if (sbi->s_frags_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, "error: #fragments per group too big: %lu", @@ -1065,9 +1072,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_inodes_per_group); goto failed_mount; } + if (sb_bdev_nr_blocks(sb) < le32_to_cpu(es->s_blocks_count)) { + ext2_msg(sb, KERN_ERR, + "bad geometry: block count %u exceeds size of device (%u blocks)", + le32_to_cpu(es->s_blocks_count), + (unsigned)sb_bdev_nr_blocks(sb)); + goto failed_mount; + } - if (EXT2_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext2; sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; @@ -1080,7 +1092,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc_array(db_count, + sbi->s_group_desc = kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { @@ -1206,7 +1218,7 @@ failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); failed_mount_group_desc: - kfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); kfree(sbi->s_debts); failed_mount: brelse(bh); @@ -1636,7 +1648,7 @@ static int __init init_ext2_fs(void) err = init_inodecache(); if (err) return err; - err = register_filesystem(&ext2_fs_type); + err = register_filesystem(&ext2_fs_type); if (err) goto out; return 0; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 57e82e25f8e2..a9f89539aeee 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -225,12 +225,13 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; int update_mode = 0; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 3219669732bf..09c4a8a3b716 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); -int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3bf9a6926798..140e1eb300d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -558,7 +558,7 @@ enum { * * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) */ -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) @@ -2964,7 +2964,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ - EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ + EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ + EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -2977,6 +2978,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); @@ -2998,6 +3000,7 @@ extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); @@ -3591,9 +3594,6 @@ extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); -extern int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline, __u64 start, __u64 len); extern void *ext4_read_inline_link(struct inode *inode); struct iomap; @@ -3621,8 +3621,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); -extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, - struct inode *inode); +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); @@ -3712,7 +3712,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path **, int flags); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -3758,8 +3758,7 @@ extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, - int len, - bool keep_towrite); + int len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec 
*ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 8e1fb18f465e..77f318ec8abb 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -86,15 +86,21 @@ static int ext4_journal_check_start(struct super_block *sb) return 0; } -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, +handle_t *__ext4_journal_start_sb(struct inode *inode, + struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; - - trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, - _RET_IP_); + if (inode) + trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); + else + trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index db2ae4a2b38d..0c77697d5e90 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,9 +261,9 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks, - int revoke_creds); +handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, + unsigned int line, int type, int blocks, + int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -303,7 +303,7 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb) } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ + __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ @@ -323,7 +323,7 @@ static inline handle_t *__ext4_journal_start(struct inode *inode, int blocks, int rsv_blocks, int revoke_creds) { - return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5235974126bd..9de1c9d1a13d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -106,6 +106,25 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) return 0; } +static void ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth, i; + + if (!path) + return; + depth = path->p_depth; + for (i = 0; i <= depth; i++, path++) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +void ext4_free_ext_path(struct ext4_ext_path *path) +{ + ext4_ext_drop_refs(path); + kfree(path); +} + /* * Make sure 'handle' has at least 'check_cred' credits. If not, restart * transaction with 'restart_cred' credits. 
The function drops i_data_sem @@ -636,8 +655,7 @@ int ext4_ext_precache(struct inode *inode) ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: up_read(&ei->i_data_sem); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -724,19 +742,6 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, #define ext4_ext_show_move(inode, path, newblock, level) #endif -void ext4_ext_drop_refs(struct ext4_ext_path *path) -{ - int depth, i; - - if (!path) - return; - depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) { - brelse(path->p_bh); - path->p_bh = NULL; - } -} - /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block @@ -955,8 +960,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, return path; err: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (orig_path) *orig_path = NULL; return ERR_PTR(ret); @@ -2174,8 +2178,7 @@ merge: err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: - ext4_ext_drop_refs(npath); - kfree(npath); + ext4_free_ext_path(npath); return err; } @@ -2632,9 +2635,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unwritten, ex_ee_len); path[depth].p_ext = ex; - a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; + a = max(ex_ee_block, start); + b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); @@ -3061,8 +3063,7 @@ again: } } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); path = NULL; if (err == -EAGAIN) goto again; @@ -4375,8 +4376,7 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); @@ -5183,6 +5183,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * and it is decreased till we reach start. 
*/ again: + ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else @@ -5226,14 +5227,21 @@ again: ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); - if (le32_to_cpu(extent->ee_block) > 0) + if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; - else - /* Beginning is reached, end of the loop */ + else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; - /* Update path extent in case we need to stop */ - while (le32_to_cpu(extent->ee_block) < start) + else { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + while (le32_to_cpu(extent->ee_block) >= start) + extent--; + + if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) + break; + extent++; + iterator = NULL; + } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, @@ -5245,8 +5253,7 @@ again: break; } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -5538,15 +5545,13 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) EXT4_GET_BLOCKS_METADATA_NOFAIL); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (ret < 0) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } } else { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } ret = ext4_es_remove_extent(inode, offset_lblk, @@ -5561,8 +5566,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, - len_lblk, SHIFT_RIGHT); + max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) @@ -5766,10 +5770,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, count -= len; repeat: - ext4_ext_drop_refs(path1); - kfree(path1); - ext4_ext_drop_refs(path2); - kfree(path2); + ext4_free_ext_path(path1); + ext4_free_ext_path(path2); path1 = path2 = NULL; } return replaced_count; @@ -5795,6 +5797,14 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) struct ext4_extent *extent; ext4_lblk_t first_lblk, first_lclu, last_lclu; + /* + * if data can be stored inline, the logical cluster isn't + * mapped - no physical clusters have been allocated, and the + * file has no extents + */ + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return 0; + /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); if (IS_ERR(path)) { @@ -5848,8 +5858,7 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return err ? 
err : mapped; } @@ -5916,8 +5925,7 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return ret; } @@ -5935,8 +5943,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) return; ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return; } @@ -5949,8 +5956,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); ext4_mark_inode_dirty(NULL, inode); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } } @@ -5989,13 +5995,11 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); goto out; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); /* Count the number of data blocks */ cur = 0; @@ -6025,30 +6029,26 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (IS_ERR(path)) goto out; numblks += path->p_depth; - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return 0; } cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); if (ret < 0) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); break; } path2 = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path2)) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); break; } for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { @@ -6062,10 +6062,8 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (cmp1 != cmp2 && cmp2 != 0) numblks++; } - ext4_ext_drop_refs(path); - ext4_ext_drop_refs(path2); - kfree(path); - kfree(path2); + ext4_free_ext_path(path); + ext4_free_ext_path(path2); } out: @@ -6092,13 +6090,11 @@ int ext4_ext_clear_bb(struct inode *inode) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return 0; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); cur = 0; while (cur < end) { @@ -6117,8 +6113,7 @@ int ext4_ext_clear_bb(struct inode *inode) ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); ext4_fc_record_regions(inode->i_sb, inode->i_ino, diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 23167efda95e..7bc221038c6c 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -155,9 +155,7 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, int __init ext4_init_es(void) { - ext4_es_cachep = kmem_cache_create("ext4_extent_status", - sizeof(struct extent_status), - 0, (SLAB_RECLAIM_ACCOUNT), NULL); + ext4_es_cachep = KMEM_CACHE(extent_status, 
SLAB_RECLAIM_ACCOUNT); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; @@ -667,8 +665,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } static void ext4_es_insert_extent_ind_check(struct inode *inode, @@ -1372,7 +1369,7 @@ retry: if (count_reserved) count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, &orig_es, &rc); - goto out; + goto out_get_reserved; } if (len1 > 0) { @@ -1414,6 +1411,7 @@ retry: } } +out_get_reserved: if (count_reserved) *reserved = get_rsvd(inode, end, es, &rc); out: @@ -1808,9 +1806,7 @@ static void ext4_print_pending_tree(struct inode *inode) int __init ext4_init_pending(void) { - ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", - sizeof(struct pending_reservation), - 0, (SLAB_RECLAIM_ACCOUNT), NULL); + ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); if (ext4_pending_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 2af962cbb835..4594b62f147b 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -229,6 +229,12 @@ __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) finish_wait(wq, &wait.wq_entry); } +static bool ext4_fc_disabled(struct super_block *sb) +{ + return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); +} + /* * Inform Ext4's fast about start of an inode update * @@ -240,8 +246,7 @@ void ext4_fc_start_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; restart: @@ -265,8 +270,7 @@ void ext4_fc_stop_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (atomic_dec_and_test(&ei->i_fc_updates)) @@ -283,8 +287,7 @@ void ext4_fc_del(struct inode *inode) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; restart: @@ -337,8 +340,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(sb)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); @@ -418,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); mutex_unlock(&ei->i_fc_lock); + + if (IS_ENCRYPTED(dir)) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, + NULL); + mutex_lock(&ei->i_fc_lock); + return -EOPNOTSUPP; + } + node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); 
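/*
 * Sketch, not from the patch above: the fast_commit.c hunks in this
 * region replace the repeated open-coded test
 * "!test_opt2(sb, JOURNAL_FAST_COMMIT) || (state & EXT4_FC_REPLAY)"
 * with a single ext4_fc_disabled() predicate, so every tracking hook
 * bails out identically. The shape of that refactor with dummy types:
 */
#include <stdbool.h>

struct sb_state {
        bool fast_commit;       /* JOURNAL_FAST_COMMIT set? */
        bool in_replay;         /* EXT4_FC_REPLAY in progress? */
};

static bool fc_disabled(const struct sb_state *s)
{
        return !s->fast_commit || s->in_replay;
}

static void track_event(struct sb_state *s)
{
        if (fc_disabled(s))     /* one guard instead of N copies */
                return;
        /* ... record the update ... */
}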
return -ENOMEM; } node->fcd_op = dentry_update->op; - node->fcd_parent = dentry->d_parent->d_inode->i_ino; + node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; if (dentry->d_name.len > DNAME_INLINE_LEN) { node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM, NULL); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -493,10 +504,8 @@ void __ext4_fc_track_unlink(handle_t *handle, void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -522,10 +531,8 @@ void __ext4_fc_track_link(handle_t *handle, void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -551,10 +558,8 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -576,22 +581,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update) void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; if (S_ISDIR(inode->i_mode)) return; + if (ext4_fc_disabled(inode->i_sb)) + return; + if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) - return; - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; @@ -634,15 +637,13 @@ static int __track_range(struct inode *inode, void *arg, bool update) void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -674,18 +675,6 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) /* Ext4 commit path routines */ -/* memzero and update CRC */ -static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, - u32 *crc) -{ - void *ret; - - ret = memset(dst, 0, len); - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); - return ret; -} - /* * Allocate len bytes on a fast commit buffer. 
* @@ -699,62 +688,60 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { - struct ext4_fc_tl *tl; + struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; - int pad_len; + int remaining; + u8 *dst; /* - * After allocating len, we should have space at least for a 0 byte - * padding. + * If 'len' is too long to fit in any block alongside a PAD tlv, then we + * cannot fulfill the request. */ - if (len + sizeof(struct ext4_fc_tl) > bsize) + if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; - if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { - /* - * Only allocate from current buffer if we have enough space for - * this request AND we have space to add a zero byte padding. - */ - if (!sbi->s_fc_bh) { - ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); - if (ret) - return NULL; - sbi->s_fc_bh = bh; - } + if (!sbi->s_fc_bh) { + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; + } + dst = sbi->s_fc_bh->b_data + off; + + /* + * Allocate the bytes in the current block if we can do so while still + * leaving enough space for a PAD tlv. + */ + remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; + if (len <= remaining) { sbi->s_fc_bytes += len; - return sbi->s_fc_bh->b_data + off; + return dst; } - /* Need to add PAD tag */ - tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); - tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); - pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); - tl->fc_len = cpu_to_le16(pad_len); - if (crc) - *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); - if (pad_len > 0) - ext4_fc_memzero(sb, tl + 1, pad_len, crc); + + /* + * Else, terminate the current block with a PAD tlv, then allocate a new + * block and allocate the bytes at the start of that new block. + */ + + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); + tl.fc_len = cpu_to_le16(remaining); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); + *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; - sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; + sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } -/* memcpy to fc reserved space and update CRC */ -static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, - int len, u32 *crc) -{ - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); - return memcpy(dst, src, len); -} - /* * Complete a fast commit by writing tail tag. * @@ -775,23 +762,27 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. 
*/ - dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); - tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); + tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); - dst += sizeof(tl); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); - ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); + memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); + crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); - ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); + memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); + dst += sizeof(tail.fc_crc); + memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); @@ -808,15 +799,15 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, struct ext4_fc_tl tl; u8 *dst; - dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); - ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } @@ -828,8 +819,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.len; - u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, - crc); + u8 *dst = ext4_fc_reserve_space(sb, + EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; @@ -838,11 +829,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); - dst += sizeof(tl); - ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; + memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); + memcpy(dst, fc_dentry->fcd_name.name, dlen); return true; } @@ -874,22 +865,21 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); + ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, - sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); + EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) - return -ECANCELED; + goto err; - if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) - return -ECANCELED; - dst += sizeof(tl); - if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) - return -ECANCELED; + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + dst += EXT4_FC_TAG_BASE_LEN; + memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); - if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), - inode_len, crc)) - 
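/*
 * Sketch, not from the patch above: the rewritten ext4_fc_reserve_space()
 * only hands out 'len' bytes from the current block if a PAD tlv header
 * still fits after them; otherwise it closes the block with a PAD tlv
 * covering the remainder and allocates from the start of a fresh block.
 * The core arithmetic on a plain byte buffer (TAG_BASE_LEN stands in
 * for EXT4_FC_TAG_BASE_LEN; signed types avoid unsigned underflow):
 */
#define TAG_BASE_LEN 4  /* 16-bit tag + 16-bit length */

/*
 * Block of 'bsize' bytes, 'off' bytes already used (off is kept at most
 * bsize - TAG_BASE_LEN by construction). Returns 0 to place the record
 * at 'off', 1 to pad this block (*pad = PAD value length) and place it
 * at the start of the next block, or -1 if 'len' can never fit.
 */
static int reserve(long bsize, long off, long len, long *pad)
{
        if (len > bsize - TAG_BASE_LEN)
                return -1;              /* too big for any block */
        if (len <= bsize - TAG_BASE_LEN - off)
                return 0;               /* fits, PAD room remains */
        *pad = bsize - TAG_BASE_LEN - off;
        return 1;
}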
return -ECANCELED; - - return 0; + memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); + ret = 0; +err: + brelse(iloc.bh); + return ret; } /* @@ -991,7 +981,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) finish_wait(&ei->i_fc_wait, &wait); } spin_unlock(&sbi->s_fc_lock); - ret = jbd2_submit_inode_data(ei->jinode); + ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); @@ -1343,7 +1333,7 @@ struct dentry_info_args { }; static inline void tl_to_darg(struct dentry_info_args *darg, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl *tl, u8 *val) { struct ext4_fc_dentry_info fcd; @@ -1352,8 +1342,14 @@ static inline void tl_to_darg(struct dentry_info_args *darg, darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); - darg->dname_len = le16_to_cpu(tl->fc_len) - - sizeof(struct ext4_fc_dentry_info); + darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); +} + +static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val) +{ + memcpy(tl, val, EXT4_FC_TAG_BASE_LEN); + tl->fc_len = le16_to_cpu(tl->fc_len); + tl->fc_tag = le16_to_cpu(tl->fc_tag); } /* Unlink replay function */ @@ -1387,7 +1383,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, return 0; } - ret = __ext4_unlink(NULL, old_parent, &entry, inode); + ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; @@ -1491,13 +1487,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { - state->fc_modified_inodes = krealloc( - state->fc_modified_inodes, + int *fc_modified_inodes; + + fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); - if (!state->fc_modified_inodes) + if (!fc_modified_inodes) return -ENOMEM; + state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } @@ -1516,8 +1514,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; - int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); + int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; + size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); @@ -1541,12 +1540,12 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, if (ret) goto out; - inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); + inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); - memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, - inode_len - offsetof(struct ext4_inode, i_generation)); + memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, + inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { @@ -1682,15 +1681,18 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, if (replay && state->fc_regions_used != state->fc_regions_valid) 
state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { + struct ext4_fc_alloc_region *fc_regions; + + fc_regions = krealloc(state->fc_regions, + sizeof(struct ext4_fc_alloc_region) * + (state->fc_regions_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); + if (!fc_regions) + return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; - state->fc_regions = krealloc( - state->fc_regions, - state->fc_regions_size * - sizeof(struct ext4_fc_alloc_region), - GFP_KERNEL); - if (!state->fc_regions) - return -ENOMEM; + state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; @@ -1770,8 +1772,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ret = ext4_ext_insert_extent( NULL, inode, &path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (ret) goto out; goto next; @@ -1926,8 +1927,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, 1); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, @@ -1972,6 +1972,33 @@ void ext4_fc_replay_cleanup(struct super_block *sb) kfree(sbi->s_fc_replay_state.fc_modified_inodes); } +static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, + int tag, int len) +{ + switch (tag) { + case EXT4_FC_TAG_ADD_RANGE: + return len == sizeof(struct ext4_fc_add_range); + case EXT4_FC_TAG_DEL_RANGE: + return len == sizeof(struct ext4_fc_del_range); + case EXT4_FC_TAG_CREAT: + case EXT4_FC_TAG_LINK: + case EXT4_FC_TAG_UNLINK: + len -= sizeof(struct ext4_fc_dentry_info); + return len >= 1 && len <= EXT4_NAME_LEN; + case EXT4_FC_TAG_INODE: + len -= sizeof(struct ext4_fc_inode); + return len >= EXT4_GOOD_OLD_INODE_SIZE && + len <= sbi->s_inode_size; + case EXT4_FC_TAG_PAD: + return true; /* padding can have any length */ + case EXT4_FC_TAG_TAIL: + return len >= sizeof(struct ext4_fc_tail); + case EXT4_FC_TAG_HEAD: + return len == sizeof(struct ext4_fc_head); + } + return false; +} + /* * Recovery Scan phase handler * @@ -2007,7 +2034,7 @@ static int ext4_fc_replay_scan(journal_t *journal, state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; @@ -2028,12 +2055,19 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_replay_expected_off++; - for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { - memcpy(&tl, cur, sizeof(tl)); - val = cur + sizeof(tl); + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; + if (tl.fc_len > end - val || + !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { + ret = state->fc_replay_num_tags ? 
+ JBD2_FC_REPLAY_STOP : -ECANCELED; + goto out_err; + } ext4_debug("Scan phase, tag:%s, blk %lld\n", - tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); - switch (le16_to_cpu(tl.fc_tag)) { + tag2str(tl.fc_tag), bh->b_blocknr); + switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; @@ -2053,13 +2087,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + le16_to_cpu(tl.fc_len)); + EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + + EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && @@ -2086,7 +2120,7 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + le16_to_cpu(tl.fc_len)); + EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? @@ -2139,21 +2173,22 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, #endif start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; - for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { - memcpy(&tl, cur, sizeof(tl)); - val = cur + sizeof(tl); + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } - ext4_debug("Replay phase, tag:%s\n", - tag2str(le16_to_cpu(tl.fc_tag))); + + ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; - switch (le16_to_cpu(tl.fc_tag)) { + switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; @@ -2174,19 +2209,18 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, - le16_to_cpu(tl.fc_len), 0); + tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: - trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, - le16_to_cpu(tl.fc_len), 0); + trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, + 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: - trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0, - le16_to_cpu(tl.fc_len), 0); + trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } @@ -2210,17 +2244,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) journal->j_fc_cleanup_callback = ext4_fc_cleanup; } -static const char *fc_ineligible_reasons[] = { - "Extended attributes changed", - "Cross rename", - "Journal flag changed", - "Insufficient memory", - "Swap boot", - "Resize", - "Dir renamed", - "Falloc range op", - "Data journalling", - "FC Commit Failed" +static const char * const fc_ineligible_reasons[] = { + [EXT4_FC_REASON_XATTR] = "Extended attributes changed", + [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", + [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", + [EXT4_FC_REASON_NOMEM] = "Insufficient memory", + [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", + [EXT4_FC_REASON_RESIZE] = "Resize", + [EXT4_FC_REASON_RENAME_DIR] = "Dir 
renamed", + [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", + [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", + [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 1db12847a83b..2fadb2c4780c 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -58,7 +58,7 @@ struct ext4_fc_dentry_info { __u8 fc_dname[]; }; -/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ +/* Value structure for EXT4_FC_TAG_INODE. */ struct ext4_fc_inode { __le32 fc_ino; __u8 fc_raw_inode[]; @@ -70,6 +70,9 @@ struct ext4_fc_tail { __le32 fc_crc; }; +/* Tag base length */ +#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl)) + /* * Fast commit status codes */ @@ -93,6 +96,7 @@ enum { EXT4_FC_REASON_RENAME_DIR, EXT4_FC_REASON_FALLOC_RANGE, EXT4_FC_REASON_INODE_JOURNAL_DATA, + EXT4_FC_REASON_ENCRYPTED_FILENAME, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 109d07629f81..7ac0a81bd371 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -36,19 +36,34 @@ #include "acl.h" #include "truncate.h" -static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +/* + * Returns %true if the given DIO request should be attempted with DIO, or + * %false if it should fall back to buffered I/O. + * + * DIO isn't well specified; when it's unsupported (either due to the request + * being misaligned, or due to the file not supporting DIO at all), filesystems + * either fall back to buffered I/O or return EINVAL. For files that don't use + * any special features like encryption or verity, ext4 has traditionally + * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. + * In this case, we should attempt the DIO, *not* fall back to buffered I/O. + * + * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 + * traditionally falls back to buffered I/O. + * + * This function implements the traditional ext4 behavior in all these cases. + */ +static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); + u32 dio_align = ext4_dio_alignment(inode); - if (!fscrypt_dio_supported(iocb, iter)) - return false; - if (fsverity_active(inode)) - return false; - if (ext4_should_journal_data(inode)) + if (dio_align == 0) return false; - if (ext4_has_inline_data(inode)) - return false; - return true; + + if (dio_align == 1) + return true; + + return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) inode_lock_shared(inode); } - if (!ext4_dio_supported(iocb, to)) { + if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on @@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } /* Fallback to buffered I/O if the inode does not support direct I/O. */ - if (!ext4_dio_supported(iocb, from)) { + if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else @@ -528,6 +543,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = -EAGAIN; goto out; } + /* + * Make sure inline data cannot be created anymore since we are going + * to allocate blocks for DIO. 
We know the inode does not have any + * inline data now because ext4_dio_supported() checked for that. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); offset = iocb->ki_pos; count = ret; @@ -934,7 +955,7 @@ const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 208b87ce8858..63f9bb6e8851 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -463,10 +463,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); - grp = hinfo.hash; + parent_group = hinfo.hash % ngroups; } else - grp = prandom_u32(); - parent_group = (unsigned)grp % ngroups; + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); @@ -871,7 +870,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) return PTR_ERR(p); @@ -1077,8 +1076,8 @@ repeat_in_this_group: if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); - handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, 0, + handle = __ext4_journal_start_sb(NULL, dir->i_sb, + line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -1280,7 +1279,7 @@ got: EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 860fc5119009..c68bebe7ff4b 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; + unsigned int key; int ret = -EIO; *err = 0; @@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, if (!p->key) goto no_block; while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); + key = le32_to_cpu(p->key); + if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { + /* the block was out of range */ + ret = -EFSCORRUPTED; + goto failure; + } + bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4fbe825694b..2b42ececa46d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -180,8 +180,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer, BUG_ON(len > EXT4_I(inode)->i_inline_size); - cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? 
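/*
 * Sketch, not from the patch above: ext4_should_use_dio() collapses the
 * old per-feature checks into one ext4_dio_alignment() value: 0 means
 * DIO is unsupported (fall back to buffered I/O), 1 means any alignment
 * works, and any other value is a power-of-two alignment that both the
 * file position and the iov must satisfy. The decision in isolation:
 */
#include <stdbool.h>
#include <stdint.h>

static bool should_use_dio(uint64_t pos, uint64_t iov_align,
                           uint32_t dio_align)
{
        if (dio_align == 0)
                return false;   /* DIO not supported at all */
        if (dio_align == 1)
                return true;    /* byte-aligned DIO is fine */
        /* IS_ALIGNED(pos | iov_align, dio_align) in kernel terms */
        return ((pos | iov_align) & ((uint64_t)dio_align - 1)) == 0;
}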
- len : EXT4_MIN_INLINE_DATA_SIZE; + cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE); raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 601214453c3a..9d9f414f99fe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -222,13 +222,13 @@ void ext4_evict_inode(struct inode *inode) /* * For inodes with journalled data, transaction commit could have - * dirtied the inode. Flush worker is ignoring it because of I_FREEING - * flag but we still need to remove the inode from the writeback lists. + * dirtied the inode. And for inodes with dioread_nolock, unwritten + * extents converting worker could merge extents and also have dirtied + * the inode. Flush worker is ignoring it because of I_FREEING flag but + * we still need to remove the inode from the writeback lists. */ - if (!list_empty_careful(&inode->i_io_list)) { - WARN_ON_ONCE(!ext4_should_journal_data(inode)); + if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); - } /* * Protect us against freezing - iput() caller didn't have to have any @@ -335,6 +335,12 @@ stop_handle: ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: + /* + * Check out some where else accidentally dirty the evicting inode, + * which may probably cause inode use-after-free issues later. + */ + WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); + if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -1188,6 +1194,13 @@ retry_grab: page = grab_cache_page_write_begin(mapping, index); if (!page) return -ENOMEM; + /* + * The same as page allocation, we prealloc buffer heads before + * starting the handle. + */ + if (!page_has_buffers(page)) + create_empty_buffers(page, inode->i_sb->s_blocksize, 0); + unlock_page(page); retry_journal: @@ -1302,7 +1315,8 @@ static int ext4_write_end(struct file *file, trace_ext4_write_end(inode, pos, len, copied); - if (ext4_has_inline_data(inode)) + if (ext4_has_inline_data(inode) && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, page); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -1536,9 +1550,12 @@ void ext4_da_release_space(struct inode *inode, int to_free) */ struct mpage_da_data { + /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; + unsigned int can_map:1; /* Can writepages call map blocks? 
*/ + /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ @@ -2002,7 +2019,6 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; - bool keep_towrite = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { folio_invalidate(folio, 0, folio_size(folio)); @@ -2060,7 +2076,6 @@ static int ext4_writepage(struct page *page, unlock_page(page); return 0; } - keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -2077,7 +2092,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return -ENOMEM; } - ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite); + ret = ext4_bio_write_page(&io_submit, page, len); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -2111,7 +2126,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) len = size & ~PAGE_MASK; else len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len, false); + err = ext4_bio_write_page(&mpd->io_submit, page, len); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; @@ -2544,18 +2559,33 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } +/* Return true if the page needs to be written as part of transaction commit */ +static bool ext4_page_nomap_can_writeout(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) + return true; + } while ((bh = bh->b_this_page) != head); + return false; +} + /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages - * and underlying extent to map + * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for - * IO immediately. When we find a page which isn't mapped we start accumulating - * extent of buffers underlying these pages that needs mapping (formed by - * either delayed or unwritten buffers). We also lock the pages containing - * these buffers. The extent found is returned in @mpd structure (starting at - * mpd->lblk with length mpd->len blocks). + * IO immediately. If we cannot map blocks, we submit just already mapped + * buffers in the page for IO and keep page dirty. When we can map blocks and + * we find a page which isn't mapped we start accumulating extent of buffers + * underlying these pages that needs mapping (formed by either delayed or + * unwritten buffers). We also lock the pages containing these buffers. The + * extent found is returned in @mpd structure (starting at mpd->lblk with + * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. 
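As an aside on the iteration idiom used by ext4_page_nomap_can_writeout() above: buffer heads attached to a page form a circular singly linked list through b_this_page, so walks start at page_buffers() and terminate on returning to the head. A generic sketch of the pattern (the helper name and predicate are illustrative, not kernel API):

	#include <linux/buffer_head.h>

	/*
	 * Report whether any buffer head attached to @page satisfies @match.
	 * The caller must know the page has buffers; the b_this_page links
	 * form a cycle, so the do/while ends when the walk reaches the head
	 * again.
	 */
	static bool page_any_buffer(struct page *page,
				    bool (*match)(struct buffer_head *bh))
	{
		struct buffer_head *bh, *head;

		bh = head = page_buffers(page);
		do {
			if (match(bh))
				return true;
		} while ((bh = bh->b_this_page) != head);

		return false;
	}
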
Although it may seem as an @@ -2646,14 +2676,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; - /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << - (PAGE_SHIFT - blkbits); - head = page_buffers(page); - err = mpage_process_page_bufs(mpd, head, head, lblk); - if (err <= 0) - goto out; - err = 0; + /* + * Writeout for transaction commit where we cannot + * modify metadata is simple. Just submit the page. + */ + if (!mpd->can_map) { + if (ext4_page_nomap_can_writeout(page)) { + err = mpage_submit_page(mpd, page); + if (err < 0) + goto out; + } else { + unlock_page(page); + mpd->first_page++; + } + } else { + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_SHIFT - blkbits); + head = page_buffers(page); + err = mpage_process_page_bufs(mpd, head, head, + lblk); + if (err <= 0) + goto out; + err = 0; + } left--; } pagevec_release(&pvec); @@ -2666,25 +2712,27 @@ out: return err; } -static int ext4_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, + void *data) { + return ext4_writepage(page, wbc); +} + +static int ext4_do_writepages(struct mpage_da_data *mpd) +{ + struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; - struct mpage_da_data mpd; - struct inode *inode = mapping->host; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; - - percpu_down_read(&sbi->s_writepages_rwsem); trace_ext4_writepages(inode, wbc); /* @@ -2696,7 +2744,9 @@ static int ext4_writepages(struct address_space *mapping, goto out_writepages; if (ext4_should_journal_data(inode)) { - ret = generic_writepages(mapping, wbc); + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); + blk_finish_plug(&plug); goto out_writepages; } @@ -2750,19 +2800,18 @@ static int ext4_writepages(struct address_space *mapping, writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd.first_page = writeback_index; - mpd.last_page = -1; + mpd->first_page = writeback_index; + mpd->last_page = -1; } else { - mpd.first_page = wbc->range_start >> PAGE_SHIFT; - mpd.last_page = wbc->range_end >> PAGE_SHIFT; + mpd->first_page = wbc->range_start >> PAGE_SHIFT; + mpd->last_page = wbc->range_end >> PAGE_SHIFT; } - mpd.inode = inode; - mpd.wbc = wbc; - ext4_io_submit_init(&mpd.io_submit, wbc); + ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + tag_pages_for_writeback(mapping, mpd->first_page, + mpd->last_page); blk_start_plug(&plug); /* @@ -2771,31 +2820,32 @@ retry: * in the block layer on device congestion while having transaction * started. 
*/ - mpd.do_map = 0; - mpd.scanned_until_end = 0; - mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!mpd.io_submit.io_end) { + mpd->do_map = 0; + mpd->scanned_until_end = 0; + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } - ret = mpage_prepare_extent_to_map(&mpd); + ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, false); + mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); - ext4_put_io_end_defer(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_io_submit(&mpd->io_submit); + ext4_put_io_end_defer(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; - while (!mpd.scanned_until_end && wbc->nr_to_write > 0) { + while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ - mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!mpd.io_submit.io_end) { + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } + WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when @@ -2815,16 +2865,16 @@ retry: "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ - ext4_put_io_end(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; break; } - mpd.do_map = 1; + mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); - ret = mpage_prepare_extent_to_map(&mpd); - if (!ret && mpd.map.m_len) - ret = mpage_map_and_submit_extent(handle, &mpd, + trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + ret = mpage_prepare_extent_to_map(mpd); + if (!ret && mpd->map.m_len) + ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, @@ -2839,12 +2889,12 @@ retry: if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; - mpd.do_map = 0; + mpd->do_map = 0; } /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, give_up_on_write); + mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); + ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have @@ -2854,11 +2904,11 @@ retry: * up doing unwritten extent conversion. 
*/ if (handle) { - ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else - ext4_put_io_end(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2878,8 +2928,8 @@ unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd.last_page = writeback_index - 1; - mpd.first_page = 0; + mpd->last_page = writeback_index - 1; + mpd->first_page = 0; goto retry; } @@ -2889,15 +2939,51 @@ unplug: * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd.first_page; + mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); return ret; } +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct super_block *sb = mapping->host->i_sb; + struct mpage_da_data mpd = { + .inode = mapping->host, + .wbc = wbc, + .can_map = 1, + }; + int ret; + + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return -EIO; + + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + ret = ext4_do_writepages(&mpd); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); + + return ret; +} + +int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + struct mpage_da_data mpd = { + .inode = jinode->i_vfs_inode, + .wbc = &wbc, + .can_map = 0, + }; + return ext4_do_writepages(&mpd); +} + static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -3639,7 +3725,6 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis, static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, @@ -3657,7 +3742,6 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3666,6 +3750,7 @@ static const struct address_space_operations ext4_journalled_aops = { .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, + .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -3674,7 +3759,6 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, @@ -4218,7 +4302,8 @@ int ext4_truncate(struct inode *inode) /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { - if (ext4_inode_attach_jinode(inode) < 
0) + err = ext4_inode_attach_jinode(inode); + if (err) goto out_trace; } @@ -4466,9 +4551,17 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); - block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + block = ext4_inode_table(sb, gdp); + if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || + (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { + ext4_error(sb, "Invalid inode table block %llu in " + "block_group %u", block, iloc->block_group); + return -EFSCORRUPTED; + } + block += (inode_offset / inodes_per_block); + bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; @@ -5037,8 +5130,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); - brelse(iloc.bh); + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + ext4_error_inode(inode, function, line, 0, + "bad inode without EXT4_IGET_BAD flag"); + ret = -EUCLEAN; + goto bad_inode; + } + brelse(iloc.bh); unlock_new_inode(inode); return inode; @@ -5342,6 +5441,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; + bool inc_ivers = true; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -5425,8 +5525,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return -EINVAL; } - if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) - inode_inc_iversion(inode); + if (attr->ia_size == inode->i_size) + inc_ivers = false; if (shrink) { if (ext4_should_order_data(inode)) { @@ -5528,6 +5628,8 @@ out_mmap_sem: } if (!error) { + if (inc_ivers) + inode_inc_iversion(inode); setattr_copy(mnt_userns, inode, attr); mark_inode_dirty(inode); } @@ -5540,7 +5642,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); err_out: if (error) @@ -5550,6 +5652,22 @@ err_out: return error; } +u32 ext4_dio_alignment(struct inode *inode) +{ + if (fsverity_active(inode)) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + if (ext4_has_inline_data(inode)) + return 0; + if (IS_ENCRYPTED(inode)) { + if (!fscrypt_dio_supported(inode)) + return 0; + return i_blocksize(inode); + } + return 1; /* use the iomap defaults */ +} + int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -5565,6 +5683,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. 
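	 *
	 * Illustrative userspace consumer (a sketch; assumes glibc 2.28+
	 * for the statx() wrapper and v6.1 uapi headers for the
	 * STATX_DIOALIGN mask and the stx_dio_*_align fields):
	 *
	 *	#define _GNU_SOURCE
	 *	#include <fcntl.h>
	 *	#include <stdio.h>
	 *	#include <sys/stat.h>
	 *
	 *	int main(int argc, char **argv)
	 *	{
	 *		struct statx stx;
	 *
	 *		if (argc < 2 || statx(AT_FDCWD, argv[1], 0,
	 *				      STATX_DIOALIGN, &stx))
	 *			return 1;
	 *		if (stx.stx_mask & STATX_DIOALIGN)
	 *			printf("dio_mem_align=%u dio_offset_align=%u\n",
	 *			       stx.stx_dio_mem_align,
	 *			       stx.stx_dio_offset_align);
	 *		return 0;
	 *	}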
+ */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + u32 dio_align = ext4_dio_alignment(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (dio_align == 1) { + struct block_device *bdev = inode->i_sb->s_bdev; + + /* iomap defaults */ + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } else { + stat->dio_mem_align = dio_align; + stat->dio_offset_align = dio_align; + } + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; @@ -5731,9 +5870,6 @@ int ext4_mark_iloc_dirty(handle_t *handle, } ext4_fc_track_inode(handle, inode); - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); - /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); @@ -5809,6 +5945,14 @@ static int __ext4_expand_extra_isize(struct inode *inode, return 0; } + /* + * We may need to allocate external xattr block so we need quotas + * initialized. Here we can be called with various locks held so we + * cannot affort to initialize quotas ourselves. So just bail. + */ + if (dquot_initialize_needed(inode)) + return -EAGAIN; + /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 3cf3ec4b1c21..8067ccda34e4 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -145,9 +145,8 @@ static int ext4_update_backup_sb(struct super_block *sb, if (ext4_has_metadata_csum(sb) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " - "superblock %llu\n", sb_block); + "superblock %llu", sb_block); unlock_buffer(bh); - err = -EFSBADCRC; goto out_bh; } func(es, arg); @@ -375,7 +374,8 @@ static long swap_inode_boot_loader(struct super_block *sb, blkcnt_t blocks; unsigned short bytes; - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, + EXT4_IGET_SPECIAL | EXT4_IGET_BAD); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); @@ -425,7 +425,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); - if (inode_bl->i_nlink == 0) { + if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0); @@ -452,9 +452,10 @@ static long swap_inode_boot_loader(struct super_block *sb, swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); + inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); - inode_bl->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); + inode_bl->i_generation = get_random_u32(); ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); @@ -665,6 +666,7 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_set_inode_flags(inode, false); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -730,6 +732,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) if (ext4_is_quota_file(inode)) return err; + err = dquot_initialize(inode); + if (err) + return err; + err = ext4_get_inode_loc(inode, &iloc); if (err) return err; @@ -745,10 +751,6 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) brelse(iloc.bh); } - err = 
dquot_initialize(inode); - if (err) - return err; - handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); @@ -775,6 +777,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) EXT4_I(inode)->i_projid = kprojid; inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) @@ -1060,9 +1063,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) if (!EXT4_SB(sb)->s_journal) return -ENODEV; - if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) - return -EINVAL; - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; @@ -1154,19 +1154,22 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, if (fsuuid.fsu_len == 0) { fsuuid.fsu_len = UUID_SIZE; - if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len))) + if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, + sizeof(fsuuid.fsu_len))) return -EFAULT; - return -EINVAL; + return 0; } - if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) + if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; lock_buffer(sbi->s_sbh); memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); unlock_buffer(sbi->s_sbh); - if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || + copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) return -EFAULT; return 0; } @@ -1257,6 +1260,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 71f5b67d7f28..5b2ae37a8b80 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -910,7 +910,7 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, int *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp, *iter; + struct ext4_group_info *grp = NULL, *iter; int i; if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { @@ -927,7 +927,6 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); continue; } - grp = NULL; list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], bb_avg_fragment_size_node) { if (sbi->s_mb_stats) @@ -5205,7 +5204,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) mutex_lock(&ac->ac_lg->lg_mutex); } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { @@ -5254,8 +5253,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? 
"" : "non-"); - return 0; - } static noinline_for_stack void @@ -5592,11 +5589,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out; } - *errp = ext4_mb_initialize_context(ac, ar); - if (*errp) { - ar->len = 0; - goto out; - } + ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 54e7d3c95fd7..a19a9661646e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -56,8 +56,7 @@ static int finish_range(handle_t *handle, struct inode *inode, retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); lb->first_pblock = 0; return retval; } @@ -425,7 +424,8 @@ int ext4_ext_migrate(struct inode *inode) * already is extent-based, error out. */ if (!ext4_has_feature_extents(inode->i_sb) || - (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + ext4_has_inline_data(inode)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9af68a7ecdcf..4681fff6665f 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -262,13 +262,7 @@ void ext4_stop_mmpd(struct ext4_sb_info *sbi) */ static unsigned int mmp_new_seq(void) { - u32 new_seq; - - do { - new_seq = prandom_u32(); - } while (new_seq > EXT4_MMP_SEQ_MAX); - - return new_seq; + return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1); } /* diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 701f1d6a217f..8dbb87edf24c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,8 +32,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, if (IS_ERR(path)) return PTR_ERR(path); if (path[ext_depth(inode)].p_ext == NULL) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); *ppath = NULL; return -ENODATA; } @@ -103,12 +102,10 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); - ext4_ext_drop_refs(path); } ret = 1; out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -256,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; + struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; @@ -316,6 +314,13 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. 
*/ + folio[0] = page_folio(pagep[0]); + folio[1] = page_folio(pagep[1]); + + VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); + VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); + VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -334,10 +339,10 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((page_has_private(pagep[0]) && - !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && - !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; goto drop_data_sem; } @@ -347,19 +352,21 @@ again: block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto unlock_pages; + goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); if (*err) - goto unlock_pages; + goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ - if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; - goto unlock_pages; + goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, @@ -372,13 +379,13 @@ data_copy: replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto unlock_pages; + goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!page_has_buffers(pagep[0])) - create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); - bh = page_buffers(pagep[0]); + if (!folio_buffers(folio[0])) + create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); + bh = folio_buffers(folio[0]); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { @@ -388,7 +395,7 @@ data_copy: bh = bh->b_this_page; } if (!*err) - *err = block_commit_write(pagep[0], from, from + replaced_size); + *err = block_commit_write(&folio[0]->page, from, from + replaced_size); if (unlikely(*err < 0)) goto repair_branches; @@ -398,11 +405,11 @@ data_copy: *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); -unlock_pages: - unlock_page(pagep[0]); - put_page(pagep[0]); - unlock_page(pagep[1]); - put_page(pagep[1]); +unlock_folios: + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && @@ -433,7 +440,7 @@ repair_branches: *err = -EIO; } replaced_count = 0; - goto unlock_pages; + goto unlock_folios; } /** @@ -472,19 +479,17 @@ mext_check_arguments(struct inode *orig_inode, if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) return -EPERM; - /* Ext4 move extent does not support swapfile */ + /* Ext4 move extent does not support swap files */ if 
(IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should " - "not be swapfile [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EBUSY; + return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should " - "not be quota files [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EBUSY; + return -EOPNOTSUPP; } /* Ext4 move extent supports only extent based file */ @@ -631,11 +636,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, if (ret) goto out; ex = path[path->p_depth].p_ext; - next_blk = ext4_ext_next_allocated_block(path); cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { + next_blk = ext4_ext_next_allocated_block(path); if (next_blk == EXT_MAX_BLOCKS) { ret = -ENODATA; goto out; @@ -663,7 +668,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, donor_page_index = d_start >> (PAGE_SHIFT - donor_inode->i_blkbits); offset_in_page = o_start % blocks_per_page; - if (cur_len > blocks_per_page- offset_in_page) + if (cur_len > blocks_per_page - offset_in_page) cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: @@ -694,8 +699,7 @@ out: ext4_discard_preallocations(donor_inode, 0); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3a31b662f661..dd28453d6ea3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -85,15 +85,20 @@ static struct buffer_head *ext4_append(handle_t *handle, return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_mark_inode_dirty(handle, inode); + if (err) + goto out; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); - if (err) { - brelse(bh); - ext4_std_error(inode->i_sb, err); - return ERR_PTR(err); - } + if (err) + goto out; return bh; + +out: + brelse(bh); + ext4_std_error(inode->i_sb, err); + return ERR_PTR(err); } static int ext4_dx_csum_verify(struct inode *inode, @@ -126,7 +131,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, struct ext4_dir_entry *dirent; int is_dx_block = 0; - if (block >= inode->i_size) { + if (block >= inode->i_size >> inode->i_blkbits) { ext4_error_inode(inode, func, line, block, "Attempting to read directory block (%u) that is past i_size (%llu)", block, inode->i_size); @@ -2254,8 +2259,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; - while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, + (data2 + (blocksize - csum_size) - + (char *) de))) { + brelse(bh2); + brelse(bh); + return -EFSCORRUPTED; + } de = de2; + } de->rec_len = ext4_rec_len_to_disk(data2 + 
(blocksize - csum_size) - (char *) de, blocksize); @@ -2849,7 +2862,7 @@ retry: } static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2871,7 +2884,7 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_unlock_inode; @@ -2882,7 +2895,7 @@ retry: ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return finish_open_simple(file, err); err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); @@ -3191,14 +3204,20 @@ end_rmdir: return retval; } -int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, - struct inode *inode) +int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, + struct dentry *dentry /* NULL during fast_commit recovery */) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; + handle_t *handle; int skip_remove_dentry = 0; + /* + * Keep this outside the transaction; it may have to set up the + * directory's encryption key, which isn't GFP_NOFS-safe. + */ bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); @@ -3215,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else - goto out; + goto out_bh; + } + + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_bh; } if (IS_DIRSYNC(dir)) @@ -3224,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto out; + goto out_handle; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto out; + goto out_handle; } else { retval = 0; } @@ -3242,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); - -out: + if (dentry && !retval) + ext4_fc_track_unlink(handle, dentry); +out_handle: + ext4_journal_stop(handle); +out_bh: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { - handle_t *handle; int retval; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) @@ -3268,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (retval) goto out_trace; - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) { - retval = PTR_ERR(handle); - goto out_trace; - } - - retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); - if (!retval) - ext4_fc_track_unlink(handle, dentry); + retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. 
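	 * (Concretely: after this unlink the dentry would turn negative;
	 * a casefolded directory must not serve that cached negative
	 * entry to a later lookup under a different case spelling, hence
	 * the d_invalidate() below.)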
Eventually we'll want avoid @@ -3288,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - if (handle) - ext4_journal_stop(handle); out_trace: trace_ext4_unlink_exit(dentry, retval); @@ -3781,6 +3798,9 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, retval = dquot_initialize(old.dir); if (retval) return retval; + retval = dquot_initialize(old.inode); + if (retval) + return retval; retval = dquot_initialize(new.dir); if (retval) return retval; @@ -4181,7 +4201,7 @@ const struct inode_operations ext4_dir_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, @@ -4192,6 +4212,6 @@ const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 69a9cf9137a6..e5b47dda3317 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -412,7 +412,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " - "clearing orphan list.\n"); + "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 97fa7b4c645f..beaec6d81074 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -430,25 +430,20 @@ submit_and_retry: int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, - int len, - bool keep_towrite) + int len) { struct page *bounce_page = NULL; struct inode *inode = page->mapping->host; unsigned block_start; struct buffer_head *bh, *head; int ret = 0; - int nr_submitted = 0; int nr_to_submit = 0; struct writeback_control *wbc = io->io_wbc; + bool keep_towrite = false; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - if (keep_towrite) - set_page_writeback_keepwrite(page); - else - set_page_writeback(page); ClearPageError(page); /* @@ -482,16 +477,31 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* A hole? We can safely clear the dirty bit */ if (!buffer_mapped(bh)) clear_buffer_dirty(bh); - if (io->io_bio) - ext4_io_submit(io); + /* + * Keeping dirty some buffer we cannot write? Make sure + * to redirty the page and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the page. + * This happens e.g. when doing writeout for + * transaction commit. + */ + if (buffer_dirty(bh)) { + if (!PageDirty(page)) + redirty_page_for_writepage(wbc, page); + keep_towrite = true; + } continue; } if (buffer_new(bh)) clear_buffer_new(bh); set_buffer_async_write(bh); + clear_buffer_dirty(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); + /* Nothing to submit? Just unlock the page... 
*/ + if (!nr_to_submit) + goto unlock; + bh = head = page_buffers(page); /* @@ -532,27 +542,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); redirty_page_for_writepage(wbc, page); do { - clear_buffer_async_write(bh); + if (buffer_async_write(bh)) { + clear_buffer_async_write(bh); + set_buffer_dirty(bh); + } bh = bh->b_this_page; } while (bh != head); goto unlock; } } + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; io_submit_add_bh(io, inode, bounce_page ? bounce_page : page, bh); - nr_submitted++; - clear_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - unlock: unlock_page(page); - /* Nothing submitted - we have to end page writeback */ - if (!nr_submitted) - end_page_writeback(page); return ret; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index e02a5f14e021..d5266932ce6c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -75,14 +75,10 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; - /* PG_error was set if any post_read step failed */ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } unlock_page(page); } if (bio->bi_private) @@ -96,10 +92,12 @@ static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - fscrypt_decrypt_bio(ctx->bio); - - bio_post_read_processing(ctx); + if (fscrypt_decrypt_bio(bio)) + bio_post_read_processing(ctx); + else + __read_end_io(bio); } static void verity_work(struct work_struct *work) @@ -408,9 +406,8 @@ int ext4_mpage_readpages(struct inode *inode, int __init ext4_init_post_read_processing(void) { - bio_post_read_ctx_cache = - kmem_cache_create("ext4_bio_post_read_ctx", - sizeof(struct bio_post_read_ctx), 0, 0, NULL); + bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); + if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index fea2a68d067b..6b91443d6bf3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1110,6 +1110,16 @@ exit_free: return err; } +static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, + ext4_group_t group) +{ + struct ext4_super_block *es = (struct ext4_super_block *) data; + + es->s_block_group_nr = cpu_to_le16(group); + if (ext4_has_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(sb, es); +} + /* * Update the backup copies of the ext4 metadata. 
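Regarding the readpage.c hunk above: KMEM_CACHE() is a convenience macro (see include/linux/slab.h) that derives the cache name, object size and alignment from the struct, so the conversion is equivalent to:

	bio_post_read_ctx_cache =
		kmem_cache_create("bio_post_read_ctx",
				  sizeof(struct bio_post_read_ctx),
				  __alignof__(struct bio_post_read_ctx),
				  SLAB_RECLAIM_ACCOUNT, NULL);

Note the two behavior changes folded in: the cache name loses its "ext4_" prefix, because the macro stringifies the struct name, and the cache is now created with SLAB_RECLAIM_ACCOUNT.
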
These don't need to be part * of the main resize transaction, because e2fsck will re-write them if there @@ -1158,6 +1168,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, while (group < sbi->s_groups_count) { struct buffer_head *bh; ext4_fsblk_t backup_block; + int has_super = ext4_bg_has_super(sb, group); + ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group); /* Out of journal space, and can't get more - abort - so sad */ err = ext4_resize_ensure_credits_batch(handle, 1); @@ -1167,8 +1179,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, if (meta_bg == 0) backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; else - backup_block = (ext4_group_first_block_no(sb, group) + - ext4_bg_has_super(sb, group)); + backup_block = first_block + has_super; bh = sb_getblk(sb, backup_block); if (unlikely(!bh)) { @@ -1186,6 +1197,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); + if (has_super && (backup_block == first_block)) + ext4_set_block_group_nr(sb, bh->b_data, group); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -1471,8 +1484,6 @@ static void ext4_update_super(struct super_block *sb, * active. */ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + reserved_blocks); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, @@ -1508,6 +1519,8 @@ static void ext4_update_super(struct super_block *sb, ext4_calculate_overhead(sb); es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, @@ -1591,8 +1604,8 @@ exit_journal: int meta_bg = ext4_has_feature_meta_bg(sb); sector_t old_gdb = 0; - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block), 0); + update_backups(sb, ext4_group_first_block_no(sb, 0), + (char *)es, sizeof(struct ext4_super_block), 0); for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; @@ -1803,7 +1816,7 @@ errout: if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, + update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); } return err; @@ -1826,7 +1839,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1881,8 +1893,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - err = ext4_group_extend_no_check(sb, o_blocks_count, add); - return err; + return ext4_group_extend_no_check(sb, o_blocks_count, add); } /* ext4_group_extend */ @@ -2122,7 +2133,7 @@ retry: goto out; } - if (ext4_blocks_count(es) == n_blocks_count) + if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0) goto out; err = ext4_alloc_flex_bg_array(sb, n_group + 1); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a66abcca1a8..16a343e8047d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -205,19 +205,12 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t 
op_flags, bh_end_io_t *end_io int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { - if (trylock_buffer(bh)) { - if (wait) - return ext4_read_bh(bh, op_flags, NULL); + lock_buffer(bh); + if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL); return 0; } - if (wait) { - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return 0; - return -EIO; - } - return 0; + return ext4_read_bh(bh, op_flags, NULL); } /* @@ -264,7 +257,8 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); if (likely(bh)) { - ext4_read_bh_lock(bh, REQ_RAHEAD, false); + if (trylock_buffer(bh)) + ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); brelse(bh); } } @@ -546,8 +540,7 @@ static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else - ret = jbd2_journal_submit_inode_data_buffers(jinode); - + ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } @@ -1212,7 +1205,8 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) - ext4_msg(sb, KERN_INFO, "unmounting filesystem."); + ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", + &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@ -1329,6 +1323,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; inode_set_iversion(&ei->vfs_inode, 1); + ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); atomic_set(&ei->i_prealloc_active, 0); @@ -1576,7 +1571,7 @@ enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, - Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, @@ -1585,7 +1580,7 @@ enum { Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, @@ -1662,9 +1657,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), - fsparam_flag ("nouser_xattr", Opt_nouser_xattr), fsparam_flag ("acl", Opt_acl), - fsparam_flag ("noacl", Opt_noacl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), @@ -1694,7 +1687,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), - fsparam_flag ("i_version", Opt_i_version), + fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), @@ -1749,10 +1742,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static const char 
deprecated_msg[] = - "Mount option \"%s\" will be removed by %s\n" - "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; - #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 @@ -1814,13 +1803,10 @@ static const struct mount_opts { {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, - {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, - {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, - {Opt_noacl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, @@ -2120,10 +2106,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) else return note_qf_name(fc, GRPQUOTA, param); #endif - case Opt_noacl: - case Opt_nouser_xattr: - ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5"); - break; case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, @@ -2140,11 +2122,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_abort: ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED); return 0; - case Opt_i_version: - ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20"); - ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n"); - ctx_set_flags(ctx, SB_I_VERSION); - return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); @@ -2271,7 +2248,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - error = fs_lookup_param(fc, param, 1, &path); + error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); @@ -2814,14 +2791,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; - /* - * i_version differs from common mount option iversion so we have - * to let vfs know that it was set, otherwise it would get cleared - * on remount - */ - if (ctx->mask_s_flags & SB_I_VERSION) - fc->sb_flags |= SB_I_VERSION; - #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); @@ -2970,8 +2939,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); - if (sb->s_flags & SB_I_VERSION) - SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & @@ -3767,6 +3734,7 @@ static int ext4_lazyinit_thread(void *arg) unsigned long next_wakeup, cur; BUG_ON(NULL == eli); + set_freezable(); cont_thread: while (true) { @@ -3811,8 +3779,7 @@ cont_thread: } if (!progress) { elr->lr_next_sched = jiffies + - (prandom_u32() - % (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; @@ -3959,8 +3926,7 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. 
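	 *
	 * (get_random_u32_below(n), used below, returns a uniform value
	 * in [0, n). The open-coded prandom_u32() % n it replaces is
	 * slightly biased toward small values whenever n does not divide
	 * 2^32. A userspace model of one bias-free construction, by
	 * rejection sampling; the kernel's implementation differs
	 * internally, and rand32() is an assumed uniform 32-bit source:
	 *
	 *	uint32_t random_below(uint32_t n)
	 *	{
	 *		uint32_t lim = UINT32_MAX - (UINT32_MAX % n);
	 *		uint32_t r;
	 *
	 *		do
	 *			r = rand32();
	 *		while (r >= lim);	// drop the biased tail
	 *		return r % n;
	 *	}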
*/ - elr->lr_next_sched = jiffies + (prandom_u32() % - (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } @@ -3982,9 +3948,9 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && - (first_not_zeroed == ngroups || sb_rdonly(sb) || - !test_opt(sb, INIT_INODE_TABLE))) + if (sb_rdonly(sb) || + (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && + (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); @@ -4311,112 +4277,10 @@ err_out: return NULL; } -static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) +static void ext4_set_def_opts(struct super_block *sb, + struct ext4_super_block *es) { - struct buffer_head *bh, **group_desc; - struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct flex_groups **flex_groups; - ext4_fsblk_t block; - ext4_fsblk_t logical_sb_block; - unsigned long offset = 0; unsigned long def_mount_opts; - struct inode *root; - int ret = -ENOMEM; - int blocksize, clustersize; - unsigned int db_count; - unsigned int i; - int needs_recovery, has_huge_files; - __u64 blocks_count; - int err = 0; - ext4_group_t first_not_zeroed; - struct ext4_fs_context *ctx = fc->fs_private; - int silent = fc->sb_flags & SB_SILENT; - - /* Set defaults for the variables that will be set during parsing */ - if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - - sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; - sbi->s_sectors_written_start = - part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); - - /* -EINVAL is default */ - ret = -EINVAL; - blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); - if (!blocksize) { - ext4_msg(sb, KERN_ERR, "unable to set blocksize"); - goto out_fail; - } - - /* - * The ext4 superblock will not be buffer aligned for other than 1kB - * block sizes. We need to calculate the offset from buffer start. - */ - if (blocksize != EXT4_MIN_BLOCK_SIZE) { - logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; - offset = do_div(logical_sb_block, blocksize); - } else { - logical_sb_block = sbi->s_sb_block; - } - - bh = ext4_sb_bread_unmovable(sb, logical_sb_block); - if (IS_ERR(bh)) { - ext4_msg(sb, KERN_ERR, "unable to read superblock"); - ret = PTR_ERR(bh); - goto out_fail; - } - /* - * Note: s_es must be initialized as soon as possible because - * some ext4 macro-instructions depend on its value - */ - es = (struct ext4_super_block *) (bh->b_data + offset); - sbi->s_es = es; - sb->s_magic = le16_to_cpu(es->s_magic); - if (sb->s_magic != EXT4_SUPER_MAGIC) - goto cantfind_ext4; - sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); - - /* Warn if metadata_csum and gdt_csum are both set. 
*/ - if (ext4_has_feature_metadata_csum(sb) && - ext4_has_feature_gdt_csum(sb)) - ext4_warning(sb, "metadata_csum and uninit_bg are " - "redundant flags; please run fsck."); - - /* Check for a known checksum algorithm */ - if (!ext4_verify_csum_type(sb, es)) { - ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " - "unknown checksum algorithm."); - silent = 1; - goto cantfind_ext4; - } - ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, - ext4_orphan_file_block_trigger); - - /* Load the checksum driver */ - sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(sbi->s_chksum_driver)) { - ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); - ret = PTR_ERR(sbi->s_chksum_driver); - sbi->s_chksum_driver = NULL; - goto failed_mount; - } - - /* Check superblock checksum */ - if (!ext4_superblock_csum_verify(sb, es)) { - ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " - "invalid superblock checksum. Run e2fsck?"); - silent = 1; - ret = -EFSBADCRC; - goto cantfind_ext4; - } - - /* Precompute checksum seed for all metadata */ - if (ext4_has_feature_csum_seed(sb)) - sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, - sizeof(es->s_uuid)); /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -4445,9 +4309,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); - if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) + if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); - else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) + else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); @@ -4456,12 +4320,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); - sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); - sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); - sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; - sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; - sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); @@ -4473,31 +4331,96 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - /* - * set default s_li_wait_mult for lazyinit, for the case there is - * no mount option specified. 
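Earlier in this hunk the errors-policy defaults switch from dereferencing sbi->s_es to the local es pointer; the mapping itself is easy to restate as a pure function. A hedged sketch (the case values are the on-disk EXT4_ERRORS_* constants as I understand them, 1 = continue, 2 = remount read-only, 3 = panic; anything unrecognised falls back to remount-ro, matching the final else):

enum ext4_errors_mode { ERRORS_CONTINUE, ERRORS_REMOUNT_RO, ERRORS_PANIC };

/* Illustrative restatement of ext4_set_def_opts()'s errors handling. */
static enum ext4_errors_mode errors_default(unsigned short s_errors)
{
	switch (s_errors) {
	case 3:			/* EXT4_ERRORS_PANIC */
		return ERRORS_PANIC;
	case 1:			/* EXT4_ERRORS_CONTINUE */
		return ERRORS_CONTINUE;
	default:		/* includes 2, the explicit remount-ro value */
		return ERRORS_REMOUNT_RO;
	}
}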
- */ - sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (sb->s_blocksize == PAGE_SIZE) + set_opt(sb, DIOREAD_NOLOCK); +} - if (le32_to_cpu(es->s_log_block_size) > - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log block size: %u", - le32_to_cpu(es->s_log_block_size)); - goto failed_mount; - } - if (le32_to_cpu(es->s_log_cluster_size) > - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log cluster size: %u", - le32_to_cpu(es->s_log_cluster_size)); - goto failed_mount; +static int ext4_handle_clustersize(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int clustersize; + + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + if (ext4_has_feature_bigalloc(sb)) { + if (clustersize < sb->s_blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%lu)", clustersize, sb->s_blocksize); + return -EINVAL; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + return -EINVAL; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + return -EINVAL; + } + } else { + if (clustersize != sb->s_blocksize) { + ext4_msg(sb, KERN_ERR, + "fragment/cluster size (%d) != " + "block size (%lu)", clustersize, sb->s_blocksize); + return -EINVAL; + } + if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + return -EINVAL; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; } + sbi->s_cluster_ratio = clustersize / sb->s_blocksize; - blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + /* Do we have standard group size of clustersize * 8 blocks ? 
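The consistency rules that the new ext4_handle_clustersize() enforces for bigalloc can be summarised in a few lines. A sketch under the same assumptions as the hunk above (the log sizes are the on-disk log2 fields; the function name and return convention are illustrative):

/* For bigalloc: a cluster is a power-of-two multiple of the block size,
 * and the per-group block and cluster counts must agree with the ratio. */
static int bigalloc_geometry_ok(unsigned int log_block_size,
				unsigned int log_cluster_size,
				unsigned long blocks_per_group,
				unsigned long clusters_per_group)
{
	unsigned int cluster_bits;

	if (log_cluster_size < log_block_size)
		return 0;	/* cluster smaller than block */
	cluster_bits = log_cluster_size - log_block_size;
	if (blocks_per_group != (clusters_per_group << cluster_bits))
		return 0;	/* inconsistent group geometry */
	return 1;
}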
*/ + if (sbi->s_blocks_per_group == clustersize << 3) + set_opt2(sb, STD_GROUP_SIZE); - if (blocksize == PAGE_SIZE) - set_opt(sb, DIOREAD_NOLOCK); + return 0; +} + +static void ext4_fast_commit_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* Initialize fast commit stuff */ + atomic_set(&sbi->s_fc_subtid, 0); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); + sbi->s_fc_bytes = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + sbi->s_fc_ineligible_tid = 0; + spin_lock_init(&sbi->s_fc_lock); + memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); + sbi->s_fc_replay_state.fc_regions = NULL; + sbi->s_fc_replay_state.fc_regions_size = 0; + sbi->s_fc_replay_state.fc_regions_used = 0; + sbi->s_fc_replay_state.fc_regions_valid = 0; + sbi->s_fc_replay_state.fc_modified_inodes = NULL; + sbi->s_fc_replay_state.fc_modified_inodes_size = 0; + sbi->s_fc_replay_state.fc_modified_inodes_used = 0; +} + +static int ext4_inode_info_init(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -4508,16 +4431,16 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); - goto failed_mount; + return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || - (sbi->s_inode_size > blocksize)) { + (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); - ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize); - goto failed_mount; + ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); + return -EINVAL; } /* * i_atime_extra is the last extra field available for @@ -4535,6 +4458,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } sb->s_time_min = EXT4_TIMESTAMP_MIN; } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; @@ -4546,7 +4470,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); - goto failed_mount; + return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; @@ -4555,91 +4479,112 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); - goto failed_mount; + return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } - err = parse_apply_sb_mount_options(sb, ctx); - if (err < 0) - goto failed_mount; + return 0; +} - sbi->s_def_mount_opt = sbi->s_mount_opt; +#if IS_ENABLED(CONFIG_UNICODE) +static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) +{ + const struct ext4_sb_encodings *encoding_info; + struct unicode_map *encoding; + __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); - err = ext4_check_opt_consistency(fc, sb); - if (err < 0) - goto failed_mount; + if (!ext4_has_feature_casefold(sb) || sb->s_encoding) + return 0; - ext4_apply_options(fc, sb); + encoding_info = ext4_sb_read_encoding(es); + if (!encoding_info) { + ext4_msg(sb, KERN_ERR, + "Encoding requested 
by superblock is unknown"); + return -EINVAL; + } -#if IS_ENABLED(CONFIG_UNICODE) - if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { - const struct ext4_sb_encodings *encoding_info; - struct unicode_map *encoding; - __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); + encoding = utf8_load(encoding_info->version); + if (IS_ERR(encoding)) { + ext4_msg(sb, KERN_ERR, + "can't mount with superblock charset: %s-%u.%u.%u " + "not supported by the kernel. flags: 0x%x.", + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); + return -EINVAL; + } + ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); - encoding_info = ext4_sb_read_encoding(es); - if (!encoding_info) { - ext4_msg(sb, KERN_ERR, - "Encoding requested by superblock is unknown"); - goto failed_mount; - } + sb->s_encoding = encoding; + sb->s_encoding_flags = encoding_flags; - encoding = utf8_load(encoding_info->version); - if (IS_ERR(encoding)) { - ext4_msg(sb, KERN_ERR, - "can't mount with superblock charset: %s-%u.%u.%u " - "not supported by the kernel. flags: 0x%x.", - encoding_info->name, - unicode_major(encoding_info->version), - unicode_minor(encoding_info->version), - unicode_rev(encoding_info->version), - encoding_flags); - goto failed_mount; - } - ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " - "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, - unicode_major(encoding_info->version), - unicode_minor(encoding_info->version), - unicode_rev(encoding_info->version), - encoding_flags); + return 0; +} +#else +static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) +{ + return 0; +} +#endif + +static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* Warn if metadata_csum and gdt_csum are both set. */ + if (ext4_has_feature_metadata_csum(sb) && + ext4_has_feature_gdt_csum(sb)) + ext4_warning(sb, "metadata_csum and uninit_bg are " + "redundant flags; please run fsck."); - sb->s_encoding = encoding; - sb->s_encoding_flags = encoding_flags; + /* Check for a known checksum algorithm */ + if (!ext4_verify_csum_type(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "unknown checksum algorithm."); + return -EINVAL; } -#endif + ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, + ext4_orphan_file_block_trigger); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); - /* can't mount with both data=journal and dioread_nolock. 
*/ - clear_opt(sb, DIOREAD_NOLOCK); - clear_opt2(sb, JOURNAL_FAST_COMMIT); - if (test_opt2(sb, EXPLICIT_DELALLOC)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and delalloc"); - goto failed_mount; - } - if (test_opt(sb, DAX_ALWAYS)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dax"); - goto failed_mount; - } - if (ext4_has_feature_encrypt(sb)) { - ext4_msg(sb, KERN_WARNING, - "encrypted files will use data=ordered " - "instead of data journaling mode"); - } - if (test_opt(sb, DELALLOC)) - clear_opt(sb, DELALLOC); - } else { - sb->s_iflags |= SB_I_CGROUPWB; + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + int ret = PTR_ERR(sbi->s_chksum_driver); + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + sbi->s_chksum_driver = NULL; + return ret; } - sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); + /* Check superblock checksum */ + if (!ext4_superblock_csum_verify(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "invalid superblock checksum. Run e2fsck?"); + return -EFSBADCRC; + } + /* Precompute checksum seed for all metadata */ + if (ext4_has_feature_csum_seed(sb)) + sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); + else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) + sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sizeof(es->s_uuid)); + return 0; +} + +static int ext4_check_feature_compatibility(struct super_block *sb, + struct ext4_super_block *es, + int silent) +{ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || @@ -4653,7 +4598,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); - goto failed_mount; + return -EINVAL; } /* @@ -4663,7 +4608,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); - goto failed_mount; + return -EINVAL; } } @@ -4677,10 +4622,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) - goto failed_mount; + return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); - goto failed_mount; + return -EINVAL; } } @@ -4694,10 +4639,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * it's actually an ext4 filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) - goto failed_mount; + return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); - goto failed_mount; + return -EINVAL; } } @@ -4707,9 +4652,463 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * so there is a chance incompat flags are set on a rev 0 filesystem. 
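The checks collected above into ext4_check_feature_compatibility() rest on the long-standing ext2-family feature-flag contract: unknown COMPAT features may be ignored, unknown RO_COMPAT features are only safe for a read-only mount, and unknown INCOMPAT features make the filesystem unmountable. A toy restatement of that contract (illustrative types and names, not the kernel's):

enum mount_result { MOUNT_RW, MOUNT_RO, MOUNT_REFUSE };

/* unknown_* are the on-disk feature bits this driver does not
 * implement; readonly says whether a read-only mount was requested. */
static enum mount_result feature_set_check(unsigned int unknown_incompat,
					   unsigned int unknown_ro_compat,
					   int readonly)
{
	if (unknown_incompat)
		return MOUNT_REFUSE;
	if (unknown_ro_compat && !readonly)
		return MOUNT_REFUSE;	/* retrying with -o ro may succeed */
	return readonly ? MOUNT_RO : MOUNT_RW;
}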
*/ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) + return -EINVAL; + + return 0; +} + +static int ext4_geometry_check(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + __u64 blocks_count; + + /* check blocks count against device size */ + blocks_count = sb_bdev_nr_blocks(sb); + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", + ext4_blocks_count(es), blocks_count); + return -EINVAL; + } + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); + return -EINVAL; + } + if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && + (sbi->s_cluster_ratio == 1)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block is 0 with a 1k block and cluster size"); + return -EINVAL; + } + + blocks_count = (ext4_blocks_count(es) - + le32_to_cpu(es->s_first_data_block) + + EXT4_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " + "(block count %llu, first data block %u, " + "blocks per group %lu)", blocks_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + return -EINVAL; + } + sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != + le32_to_cpu(es->s_inodes_count)) { + ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", + le32_to_cpu(es->s_inodes_count), + ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); + return -EINVAL; + } + + return 0; +} + +static void ext4_group_desc_free(struct ext4_sb_info *sbi) +{ + struct buffer_head **group_desc; + int i; + + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(group_desc[i]); + kvfree(group_desc); + rcu_read_unlock(); +} + +static int ext4_group_desc_init(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t logical_sb_block, + ext4_group_t *first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int db_count; + ext4_fsblk_t block; + int ret; + int i; + + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) > db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + return -EINVAL; + } + } + rcu_assign_pointer(sbi->s_group_desc, + kvmalloc_array(db_count, + sizeof(struct buffer_head *), + GFP_KERNEL)); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + return -ENOMEM; + } + + bgl_lock_init(sbi->s_blockgroup_lock); + + /* Pre-read the descriptors into the buffer cache */ + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + ext4_sb_breadahead_unmovable(sb, block); + } + + for (i = 0; i < db_count; i++) { + struct 
buffer_head *bh; + + block = descriptor_loc(sb, logical_sb_block, i); + bh = ext4_sb_bread_unmovable(sb, block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + sbi->s_gdb_count = i; + ret = PTR_ERR(bh); + goto out; + } + rcu_read_lock(); + rcu_dereference(sbi->s_group_desc)[i] = bh; + rcu_read_unlock(); + } + sbi->s_gdb_count = db_count; + if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + ret = -EFSCORRUPTED; + goto out; + } + return 0; +out: + ext4_group_desc_free(sbi); + return ret; +} + +static int ext4_load_and_init_journal(struct super_block *sb, + struct ext4_super_block *es, + struct ext4_fs_context *ctx) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + err = ext4_load_journal(sb, es, ctx->journal_devnum); + if (err) + return err; + + if (ext4_has_feature_64bit(sb) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); + goto out; + } + + if (!set_journal_csum_feature_set(sb)) { + ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " + "feature set"); + goto out; + } + + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); + goto out; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + * capabilities: ORDERED_DATA if the journal can + * cope, else JOURNAL_DATA + */ + if (jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + set_opt(sb, ORDERED_DATA); + sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + } else { + set_opt(sb, JOURNAL_DATA); + sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + } + break; + + case EXT4_MOUNT_ORDERED_DATA: + case EXT4_MOUNT_WRITEBACK_DATA: + if (!jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); + goto out; + } + break; + default: + break; + } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + goto out; + } + + set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); + + sbi->s_journal->j_submit_inode_data_buffers = + ext4_journal_submit_inode_data_buffers; + sbi->s_journal->j_finish_inode_data_buffers = + ext4_journal_finish_inode_data_buffers; + + return 0; + +out: + /* flush s_error_work before journal destroy. */ + flush_work(&sbi->s_error_work); + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + return -EINVAL; +} + +static int ext4_journal_data_mode_check(struct super_block *sb) +{ + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " + "data=journal disables delayed allocation, " + "dioread_nolock, O_DIRECT and fast_commit support!\n"); + /* can't mount with both data=journal and dioread_nolock. 
*/ + clear_opt(sb, DIOREAD_NOLOCK); + clear_opt2(sb, JOURNAL_FAST_COMMIT); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + return -EINVAL; + } + if (test_opt(sb, DAX_ALWAYS)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + return -EINVAL; + } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } else { + sb->s_iflags |= SB_I_CGROUPWB; + } + + return 0; +} + +static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, + int silent) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es; + ext4_fsblk_t logical_sb_block; + unsigned long offset = 0; + struct buffer_head *bh; + int ret = -EINVAL; + int blocksize; + + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); + if (!blocksize) { + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); + return -EINVAL; + } + + /* + * The ext4 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT4_MIN_BLOCK_SIZE) { + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + } else { + logical_sb_block = sbi->s_sb_block; + } + + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, "unable to read superblock"); + return PTR_ERR(bh); + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext4 macro-instructions depend on its value + */ + es = (struct ext4_super_block *) (bh->b_data + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT4_SUPER_MAGIC) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto out; + } + + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); + goto out; + } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto out; + } + + blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + /* + * If the default block size is not the same as the real block size, + * we need to reload it. + */ + if (sb->s_blocksize == blocksize) { + *lsb = logical_sb_block; + sbi->s_sbh = bh; + return 0; + } + + /* + * bh must be released before kill_bdev(), otherwise + * it won't be freed and its page also. kill_bdev() + * is called by sb_set_blocksize(). 
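The new ext4_load_super() has to cope with the superblock living at a fixed 1 KiB granularity regardless of the filesystem block size, which is why it splits a byte address into a device block plus an in-block offset above, and why it re-reads the superblock just below once the real block size is known. A self-contained sketch of that arithmetic (names are illustrative; 1024 is EXT4_MIN_BLOCK_SIZE):

#include <stdint.h>

struct sb_loc {
	uint64_t block;		/* device block holding the superblock */
	unsigned int offset;	/* byte offset within that block */
};

static struct sb_loc locate_sb(uint64_t sb_block, unsigned int blocksize)
{
	uint64_t byte = sb_block * 1024;	/* superblock slots are 1 KiB */
	struct sb_loc loc = {
		.block  = byte / blocksize,
		.offset = (unsigned int)(byte % blocksize),
	};
	return loc;	/* with blocksize == 1024 this is the identity */
}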
+ */ + brelse(bh); + /* Validate the filesystem blocksize */ + if (!sb_set_blocksize(sb, blocksize)) { + ext4_msg(sb, KERN_ERR, "bad block size %d", + blocksize); + bh = NULL; + goto out; + } + + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); + ret = PTR_ERR(bh); + bh = NULL; + goto out; + } + es = (struct ext4_super_block *)(bh->b_data + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { + ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); + goto out; + } + *lsb = logical_sb_block; + sbi->s_sbh = bh; + return 0; +out: + brelse(bh); + return ret; +} + +static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) +{ + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups **flex_groups; + ext4_fsblk_t block; + ext4_fsblk_t logical_sb_block; + struct inode *root; + int ret = -ENOMEM; + unsigned int i; + int needs_recovery, has_huge_files; + int err = 0; + ext4_group_t first_not_zeroed; + struct ext4_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; + + /* Set defaults for the variables that will be set during parsing */ + if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); + + /* -EINVAL is default */ + ret = -EINVAL; + err = ext4_load_super(sb, &logical_sb_block, silent); + if (err) + goto out_fail; + + es = sbi->s_es; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + + err = ext4_init_metadata_csum(sb, es); + if (err) goto failed_mount; - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { + ext4_set_def_opts(sb, es); + + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + + /* + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + + if (ext4_inode_info_init(sb, es)) + goto failed_mount; + + err = parse_apply_sb_mount_options(sb, ctx); + if (err < 0) + goto failed_mount; + + sbi->s_def_mount_opt = sbi->s_mount_opt; + + err = ext4_check_opt_consistency(fc, sb); + if (err < 0) + goto failed_mount; + + ext4_apply_options(fc, sb); + + if (ext4_encoding_init(sb, es)) + goto failed_mount; + + if (ext4_journal_data_mode_check(sb)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
SB_POSIXACL : 0); + + /* i_version is always enabled now */ + sb->s_flags |= SB_I_VERSION; + + if (ext4_check_feature_compatibility(sb, es, silent)) + goto failed_mount; + + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); @@ -4717,7 +5116,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } if (sbi->s_daxdev) { - if (blocksize == PAGE_SIZE) + if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); @@ -4742,40 +5141,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount; } - if (sb->s_blocksize != blocksize) { - /* - * bh must be released before kill_bdev(), otherwise - * it won't be freed and its page also. kill_bdev() - * is called by sb_set_blocksize(). - */ - brelse(bh); - /* Validate the filesystem blocksize */ - if (!sb_set_blocksize(sb, blocksize)) { - ext4_msg(sb, KERN_ERR, "bad block size %d", - blocksize); - bh = NULL; - goto failed_mount; - } - - logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; - offset = do_div(logical_sb_block, blocksize); - bh = ext4_sb_bread_unmovable(sb, logical_sb_block); - if (IS_ERR(bh)) { - ext4_msg(sb, KERN_ERR, - "Can't read superblock on 2nd try"); - ret = PTR_ERR(bh); - bh = NULL; - goto failed_mount; - } - es = (struct ext4_super_block *)(bh->b_data + offset); - sbi->s_es = es; - if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { - ext4_msg(sb, KERN_ERR, - "Magic mismatch, very weird!"); - goto failed_mount; - } - } - has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); @@ -4797,19 +5162,21 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0) - goto cantfind_ext4; + sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto failed_mount; + } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || - sbi->s_inodes_per_group > blocksize * 8) { + sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); goto failed_mount; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; - sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); - sbi->s_sbh = bh; + sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); @@ -4835,54 +5202,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } } - /* Handle clustersize */ - clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - if (ext4_has_feature_bigalloc(sb)) { - if (clustersize < blocksize) { - ext4_msg(sb, KERN_ERR, - "cluster size (%d) smaller than " - "block size (%d)", clustersize, blocksize); - goto failed_mount; - } - sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - - 
le32_to_cpu(es->s_log_block_size); - sbi->s_clusters_per_group = - le32_to_cpu(es->s_clusters_per_group); - if (sbi->s_clusters_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#clusters per group too big: %lu", - sbi->s_clusters_per_group); - goto failed_mount; - } - if (sbi->s_blocks_per_group != - (sbi->s_clusters_per_group * (clustersize / blocksize))) { - ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " - "clusters per group (%lu) inconsistent", - sbi->s_blocks_per_group, - sbi->s_clusters_per_group); - goto failed_mount; - } - } else { - if (clustersize != blocksize) { - ext4_msg(sb, KERN_ERR, - "fragment/cluster size (%d) != " - "block size (%d)", clustersize, blocksize); - goto failed_mount; - } - if (sbi->s_blocks_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; - } - sbi->s_clusters_per_group = sbi->s_blocks_per_group; - sbi->s_cluster_bits = 0; - } - sbi->s_cluster_ratio = clustersize / blocksize; - - /* Do we have standard group size of clustersize * 8 blocks ? */ - if (sbi->s_blocks_per_group == clustersize << 3) - set_opt2(sb, STD_GROUP_SIZE); + if (ext4_handle_clustersize(sb)) + goto failed_mount; /* * Test whether we have more sectors than will fit in sector_t, @@ -4896,111 +5217,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount; } - if (EXT4_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext4; - - /* check blocks count against device size */ - blocks_count = sb_bdev_nr_blocks(sb); - if (blocks_count && ext4_blocks_count(es) > blocks_count) { - ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " - "exceeds size of device (%llu blocks)", - ext4_blocks_count(es), blocks_count); + if (ext4_geometry_check(sb, es)) goto failed_mount; - } - /* - * It makes no sense for the first data block to be beyond the end - * of the filesystem. 
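The geometry code being dropped from __ext4_fill_super() here moved verbatim into ext4_geometry_check() (shown earlier); its heart is a ceiling division over the data area, followed by the 2^32-minus-one-descriptor-block bound. The core arithmetic, restated with illustrative types:

#include <stdint.h>

/* Number of block groups: each group covers blocks_per_group blocks
 * starting at first_data_block, and the last group may be partial.
 * blocks_per_group must be non-zero (the mount path rejects 0). */
static uint64_t nr_block_groups(uint64_t total_blocks,
				uint32_t first_data_block,
				uint32_t blocks_per_group)
{
	return (total_blocks - first_data_block + blocks_per_group - 1) /
	       blocks_per_group;
}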
- */ - if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data " - "block %u is beyond end of filesystem (%llu)", - le32_to_cpu(es->s_first_data_block), - ext4_blocks_count(es)); - goto failed_mount; - } - if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && - (sbi->s_cluster_ratio == 1)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data " - "block is 0 with a 1k block and cluster size"); - goto failed_mount; - } - - blocks_count = (ext4_blocks_count(es) - - le32_to_cpu(es->s_first_data_block) + - EXT4_BLOCKS_PER_GROUP(sb) - 1); - do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); - if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { - ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " - "(block count %llu, first data block %u, " - "blocks per group %lu)", blocks_count, - ext4_blocks_count(es), - le32_to_cpu(es->s_first_data_block), - EXT4_BLOCKS_PER_GROUP(sb)); - goto failed_mount; - } - sbi->s_groups_count = blocks_count; - sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, - (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); - if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != - le32_to_cpu(es->s_inodes_count)) { - ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", - le32_to_cpu(es->s_inodes_count), - ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); - ret = -EINVAL; - goto failed_mount; - } - db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / - EXT4_DESC_PER_BLOCK(sb); - if (ext4_has_feature_meta_bg(sb)) { - if (le32_to_cpu(es->s_first_meta_bg) > db_count) { - ext4_msg(sb, KERN_WARNING, - "first meta block group too large: %u " - "(group descriptor block count %u)", - le32_to_cpu(es->s_first_meta_bg), db_count); - goto failed_mount; - } - } - rcu_assign_pointer(sbi->s_group_desc, - kvmalloc_array(db_count, - sizeof(struct buffer_head *), - GFP_KERNEL)); - if (sbi->s_group_desc == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory"); - ret = -ENOMEM; + err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); + if (err) goto failed_mount; - } - - bgl_lock_init(sbi->s_blockgroup_lock); - - /* Pre-read the descriptors into the buffer cache */ - for (i = 0; i < db_count; i++) { - block = descriptor_loc(sb, logical_sb_block, i); - ext4_sb_breadahead_unmovable(sb, block); - } - - for (i = 0; i < db_count; i++) { - struct buffer_head *bh; - - block = descriptor_loc(sb, logical_sb_block, i); - bh = ext4_sb_bread_unmovable(sb, block); - if (IS_ERR(bh)) { - ext4_msg(sb, KERN_ERR, - "can't read group descriptor %d", i); - db_count = i; - ret = PTR_ERR(bh); - goto failed_mount2; - } - rcu_read_lock(); - rcu_dereference(sbi->s_group_desc)[i] = bh; - rcu_read_unlock(); - } - sbi->s_gdb_count = db_count; - if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { - ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - ret = -EFSCORRUPTED; - goto failed_mount2; - } timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); @@ -5038,24 +5260,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - /* Initialize fast commit stuff */ - atomic_set(&sbi->s_fc_subtid, 0); - INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); - INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); - INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); - 
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); - sbi->s_fc_bytes = 0; - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); - memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); - sbi->s_fc_replay_state.fc_regions = NULL; - sbi->s_fc_replay_state.fc_regions_size = 0; - sbi->s_fc_replay_state.fc_regions_used = 0; - sbi->s_fc_replay_state.fc_regions_valid = 0; - sbi->s_fc_replay_state.fc_modified_inodes = NULL; - sbi->s_fc_replay_state.fc_modified_inodes_size = 0; - sbi->s_fc_replay_state.fc_modified_inodes_used = 0; + ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -5072,37 +5277,38 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - err = ext4_load_journal(sb, es, ctx->journal_devnum); + err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); - goto failed_mount_wq; + goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); - goto failed_mount_wq; - } if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit, fs mounted w/o journal"); - goto failed_mount_wq; + goto failed_mount3a; + } + + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_checksum, fs mounted w/o journal"); + goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { ext4_msg(sb, KERN_ERR, "can't mount with " "commit=%lu, fs mounted w/o journal", sbi->s_commit_interval / HZ); - goto failed_mount_wq; + goto failed_mount3a; } if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { ext4_msg(sb, KERN_ERR, "can't mount with " "data=, fs mounted w/o journal"); - goto failed_mount_wq; + goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); @@ -5110,76 +5316,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; - goto no_journal; } - if (ext4_has_feature_64bit(sb) && - !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_64BIT)) { - ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); - goto failed_mount_wq; - } - - if (!set_journal_csum_feature_set(sb)) { - ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " - "feature set"); - goto failed_mount_wq; - } - - if (test_opt2(sb, JOURNAL_FAST_COMMIT) && - !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { - ext4_msg(sb, KERN_ERR, - "Failed to set fast commit journal feature"); - goto failed_mount_wq; - } - - /* We have now updated the journal if required, so we can - * validate the data journaling mode. 
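The switch being removed just below (it now lives in ext4_load_and_init_journal(), shown earlier) picks a default data-journaling mode from the journal's capabilities. Reduced to its decision logic, a hedged sketch (the enum values stand in for the EXT4_MOUNT_* data flags; "revoke" refers to JBD2_FEATURE_INCOMPAT_REVOKE):

enum data_mode { DATA_UNSET = 0, DATA_ORDERED, DATA_JOURNAL, DATA_WRITEBACK };

/* No explicit data= option: prefer ordered mode when the journal
 * supports revoke records, else fall back to full data journaling.
 * Explicit ordered/writeback requests require revoke support. */
static int pick_data_mode(enum data_mode requested, int journal_has_revoke,
			  enum data_mode *out)
{
	if (requested == DATA_UNSET) {
		*out = journal_has_revoke ? DATA_ORDERED : DATA_JOURNAL;
		return 0;
	}
	if ((requested == DATA_ORDERED || requested == DATA_WRITEBACK) &&
	    !journal_has_revoke)
		return -1;	/* journal can't honour the requested mode */
	*out = requested;
	return 0;
}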
*/ - switch (test_opt(sb, DATA_FLAGS)) { - case 0: - /* No mode set, assume a default based on the journal - * capabilities: ORDERED_DATA if the journal can - * cope, else JOURNAL_DATA - */ - if (jbd2_journal_check_available_features - (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { - set_opt(sb, ORDERED_DATA); - sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; - } else { - set_opt(sb, JOURNAL_DATA); - sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; - } - break; - - case EXT4_MOUNT_ORDERED_DATA: - case EXT4_MOUNT_WRITEBACK_DATA: - if (!jbd2_journal_check_available_features - (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { - ext4_msg(sb, KERN_ERR, "Journal does not support " - "requested data journaling mode"); - goto failed_mount_wq; - } - break; - default: - break; - } - - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && - test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit in data=ordered mode"); - goto failed_mount_wq; - } - - set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); - - sbi->s_journal->j_submit_inode_data_buffers = - ext4_journal_submit_inode_data_buffers; - sbi->s_journal->j_finish_inode_data_buffers = - ext4_journal_finish_inode_data_buffers; - -no_journal: if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { @@ -5198,7 +5336,7 @@ no_journal: } } - if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { + if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); goto failed_mount_wq; } @@ -5408,11 +5546,6 @@ no_journal: return 0; -cantfind_ext4: - if (!silent) - ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); - goto failed_mount; - failed_mount9: ext4_release_orphan_info(sb); failed_mount8: @@ -5466,13 +5599,7 @@ failed_mount3: flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); -failed_mount2: - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); - for (i = 0; i < db_count; i++) - brelse(group_desc[i]); - kvfree(group_desc); - rcu_read_unlock(); + ext4_group_desc_free(sbi); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); @@ -5487,7 +5614,7 @@ failed_mount: #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ - brelse(bh); + brelse(sbi->s_sbh); ext4_blkdev_remove(sbi); out_fail: sb->s_fs_info = NULL; @@ -5529,8 +5656,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Quota mode: %s.", descr, ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. 
" + "Quota mode: %s.", &sb->s_uuid, descr, + ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); @@ -5597,7 +5725,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { + if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; @@ -6485,8 +6613,8 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.", + &sb->s_uuid, ext4_quota_mode(sb)); return 0; } @@ -6653,7 +6781,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2); + handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -6760,6 +6888,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return err; } +static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) +{ + switch (type) { + case USRQUOTA: + return qf_inum == EXT4_USR_QUOTA_INO; + case GRPQUOTA: + return qf_inum == EXT4_GRP_QUOTA_INO; + case PRJQUOTA: + return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; + default: + BUG(); + } +} + static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { @@ -6776,9 +6918,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM; + if (!ext4_check_quota_inum(type, qf_inums[type])) { + ext4_error(sb, "Bad quota inum: %lu, type: %d", + qf_inums[type], type); + return -EUCLEAN; + } + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { - ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + ext4_error(sb, "Bad quota inode: %lu, type: %d", + qf_inums[type], type); return PTR_ERR(qf_inode); } @@ -6817,8 +6966,9 @@ int ext4_enable_quotas(struct super_block *sb) if (err) { ext4_warning(sb, "Failed to enable quota tracking " - "(type=%d, err=%d). Please run " - "e2fsck to fix.", type, err); + "(type=%d, err=%d, ino=%lu). " + "Please run e2fsck to fix.", type, + err, qf_inums[type]); for (type--; type >= 0; type--) { struct inode *inode; @@ -6905,8 +7055,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? 
- sb->s_blocksize - offset : toread; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index b051d19b5c8a..30e3b65798b5 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -79,7 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *fsdata; + void *fsdata = NULL; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); @@ -298,16 +298,14 @@ static int ext4_get_verity_descriptor_location(struct inode *inode, last_extent = path[path->p_depth].p_ext; if (!last_extent) { EXT4_ERROR_INODE(inode, "verity file has no extents"); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return -EFSCORRUPTED; } end_lblk = le32_to_cpu(last_extent->ee_block) + ext4_ext_get_actual_len(last_extent); desc_size_pos = (u64)end_lblk << inode->i_blkbits; - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (desc_size_pos < sizeof(desc_size_disk)) goto bad; @@ -365,13 +363,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 533216e80fa2..7decaaf27e82 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1281,7 +1281,7 @@ retry_ref: ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { - ce->e_reusable = 1; + set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } @@ -1441,6 +1441,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, + "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } @@ -1540,7 +1543,8 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { - ext4_xattr_inode_dec_ref(handle, ea_inode); + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return err; } @@ -2042,7 +2046,7 @@ inserted: } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) - ce->e_reusable = 0; + clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); @@ -2070,19 +2074,11 @@ inserted: goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - ea_idebug(inode, "creating block %llu", (unsigned long long)block); @@ -2412,6 +2408,7 @@ retry_inode: if (!error) { 
ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); if (!value) no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -2554,7 +2551,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - buffer = kmalloc(value_size, GFP_NOFS); + buffer = kvmalloc(value_size, GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !buffer || !b_entry_name) { error = -ENOMEM; @@ -2606,7 +2603,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, error = 0; out: kfree(b_entry_name); - kfree(buffer); + kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index eaa240b21f07..c1c74aa658ae 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -219,7 +219,7 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; @@ -276,9 +276,11 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, return error; } -int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index a26e33cab4ff..ea2bbb3f264b 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); -extern int f2fs_set_acl(struct user_namespace *, struct inode *, +extern int f2fs_set_acl(struct user_namespace *, struct dentry *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8259e0fa97e1..56f7d0d6a8b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,12 +26,16 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) + if (!end_io) { f2fs_flush_merged_writes(sbi); + + f2fs_handle_stop(sbi, reason); + } } /* @@ -89,7 +93,7 @@ repeat: return ERR_PTR(err); } - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -122,7 +126,7 @@ retry: if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } @@ -140,7 +144,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int segno, offset; bool exist; - if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ) + if (type == DATA_GENERIC) return true; segno = GET_SEGNO(sbi, blkaddr); @@ -148,6 +152,13 @@ static bool 
__is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return exist; + } + if (!exist && type == DATA_GENERIC_ENHANCE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); @@ -160,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { + if (time_to_inject(sbi, FAULT_BLKADDR)) { + f2fs_show_injection_info(sbi, FAULT_BLKADDR); + return false; + } + switch (type) { case META_NAT: break; @@ -185,6 +201,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case DATA_GENERIC: case DATA_GENERIC_ENHANCE: case DATA_GENERIC_ENHANCE_READ: + case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) { f2fs_warn(sbi, "access invalid blkaddr:%u", @@ -276,7 +293,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, f2fs_put_page(page, err ? 1 : 0); if (!err) - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, + F2FS_BLKSIZE); } out: blk_finish_plug(&plug); @@ -448,8 +466,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); set_page_private_reference(&folio->page); return true; @@ -1053,7 +1070,8 @@ void f2fs_remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp) { struct list_head *head; struct inode *inode; @@ -1088,11 +1106,15 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; - F2FS_I(inode)->cp_task = current; + if (from_cp) + F2FS_I(inode)->cp_task = current; + F2FS_I(inode)->wb_task = current; filemap_fdatawrite(inode->i_mapping); - F2FS_I(inode)->cp_task = NULL; + F2FS_I(inode)->wb_task = NULL; + if (from_cp) + F2FS_I(inode)->cp_task = NULL; iput(inode); /* We need to give cpu to another writers. 
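The f2fs_sync_dirty_inodes() change above introduces a second task marker: wb_task is set for every writeback issued from this path, while cp_task is set only when the caller is the checkpoint code, letting reentrant paths tell the two apart. A userspace-flavoured sketch of the pattern (field names follow the hunk; the thread handle type and writeback callback are simplified stand-ins):

struct inode_ctx {
	const void *cp_task;	/* non-NULL only during checkpoint writeback */
	const void *wb_task;	/* non-NULL during any writeback from here */
};

static void sync_dirty_inode(struct inode_ctx *fi, const void *current_task,
			     int from_cp, void (*writeback)(void))
{
	if (from_cp)
		fi->cp_task = current_task;
	fi->wb_task = current_task;
	writeback();		/* filemap_fdatawrite() in the real code */
	fi->wb_task = NULL;
	if (from_cp)
		fi->cp_task = NULL;
}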
*/ @@ -1221,7 +1243,7 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); if (err) return err; cond_resched(); @@ -1880,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(cprc->f2fs_issue_ckpt)) { + int err = PTR_ERR(cprc->f2fs_issue_ckpt); + cprc->f2fs_issue_ckpt = NULL; - return -ENOMEM; + return err; } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); @@ -1892,15 +1916,27 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; + struct task_struct *ckpt_task; - if (cprc->f2fs_issue_ckpt) { - struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + if (!cprc->f2fs_issue_ckpt) + return; - cprc->f2fs_issue_ckpt = NULL; - kthread_stop(ckpt_task); + ckpt_task = cprc->f2fs_issue_ckpt; + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); - flush_remained_ckpt_reqs(sbi, NULL); - } + f2fs_flush_ckpt_thread(sbi); +} + +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + flush_remained_ckpt_reqs(sbi, NULL); + + /* Let's wait for the previous dispatched checkpoint. */ + while (atomic_read(&cprc->queued_ckpt)) + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 70e97075e535..2532f369cb10 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -346,7 +346,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; - params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen); + params = zstd_get_params(level, cc->rlen); workspace_size = zstd_cstream_workspace_bound(¶ms.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), @@ -567,10 +567,7 @@ MODULE_PARM_DESC(num_compress_pages, int f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); - if (!compress_page_pool) - return -ENOMEM; - - return 0; + return compress_page_pool ? 
0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) @@ -762,6 +759,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } @@ -912,17 +910,15 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) reason = "[C|*|C|*]"; goto out; } - if (compressed) { - if (!__is_valid_data_blkaddr(blkaddr)) { - if (!cluster_end) - cluster_end = i; - continue; - } - /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ - if (cluster_end) { - reason = "[C|N|N|V]"; - goto out; - } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; } } return false; @@ -952,6 +948,7 @@ static int __f2fs_cluster_blocks(struct inode *inode, if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); goto fail; } @@ -1568,12 +1565,8 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, if (!dic->cbuf) return -ENOMEM; - if (cops->init_decompress_ctx) { - int ret = cops->init_decompress_ctx(dic); - - if (ret) - return ret; - } + if (cops->init_decompress_ctx) + return cops->init_decompress_ctx(dic); return 0; } @@ -1715,50 +1708,27 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) } } -/* - * Update and unlock the cluster's pagecache pages, and release the reference to - * the decompress_io_ctx that was being held for I/O completion. - */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, - bool in_task) +static void f2fs_verify_cluster(struct work_struct *work) { + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); int i; + /* Verify, update, and unlock the decompressed pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; if (!rpage) continue; - /* PG_error was set if verity failed. */ - if (failed || PageError(rpage)) { - ClearPageUptodate(rpage); - /* will re-read again later */ - ClearPageError(rpage); - } else { + if (fsverity_verify_page(rpage)) SetPageUptodate(rpage); - } + else + ClearPageUptodate(rpage); unlock_page(rpage); } - f2fs_put_dic(dic, in_task); -} - -static void f2fs_verify_cluster(struct work_struct *work) -{ - struct decompress_io_ctx *dic = - container_of(work, struct decompress_io_ctx, verity_work); - int i; - - /* Verify the cluster's decompressed pages with fs-verity. */ - for (i = 0; i < dic->cluster_size; i++) { - struct page *rpage = dic->rpages[i]; - - if (rpage && !fsverity_verify_page(rpage)) - SetPageError(rpage); - } - - __f2fs_decompress_end_io(dic, false, true); + f2fs_put_dic(dic, true); } /* @@ -1768,6 +1738,8 @@ static void f2fs_verify_cluster(struct work_struct *work) void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task) { + int i; + if (!failed && dic->need_verity) { /* * Note that to avoid deadlocks, the verity work can't be done @@ -1777,9 +1749,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, */ INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); - } else { - __f2fs_decompress_end_io(dic, failed, in_task); + return; } + + /* Update and unlock the cluster's pagecache pages. 
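[Note] The reworked f2fs_verify_cluster() above folds verification and page release into one pass: each decompressed page is either marked up-to-date (verity passed) or left not-up-to-date so a later read retries it, and is unlocked either way; the old PG_error relay is gone. A userspace model of that single pass, with verify_page() standing in for fsverity_verify_page():

    #include <stdbool.h>
    #include <stdio.h>

    struct page { bool uptodate; bool locked; };

    static bool verify_page(struct page *p) { (void)p; return true; /* stand-in */ }

    static void verify_cluster(struct page **rpages, int cluster_size)
    {
        for (int i = 0; i < cluster_size; i++) {
            struct page *rpage = rpages[i];

            if (!rpage)
                continue;              /* hole in the cluster */

            /* Pass: page becomes readable; fail: it will be re-read later. */
            rpage->uptodate = verify_page(rpage);
            rpage->locked = false;     /* unlock_page() */
        }
    }

    int main(void)
    {
        struct page p0 = { .locked = true }, p1 = { .locked = true };
        struct page *rpages[3] = { &p0, NULL, &p1 };

        verify_cluster(rpages, 3);
        printf("%d %d\n", p0.uptodate, p1.uptodate);
        return 0;
    }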
*/ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + if (failed) + ClearPageUptodate(rpage); + else + SetPageUptodate(rpage); + unlock_page(rpage); + } + + /* + * Release the reference to the decompress_io_ctx that was being held + * for I/O completion. + */ + f2fs_put_dic(dic, in_task); } /* @@ -1905,7 +1896,7 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->compress_inode->i_mapping; + struct address_space *mapping = COMPRESS_MAPPING(sbi); struct folio_batch fbatch; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); @@ -1987,9 +1978,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); - if (!sbi->page_array_slab) - return -ENOMEM; - return 0; + return sbi->page_array_slab ? 0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) @@ -1997,53 +1986,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) kmem_cache_destroy(sbi->page_array_slab); } -static int __init f2fs_init_cic_cache(void) +int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; - return 0; -} - -static void f2fs_destroy_cic_cache(void) -{ - kmem_cache_destroy(cic_entry_slab); -} - -static int __init f2fs_init_dic_cache(void) -{ dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) - return -ENOMEM; - return 0; -} - -static void f2fs_destroy_dic_cache(void) -{ - kmem_cache_destroy(dic_entry_slab); -} - -int __init f2fs_init_compress_cache(void) -{ - int err; - - err = f2fs_init_cic_cache(); - if (err) - goto out; - err = f2fs_init_dic_cache(); - if (err) goto free_cic; return 0; free_cic: - f2fs_destroy_cic_cache(); -out: + kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { - f2fs_destroy_dic_cache(); - f2fs_destroy_cic_cache(); + kmem_cache_destroy(dic_entry_slab); + kmem_cache_destroy(cic_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa3ccddfa037..6e43e19c7d1c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -39,10 +39,8 @@ static struct bio_set f2fs_bioset; int __init f2fs_init_bioset(void) { - if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, - 0, BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; + return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) @@ -116,43 +114,56 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + /* + * decompression_attempted keeps track of whether + * f2fs_end_read_compressed_page() has been called on the pages in the + * bio that belong to a compressed cluster yet. + */ + bool decompression_attempted; block_t fs_blkaddr; }; +/* + * Update and unlock a bio's pages, and free the bio. + * + * This marks pages up-to-date only if there was no error in the bio (I/O error, + * decryption error, or verity error), as indicated by bio->bi_status. + * + * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) + * aren't marked up-to-date here, as decompression is done on a per-compression- + * cluster basis rather than a per-bio basis. 
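[Note] The merged f2fs_init_compress_cache() above collapses two tiny constructors into the standard create-or-unwind shape: if the second allocation fails, the first is freed before returning, and teardown runs in reverse creation order. The same idiom in a self-contained sketch (malloc/free stand in for kmem_cache_create/kmem_cache_destroy):

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>

    static void *cic_slab, *dic_slab;

    static int init_compress_caches(void)
    {
        cic_slab = malloc(64);        /* kmem_cache_create() stand-in */
        if (!cic_slab)
            return -ENOMEM;

        dic_slab = malloc(64);
        if (!dic_slab)
            goto free_cic;            /* unwind the earlier allocation */
        return 0;

    free_cic:
        free(cic_slab);
        cic_slab = NULL;
        return -ENOMEM;
    }

    static void destroy_compress_caches(void)
    {
        /* Tear down in reverse creation order. */
        free(dic_slab);
        free(cic_slab);
    }

    int main(void)
    {
        int err = init_compress_caches();

        if (!err)
            destroy_compress_caches();
        return err ? 1 : 0;
    }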
Instead, we only must do two + * things for each compressed page here: call f2fs_end_read_compressed_page() + * with failed=true if an error occurred before it would have normally gotten + * called (i.e., I/O error or decryption error, but *not* verity error), and + * release the bio's reference to the decompress_io_ctx of the page's cluster. + */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; + struct bio_post_read_ctx *ctx = bio->bi_private; - /* - * Update and unlock the bio's pagecache pages, and put the - * decompression context for any compressed pages. - */ bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) { - if (bio->bi_status) + if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(page, true, 0, in_task); f2fs_put_page_dic(page, in_task); continue; } - /* PG_error was set if decryption or verity failed. */ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); + if (ctx) + mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } @@ -185,8 +196,10 @@ static void f2fs_verify_bio(struct work_struct *work) struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && - !PageError(page) && !fsverity_verify_page(page)) - SetPageError(page); + !fsverity_verify_page(page)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } } else { fsverity_verify_bio(bio); @@ -236,16 +249,17 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; - /* PG_error was set if decryption failed. 
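[Note] The comment above pins down the completion contract: a compressed page is never marked up-to-date by bio completion itself, because readability is decided per compression cluster, not per bio. Completion only (a) reports an early failure to the decompression layer when decompression never got to run, tracked by the new decompression_attempted flag, and (b) drops the bio's reference on the cluster context. A schematic of that loop with simplified types (field names follow the comment; this is a model, not the kernel structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct page { bool compressed; bool uptodate; bool locked; };

    struct read_ctx { bool decompression_attempted; };

    static void end_read_compressed_page(struct page *p, bool failed)
    {
        (void)p; (void)failed;   /* hand the page back to the decompressor */
    }

    static void put_page_dic(struct page *p) { (void)p; /* drop cluster ref */ }

    static void finish_read_bio(struct page **pages, int n,
                                struct read_ctx *ctx, bool io_error)
    {
        for (int i = 0; i < n; i++) {
            struct page *page = pages[i];

            if (page->compressed) {
                /* Flag failure only if decompression never got a chance. */
                if (ctx && !ctx->decompression_attempted)
                    end_read_compressed_page(page, true);
                put_page_dic(page);
                continue;
            }
            page->uptodate = !io_error;
            page->locked = false;
        }
    }

    int main(void)
    {
        struct page data = { 0 }, comp = { .compressed = true };
        struct page *pages[] = { &data, &comp };
        struct read_ctx ctx = { .decompression_attempted = false };

        finish_read_bio(pages, 2, &ctx, false);
        printf("data uptodate: %d\n", data.uptodate);
        return 0;
    }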
*/ if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, PageError(page), - blkaddr, in_task); + f2fs_end_read_compressed_page(page, false, blkaddr, + in_task); else all_compressed = false; blkaddr++; } + ctx->decompression_attempted = true; + /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully @@ -259,14 +273,17 @@ static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - if (ctx->enabled_steps & STEP_DECRYPT) - fscrypt_decrypt_bio(ctx->bio); + if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { + f2fs_finish_read_bio(bio, true); + return; + } if (ctx->enabled_steps & STEP_DECOMPRESS) f2fs_handle_step_decompress(ctx, true); - f2fs_verify_and_finish_bio(ctx->bio, true); + f2fs_verify_and_finish_bio(bio, true); } static void f2fs_read_end_io(struct bio *bio) @@ -333,7 +350,8 @@ static void f2fs_write_end_io(struct bio *bio) mempool_free(page, sbi->write_io_dummy); if (unlikely(bio->bi_status)) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); continue; } @@ -349,7 +367,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && @@ -703,8 +722,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? - META_GENERIC : DATA_GENERIC_ENHANCE))) + META_GENERIC : DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -723,7 +744,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page): WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page)); __submit_bio(fio->sbi, bio, fio->type); return 0; @@ -904,8 +925,10 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, - __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + __is_meta_io(fio) ? 
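[Note] The reworked post-read path above stages strictly: if decryption fails the bio is finished immediately and nothing later runs; decompression runs next and records that it was attempted; verity runs last, and is skipped entirely when every page in the bio was compressed, since the decompressor then verifies per cluster. A linear sketch of that staging (the helpers are stand-ins):

    #include <stdbool.h>
    #include <stdio.h>

    enum { STEP_DECRYPT = 1, STEP_DECOMPRESS = 2, STEP_VERITY = 4 };

    struct rctx { unsigned int steps; bool decompression_attempted; };

    static bool decrypt_ok(void) { return true; }            /* stand-in */
    static bool all_pages_compressed(void) { return false; } /* stand-in */

    static void post_read_work(struct rctx *ctx)
    {
        if ((ctx->steps & STEP_DECRYPT) && !decrypt_ok()) {
            puts("finish bio with error");   /* no later stage runs */
            return;
        }

        if (ctx->steps & STEP_DECOMPRESS) {
            puts("hand compressed pages to decompressor");
            ctx->decompression_attempted = true;
        }

        /* Per-bio verity only covers pages the decompressor won't verify. */
        if ((ctx->steps & STEP_VERITY) && !all_pages_compressed())
            puts("enqueue verity work");
        else
            puts("finish bio");
    }

    int main(void)
    {
        struct rctx ctx = { .steps = STEP_DECRYPT | STEP_VERITY };

        post_read_work(&ctx);
        return 0;
    }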
META_GENERIC : DATA_GENERIC)) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -1054,6 +1077,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; + ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); @@ -1081,9 +1105,8 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, bio_put(bio); return -EFAULT; } - ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); return 0; } @@ -1120,7 +1143,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; f2fs_set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); + f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ @@ -1189,7 +1212,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) struct extent_info ei = {0, }; struct inode *inode = dn->inode; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn->data_blkaddr = ei.blk + index - ei.fofs; return 0; } @@ -1198,7 +1221,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write) + blk_opf_t op_flags, bool for_write, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -1210,11 +1234,13 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; @@ -1222,12 +1248,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; + } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && @@ -1235,6 +1266,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } got_it: @@ -1269,7 +1302,8 @@ put_err: return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1279,7 +1313,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = f2fs_get_read_data_page(inode, index, 0, false); + page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; @@ 
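[Note] f2fs_get_read_data_page() now reports, through the new next_pgofs argument, the first file offset worth probing after a hole (from f2fs_get_next_page_offset() on a missing dnode, or index + 1 for a NULL_ADDR block), so directory walkers can jump over holes instead of testing every index. The caller-side shape, with the lookup mocked; the point is the jump on -ENOENT:

    #include <stdio.h>
    #include <errno.h>

    /* Mocked lookup: even offsets exist, odd ones start a 4-block hole. */
    static int find_data_page(unsigned long idx, unsigned long *next)
    {
        if (idx % 2 == 0)
            return 0;
        *next = idx + 4;        /* first offset after the hole */
        return -ENOENT;
    }

    int main(void)
    {
        unsigned long bidx = 0, nblocks = 16, next;

        while (bidx < nblocks) {
            int err = find_data_page(bidx, &next);

            if (err == -ENOENT) {   /* hole: skip it wholesale */
                bidx = next;
                continue;
            }
            printf("visit block %lu\n", bidx);
            bidx++;
        }
        return 0;
    }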
-1305,7 +1339,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = f2fs_get_read_data_page(inode, index, 0, for_write); + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; @@ -1468,7 +1502,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1548,6 +1582,7 @@ next_block: if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } @@ -1593,6 +1628,8 @@ next_block: (flag != F2FS_GET_BLOCK_FIEMAP || IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); goto sync_out; } if (flag == F2FS_GET_BLOCK_BMAP) { @@ -1675,7 +1712,7 @@ skip: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1720,7 +1757,7 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1816,7 +1853,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); - if (err || err == 1) + if (err) return err; } @@ -2074,6 +2111,8 @@ got_it: if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto out; } } else { @@ -2122,8 +2161,8 @@ submit_and_realloc: goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); - f2fs_update_iostat(F2FS_I_SB(inode), FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); *last_block_in_bio = block_nr; goto out; out: @@ -2178,7 +2217,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; - if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) @@ -2270,9 +2309,7 @@ submit_and_realloc: refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = blkaddr; } @@ -2289,7 +2326,6 @@ out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); - ClearPageError(cc->rpages[i]); unlock_page(cc->rpages[i]); } } @@ -2386,7 +2422,6 @@ read_single_page: #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif - SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } @@ -2543,7 +2578,7 @@ bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return true; 
/* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); @@ -2613,12 +2648,15 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_extent_cache(inode, page->index, &ei)) { + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, - DATA_GENERIC_ENHANCE)) + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(fio->sbi, + ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } ipu_force = true; fio->need_lock = LOCK_DONE; @@ -2646,6 +2684,7 @@ got_it: !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } @@ -2856,7 +2895,7 @@ out: } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task && allow_balance) + !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -3156,7 +3195,7 @@ static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { /* to avoid deadlock in path of data flush */ - if (F2FS_I(inode)->cp_task) + if (F2FS_I(inode)->wb_task) return false; if (!S_ISREG(inode->i_mode)) @@ -3333,7 +3372,7 @@ restart: } else if (locked) { err = f2fs_get_block(&dn, index); } else { - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3374,7 +3413,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3438,6 +3477,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, else if (*blk_addr != NULL_ADDR) return 0; + if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) + goto reserve_block; + /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) @@ -3559,6 +3601,7 @@ repeat: if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); @@ -3697,8 +3740,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { f2fs_update_dirty_folio(inode, folio); return true; } @@ -3970,6 +4012,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret < 0) return ret; + stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; @@ -3979,6 +4022,7 @@ static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); + stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else @@ -4057,9 +4101,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) sbi->post_read_wq = 
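[Note] Every f2fs_lookup_read_extent_cache() hit in the hunks above resolves a block address with the same arithmetic: an extent covering file blocks [fofs, fofs + len) maps linearly onto disk blocks [blk, blk + len), so a file index translates to blk + (index - fofs). A tiny worked example:

    #include <stdio.h>
    #include <assert.h>

    struct extent_info {
        unsigned int fofs;   /* first file block covered */
        unsigned int len;    /* number of blocks */
        unsigned int blk;    /* first on-disk block */
    };

    static unsigned int extent_to_blkaddr(const struct extent_info *ei,
                                          unsigned int index)
    {
        assert(index >= ei->fofs && index < ei->fofs + ei->len);
        return ei->blk + (index - ei->fofs);
    }

    int main(void)
    {
        /* File blocks 100..163 live contiguously at disk blocks 5000.. */
        struct extent_info ei = { .fofs = 100, .len = 64, .blk = 5000 };

        printf("%u\n", extent_to_blkaddr(&ei, 130));   /* prints 5030 */
        return 0;
    }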
alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); - if (!sbi->post_read_wq) - return -ENOMEM; - return 0; + return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) @@ -4072,9 +4114,7 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - if (!bio_entry_slab) - return -ENOMEM; - return 0; + return bio_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c01471573977..32af4f0c5735 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -72,15 +72,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); - /* validation check of the segment numbers */ + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ si->hit_largest = atomic64_read(&sbi->read_hit_largest); - si->hit_cached = atomic64_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); - si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = atomic_read(&sbi->total_ext_tree); - si->zombie_tree = atomic_read(&sbi->total_zombie_tree); - si->ext_node = atomic_read(&sbi->total_ext_node); + si->hit_total[EX_READ] += si->hit_largest; + + /* block age extent_cache only */ + si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks); + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); @@ -91,7 +102,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->aw_cnt = sbi->atomic_files; + si->aw_cnt = atomic_read(&sbi->atomic_files); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); @@ -135,6 +146,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; @@ -293,25 +305,32 @@ get_cache: sizeof(struct nat_entry_set); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += atomic_read(&sbi->total_ext_tree) * + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree); 
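[Note] With the extent cache split in two, the debug code above indexes every counter by extent type instead of keeping one flat set; only the "largest" hit counter stays read-cache-specific and is folded into the EX_READ total. A reduced model of that aggregation:

    #include <stdio.h>

    enum extent_type { EX_READ, EX_BLOCK_AGE, NR_EXTENT_CACHES };

    struct stats {
        unsigned long long hit_cached[NR_EXTENT_CACHES];
        unsigned long long hit_rbtree[NR_EXTENT_CACHES];
        unsigned long long hit_largest;          /* EX_READ only */
        unsigned long long hit_total[NR_EXTENT_CACHES];
    };

    static void fold_hits(struct stats *s)
    {
        for (int i = 0; i < NR_EXTENT_CACHES; i++)
            s->hit_total[i] = s->hit_cached[i] + s->hit_rbtree[i];

        /* The largest-extent fast path exists only in the read cache. */
        s->hit_total[EX_READ] += s->hit_largest;
    }

    int main(void)
    {
        struct stats s = { .hit_cached = {3, 1}, .hit_rbtree = {2, 4},
                           .hit_largest = 5 };

        fold_hits(&s);
        printf("read=%llu age=%llu\n",
               s.hit_total[EX_READ], s.hit_total[EX_BLOCK_AGE]);
        return 0;
    }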
- si->cache_mem += atomic_read(&sbi->total_ext_node) * + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } si->page_mem = 0; if (sbi->node_inode) { - unsigned npages = NODE_MAPPING(sbi)->nrpages; + unsigned long npages = NODE_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { - unsigned npages = META_MAPPING(sbi)->nrpages; + unsigned long npages = META_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (sbi->compress_inode) { - unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #endif @@ -347,7 +366,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_readonly(si->sbi->sb) ? "RO" : "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); if (si->sbi->s_flag) { @@ -385,6 +404,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -457,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); - seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " - "Cur time: %4d(ms), Peak time: %4d(ms))\n", - si->nr_queued_ckpt, si->nr_issued_ckpt, - si->nr_total_ckpt, si->cur_ckpt_time, - si->peak_ckpt_time); + seq_puts(s, "CP merge:\n"); + seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt); + seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt); + seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); + seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); + seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); - seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " - "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Mid (%d), " - "Urgent Low (%d)\n", - si->sbi->gc_reclaimed_segs[GC_NORMAL], - si->sbi->gc_reclaimed_segs[GC_IDLE_CB], - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], - si->sbi->gc_reclaimed_segs[GC_IDLE_AT], - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], - si->sbi->gc_reclaimed_segs[GC_URGENT_MID], - si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_puts(s, " - Reclaimed segs :\n"); + seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Idle Greedy : %d\n", + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + seq_printf(s, " - Urgent High : %d\n", + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " 
- Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -487,26 +508,44 @@ static int stat_show(struct seq_file *s, void *v) si->bg_node_blks); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); - seq_puts(s, "\nExtent Cache:\n"); + seq_puts(s, "\nExtent Cache (Read):\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", - si->hit_largest, si->hit_cached, - si->hit_rbtree); + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_READ] ? 0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); + seq_puts(s, "\nExtent Cache (Block Age):\n"); + seq_printf(s, " - Allocated Data Blocks: %llu\n", + si->allocated_data_blocks); + seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n", + si->hit_cached[EX_BLOCK_AGE], + si->hit_rbtree[EX_BLOCK_AGE]); seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", - !si->total_ext ? 0 : - div64_u64(si->hit_total * 100, si->total_ext), - si->hit_total, si->total_ext); + !si->total_ext[EX_BLOCK_AGE] ? 0 : + div64_u64(si->hit_total[EX_BLOCK_AGE] * 100, + si->total_ext[EX_BLOCK_AGE]), + si->hit_total[EX_BLOCK_AGE], + si->total_ext[EX_BLOCK_AGE]); seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", - si->ext_tree, si->zombie_tree, si->ext_node); + si->ext_tree[EX_BLOCK_AGE], + si->zombie_tree[EX_BLOCK_AGE], + si->ext_node[EX_BLOCK_AGE]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); - seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " - "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->flush_list_empty, + si->flush_list_empty); + seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - atomic IO: %4d (Max. 
%4d)\n", @@ -563,8 +602,12 @@ static int stat_show(struct seq_file *s, void *v) (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %llu KB\n", + seq_printf(s, " - cached all: %llu KB\n", si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - block age extent cache: %llu KB\n", + si->ext_mem[EX_BLOCK_AGE] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } @@ -597,16 +640,23 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic64_set(&sbi->total_hit_ext, 0); - atomic64_set(&sbi->read_hit_rbtree, 0); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ atomic64_set(&sbi->read_hit_largest, 0); - atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d5bd7932fb64..8e025157f35c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; bool room = false; int max_slots; @@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; - for (; bidx < end_block; bidx++) { + while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = f2fs_find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; + bidx = next_pgofs; continue; } else { *res_page = dentry_page; @@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); + + bidx++; } if (!de && room && F2FS_I(dir)->chash != fname->hash) { @@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_dir(struct inode *dir) { - unsigned long bidx; + unsigned long bidx = 0; struct page *dentry_page; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; @@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir) if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); - for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = f2fs_get_lock_data_page(dir, bidx, false); + while (bidx < nblock) { + pgoff_t next_pgofs; + + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) + if (PTR_ERR(dentry_page) == -ENOENT) { + bidx = next_pgofs; continue; - else + } else { return false; + } } dentry_blk = page_address(dentry_page); @@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir) NR_DENTRY_IN_BLOCK, bit_pos); - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; + + bidx++; } 
return true; } @@ -1000,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; - bool readdir_ra = sbi->readdir_ra == 1; + bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; @@ -1041,6 +1051,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } @@ -1103,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -1117,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_find_data_page(inode, n); + dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { err = 0; + n = next_pgofs; continue; } else { goto out_free; @@ -1140,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } f2fs_put_page(dentry_page, 0); + + n++; } out_free: fscrypt_fname_free_buffer(&fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 866e72b29bd5..1bd38a78ebba 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -6,6 +6,10 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim <jaegeuk@kernel.org> * Chao Yu <chao2.yu@samsung.com> + * + * block_age-based extent cache added by: + * Copyright (c) 2022 xiaomi Co., Ltd. 
+ * http://www.xiaomi.com/ */ #include <linux/fs.h> @@ -15,6 +19,123 @@ #include "node.h" #include <trace/events/f2fs.h> +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen, + unsigned long age, unsigned long last_blocks, + enum extent_type type) +{ + ei->fofs = fofs; + ei->len = len; + + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif + } else if (type == EX_BLOCK_AGE) { + ei->age = age; + ei->last_blocks = last_blocks; + } +} + +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); +} + +static bool __may_age_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return false; + /* don't cache block age info for cold file */ + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) +{ + if (type == EX_READ) + return __may_read_extent_tree(inode); + else if (type == EX_BLOCK_AGE) + return __may_age_extent_tree(inode); + return false; +} + +static bool __may_extent_tree(struct inode *inode, enum extent_type type) +{ + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&F2FS_I_SB(inode)->s_list)) + return false; + + return __init_may_extent_tree(inode, type); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (et->type != EX_READ) + return; + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front, enum extent_type type) +{ + if (type == EX_READ) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } else if (type == EX_BLOCK_AGE) { + return (back->fofs + back->len == front->fofs && + abs(back->age - front->age) <= SAME_AGE_REGION && + abs(back->last_blocks - front->last_blocks) <= + SAME_AGE_REGION); + } + return false; +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back, enum extent_type type) +{ + return __is_extent_mergeable(back, cur, type); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front, enum extent_type type) +{ + return __is_extent_mergeable(cur, front, type); +} + static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, unsigned int ofs) { @@ -237,6 +358,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct rb_node *parent, struct rb_node **p, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en; en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); @@ -250,16 +372,18 @@ static struct extent_node 
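[Note] The __is_extent_mergeable() predicate introduced above applies a different rule per cache: read extents must be contiguous in both file offset and block address (and, under CONFIG_F2FS_FS_COMPRESSION, not partially compressed), while block-age extents only need adjacent file offsets plus ages and allocation snapshots within SAME_AGE_REGION of each other. The two rules in a standalone form (the SAME_AGE_REGION value here is illustrative, and the signed age fields are a simplification):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define SAME_AGE_REGION 1024   /* illustrative threshold */

    struct extent {
        unsigned int fofs, len, blk;
        long long age, last_blocks;
    };

    static bool read_mergeable(const struct extent *b, const struct extent *f)
    {
        return b->fofs + b->len == f->fofs &&   /* adjacent in the file */
               b->blk + b->len == f->blk;       /* adjacent on disk */
    }

    static bool age_mergeable(const struct extent *b, const struct extent *f)
    {
        return b->fofs + b->len == f->fofs &&
               llabs(b->age - f->age) <= SAME_AGE_REGION &&
               llabs(b->last_blocks - f->last_blocks) <= SAME_AGE_REGION;
    }

    int main(void)
    {
        struct extent a = { .fofs = 0, .len = 8, .blk = 100, .age = 10 };
        struct extent b = { .fofs = 8, .len = 4, .blk = 108, .age = 12 };

        printf("read: %d, age: %d\n", read_mergeable(&a, &b),
               age_mergeable(&a, &b));
        return 0;
    }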
*__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); - atomic_inc(&sbi->total_ext_node); + atomic_inc(&eti->total_ext_node); return en; } static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); - atomic_dec(&sbi->total_ext_node); + atomic_dec(&eti->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; @@ -275,61 +399,51 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, static void __release_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - spin_lock(&sbi->extent_lock); + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); f2fs_bug_on(sbi, list_empty(&en->list)); list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); } -static struct extent_tree *__grab_extent_tree(struct inode *inode) +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et; nid_t ino = inode->i_ino; - mutex_lock(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS, true, NULL); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; + et->type = type; et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); atomic_set(&et->node_cnt, 0); - atomic_inc(&sbi->total_ext_tree); + atomic_inc(&eti->total_ext_tree); } else { - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_zombie_tree); list_del_init(&et->list); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); /* never died until evict_inode */ - F2FS_I(inode)->extent_tree = et; + F2FS_I(inode)->extent_tree[type] = et; return et; } -static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei) -{ - struct rb_node **p = &et->root.rb_root.rb_node; - struct extent_node *en; - - en = __attach_extent_node(sbi, et, ei, NULL, p, true); - if (!en) - return NULL; - - et->largest = en->ei; - et->cached_en = en; - return en; -} - static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et) { @@ -358,70 +472,88 @@ static void __drop_largest_extent(struct extent_tree *et, } } -/* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_extent *i_ext = ipage ? 
&F2FS_INODE(ipage)->i_ext : NULL; + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) { - /* drop largest extent */ + if (!__may_extent_tree(inode, EX_READ)) { + /* drop largest read extent */ if (i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); - return; } - return; + goto out; } - et = __grab_extent_tree(inode); + et = __grab_extent_tree(inode, EX_READ); if (!i_ext || !i_ext->len) - return; + goto out; - get_extent_info(&ei, i_ext); + get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) - goto out; + goto unlock_out; - en = __init_extent_tree(sbi, et, &ei); + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); if (en) { - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); + et->largest = en->ei; + et->cached_en = en; + + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); } -out: +unlock_out: write_unlock(&et->lock); +out: + if (!F2FS_I(inode)->extent_tree[EX_READ]) + set_inode_flag(inode, FI_NO_EXTENT); } -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_age_extent_tree(struct inode *inode) { - __f2fs_init_extent_tree(inode, ipage); + if (!__init_may_extent_tree(inode, EX_BLOCK_AGE)) + return; + __grab_extent_tree(inode, EX_BLOCK_AGE); +} - if (!F2FS_I(inode)->extent_tree) - set_inode_flag(inode, FI_NO_EXTENT); +void f2fs_init_extent_tree(struct inode *inode) +{ + /* initialize read cache */ + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); + + /* initialize block age cache */ + if (__init_may_extent_tree(inode, EX_BLOCK_AGE)) + __grab_extent_tree(inode, EX_BLOCK_AGE); } -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en; bool ret = false; f2fs_bug_on(sbi, !et); - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); read_lock(&et->lock); - if (et->largest.fofs <= pgofs && + if (type == EX_READ && + et->largest.fofs <= pgofs && et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; @@ -435,23 +567,26 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; if (en == et->cached_en) - stat_inc_cached_node_hit(sbi); + stat_inc_cached_node_hit(sbi, type); else - stat_inc_rbtree_node_hit(sbi); + stat_inc_rbtree_node_hit(sbi, type); *ei = en->ei; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); ret = true; out: - stat_inc_total_hit(sbi); + stat_inc_total_hit(sbi, type); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + if (type == EX_READ) + 
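[Note] __lookup_extent_tree() above keeps the read cache's one-element fast path: if the single cached "largest" extent covers pgofs, the rbtree walk is skipped entirely (the block-age cache has no such shortcut). The check in isolation:

    #include <stdbool.h>
    #include <stdio.h>

    struct extent { unsigned int fofs, len, blk; };

    /* True, and *out filled, when the cached largest extent covers pgofs. */
    static bool largest_fast_path(const struct extent *largest,
                                  unsigned int pgofs, struct extent *out)
    {
        if (largest->len &&
            largest->fofs <= pgofs &&
            largest->fofs + largest->len > pgofs) {
            *out = *largest;
            return true;
        }
        return false;   /* fall back to the rbtree walk */
    }

    int main(void)
    {
        struct extent largest = { .fofs = 16, .len = 32, .blk = 900 }, hit;

        printf("%d\n", largest_fast_path(&largest, 20, &hit));  /* 1 */
        printf("%d\n", largest_fast_path(&largest, 64, &hit));  /* 0 */
        return 0;
    }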
trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + else if (type == EX_BLOCK_AGE) + trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei); return ret; } @@ -460,18 +595,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en = NULL; - if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } - if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { next_ex->ei.fofs = ei->fofs; - next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; if (en) __release_extent_node(sbi, et, prev_ex); @@ -483,12 +620,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } @@ -498,6 +635,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -520,47 +658,54 @@ do_insert: __try_update_largest_extent(et, en); /* update in global extent list */ - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); et->cached_en = en; - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } -static void f2fs_update_extent_tree_range(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int len) +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; - unsigned int pos = (unsigned int)fofs; bool updated = false; bool leftmost = false; if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); + else if (type == EX_BLOCK_AGE) + trace_f2fs_update_age_extent_tree_range(inode, fofs, len, + tei->age, tei->last_blocks); write_lock(&et->lock); - if (is_inode_flag_set(inode, FI_NO_EXTENT)) { - write_unlock(&et->lock); - return; - } + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } - prev = et->largest; - dei.len = 0; + prev = et->largest; + dei.len = 0; - /* - * drop largest extent before lookup, in case it's already - * been shrunk from extent tree - */ - __drop_largest_extent(et, fofs, len); + /* + * drop largest extent before lookup, in case it's 
already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); + } /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, @@ -581,26 +726,32 @@ static void f2fs_update_extent_tree_range(struct inode *inode, dei = en->ei; org_end = dei.fofs + dei.len; - f2fs_bug_on(sbi, pos >= org_end); + f2fs_bug_on(sbi, fofs >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - en->ei.len = pos - en->ei.fofs; + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; prev_en = en; parts = 1; } - if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { if (parts) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - org_end - end); + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false, + dei.age, dei.last_blocks, + type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true, + dei.age, dei.last_blocks, + type); next_en = en; } parts++; @@ -630,10 +781,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode, en = next_en; } - /* 3. update extent in extent cache */ - if (blkaddr) { + if (type == EX_BLOCK_AGE) + goto update_age_extent_cache; + + /* 3. update extent in read extent cache */ + BUG_ON(type != EX_READ); - set_extent_info(&ei, fofs, blkaddr, len); + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, + 0, 0, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -655,7 +811,17 @@ static void f2fs_update_extent_tree_range(struct inode *inode, et->largest_updated = false; updated = true; } + goto out_read_extent_cache; +update_age_extent_cache: + if (!tei->last_blocks) + goto out_read_extent_cache; + __set_extent_info(&ei, fofs, len, 0, false, + tei->age, tei->last_blocks, EX_BLOCK_AGE); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +out_read_extent_cache: write_unlock(&et->lock); if (updated) @@ -663,19 +829,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION -void f2fs_update_extent_tree_range_compressed(struct inode *inode, +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_node *en = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei; struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -692,7 +859,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - set_extent_info(&ei, fofs, blkaddr, 
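[Note] When an update lands strictly inside an existing extent, the code above trims the left remainder in place and re-inserts the right remainder with its block address shifted by the overlap; for the read cache each remainder must additionally be at least F2FS_MIN_EXTENT_LEN to be kept. The split arithmetic by itself (assumes the updated range is strictly interior):

    #include <stdio.h>

    struct extent { unsigned int fofs, len, blk; };

    /* Split 'e' around the updated range [fofs, fofs+len); middle is dropped. */
    static void split_extent(struct extent e, unsigned int fofs,
                             unsigned int len,
                             struct extent *left, struct extent *right)
    {
        unsigned int end = fofs + len, org_end = e.fofs + e.len;

        *left  = (struct extent){ e.fofs, fofs - e.fofs, e.blk };
        *right = (struct extent){ end, org_end - end,
                                  e.blk + (end - e.fofs) };
    }

    int main(void)
    {
        struct extent e = { .fofs = 10, .len = 20, .blk = 500 }, l, r;

        split_extent(e, 14, 4, &l, &r);   /* overwrite file blocks 14..17 */
        printf("left  [%u,+%u)@%u\n", l.fofs, l.len, l.blk);  /* [10,+4)@500 */
        printf("right [%u,+%u)@%u\n", r.fofs, r.len, r.blk);  /* [18,+12)@508 */
        return 0;
    }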
llen); + __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -703,24 +870,113 @@ unlock_out: } #endif -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +static unsigned long long __calculate_block_age(unsigned long long new, + unsigned long long old) +{ + unsigned long long diff; + + diff = (new >= old) ? new - (new - old) : new + (old - new); + + return div_u64(diff * LAST_AGE_WEIGHT, 100); +} + +/* This returns a new age and allocated blocks in ei */ +static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t f_size = i_size_read(inode); + unsigned long long cur_blocks = + atomic64_read(&sbi->allocated_data_blocks); + + /* + * When I/O is not aligned to a PAGE_SIZE, update will happen to the last + * file block even in seq write. So don't record age for newly last file + * block here. + */ + if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && + ei->blk == NEW_ADDR) + return -EINVAL; + + if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + unsigned long long cur_age; + + if (cur_blocks >= ei->last_blocks) + cur_age = cur_blocks - ei->last_blocks; + else + /* allocated_data_blocks overflow */ + cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + + if (ei->age) + ei->age = __calculate_block_age(cur_age, ei->age); + else + ei->age = cur_age; + ei->last_blocks = cur_blocks; + WARN_ON(ei->age > cur_blocks); + return 0; + } + + f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + + /* the data block was allocated for the first time */ + if (ei->blk == NEW_ADDR) + goto out; + + if (__is_valid_data_blkaddr(ei->blk) && + !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + f2fs_bug_on(sbi, 1); + return -EINVAL; + } +out: + /* + * init block age with zero, this can happen when the block age extent + * was reclaimed due to memory constraint or system reboot + */ + ei->age = 0; + ei->last_blocks = cur_blocks; + return 0; +} + +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { + struct extent_info ei; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } else if (type == EX_BLOCK_AGE) { + ei.blk = dn->data_blkaddr; + if (__get_new_block_age(dn->inode, &ei)) + return; + } + __update_extent_tree_range(dn->inode, &ei, type); +} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et, *next; struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, EXTENT_CACHE)) - return 0; - - if (!atomic_read(&sbi->total_zombie_tree)) + if (!atomic_read(&eti->total_zombie_tree)) goto free_node; - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; /* 1. 
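[Note] The "age" tracked by __get_new_block_age() above is measured in allocated data blocks, not wall time: each update computes how many blocks the filesystem has allocated since the extent's stored snapshot, tolerating wraparound of the global counter, and then blends that distance into the stored age via the LAST_AGE_WEIGHT percentage. The wraparound guard in isolation:

    #include <stdio.h>
    #include <limits.h>

    /* Blocks allocated since 'last', tolerating counter wraparound. */
    static unsigned long long blocks_since(unsigned long long cur,
                                           unsigned long long last)
    {
        if (cur >= last)
            return cur - last;
        /* the allocated-blocks counter wrapped past ULLONG_MAX */
        return ULLONG_MAX - last + cur;
    }

    int main(void)
    {
        printf("%llu\n", blocks_since(1000, 400));           /* 600 */
        printf("%llu\n", blocks_since(5, ULLONG_MAX - 5));   /* 10 */
        return 0;
    }

An extent reloaded after reclaim or reboot starts from age zero with a fresh snapshot, as the out: path above notes.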
remove unreferenced extent tree */ - list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et); @@ -728,61 +984,137 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); list_del_init(&et->list); - radix_tree_delete(&sbi->extent_tree_root, et->ino); + radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); tree_cnt++; if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; cond_resched(); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); for (; remained > 0; remained--) { - if (list_empty(&sbi->extent_list)) + if (list_empty(&eti->extent_list)) break; - en = list_first_entry(&sbi->extent_list, + en = list_first_entry(&eti->extent_list, struct extent_node, list); et = en->et; if (!write_trylock(&et->lock)) { /* refresh this extent node's position in extent list */ - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); continue; } list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); write_unlock(&et->lock); node_cnt++; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); unlock_out: - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); return node_cnt + tree_cnt; } -unsigned int f2fs_destroy_extent_node(struct inode *inode) +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +/* block age extent cache operations */ +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_BLOCK_AGE)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache_range(struct 
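
__shrink_extent_tree() above reclaims in two phases — zombie trees first, then cold entries from the per-type LRU — and it only ever trylocks, so the shrinker cannot stall behind a writer: a contended node is rotated to the list tail instead of blocking. The same pattern, modeled with pthreads in place of the kernel locks; struct lru, shrink() and detach_node() are illustrative, and the tree-side teardown is stubbed out.

    #include <pthread.h>
    #include <stddef.h>

    struct node {
        struct node *prev, *next;     /* LRU linkage */
        pthread_rwlock_t *tree_lock;  /* lock of the owning extent tree */
    };

    struct lru {
        pthread_mutex_t lock;  /* protects the list, like eti->extent_lock */
        struct node head;      /* circular; head.next is the coldest entry */
    };

    static void list_del(struct node *n)
    {
        n->prev->next = n->next;
        n->next->prev = n->prev;
    }

    static void list_add_tail(struct lru *l, struct node *n)
    {
        n->prev = l->head.prev;
        n->next = &l->head;
        l->head.prev->next = n;
        l->head.prev = n;
    }

    static void detach_node(struct node *n)  /* __detach_extent_node() stand-in */
    {
        (void)n;
    }

    /* Visit at most 'budget' cold entries; contended entries are rotated to
     * the tail and retried later instead of blocking the shrinker. */
    static int shrink(struct lru *l, int budget)
    {
        int freed = 0;

        pthread_mutex_lock(&l->lock);
        for (; budget > 0; budget--) {
            struct node *n = l->head.next;

            if (n == &l->head)
                break;                        /* list is empty */
            if (pthread_rwlock_trywrlock(n->tree_lock) != 0) {
                list_del(n);
                list_add_tail(l, n);          /* refresh LRU position */
                continue;
            }
            list_del(n);
            pthread_mutex_unlock(&l->lock);   /* never free under the list lock */
            detach_node(n);
            pthread_rwlock_unlock(n->tree_lock);
            freed++;
            pthread_mutex_lock(&l->lock);
        }
        pthread_mutex_unlock(&l->lock);
        return freed;
    }
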
dnode_of_data *dn, + pgoff_t fofs, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + }; + + if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE); +} + +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et || !atomic_read(&et->node_cnt)) @@ -795,32 +1127,46 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } -void f2fs_drop_extent_tree(struct inode *inode) +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); + __destroy_extent_node(inode, EX_BLOCK_AGE); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; - if (!f2fs_may_extent_tree(inode)) + if (!__may_extent_tree(inode, type)) return; - set_inode_flag(inode, FI_NO_EXTENT); - write_lock(&et->lock); __free_extent_tree(sbi, et); - if (et->largest.len) { - et->largest.len = 0; - updated = true; + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } } write_unlock(&et->lock); if (updated) f2fs_mark_inode_dirty_sync(inode, true); } -void f2fs_destroy_extent_tree(struct inode *inode) +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); + __drop_extent_tree(inode, EX_BLOCK_AGE); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et) @@ -828,76 +1174,56 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - mutex_lock(&sbi->extent_tree_lock); - list_add_tail(&et->list, &sbi->zombie_list); - atomic_inc(&sbi->total_zombie_tree); - mutex_unlock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); return; } /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); + node_cnt = __destroy_extent_node(inode, type); /* delete extent tree entry in radix tree */ - mutex_lock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - mutex_unlock(&sbi->extent_tree_lock); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); - F2FS_I(inode)->extent_tree = NULL; + F2FS_I(inode)->extent_tree[type] = NULL; - trace_f2fs_destroy_extent_tree(inode, 
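
__destroy_extent_tree() above cannot free a tree whose nodes are still live for a linked inode, so it parks the tree on the per-type zombie list for the shrinker's first phase to reap. A minimal single-threaded sketch of that defer-or-free decision; cache_info, destroy_tree() and the freeing helper are illustrative names.

    #include <stdbool.h>
    #include <stddef.h>

    struct tree {
        struct tree *next;  /* zombie list linkage */
        int node_cnt;       /* # of cached extent nodes */
    };

    struct cache_info {
        struct tree *zombies;  /* like eti->zombie_list */
        int total_zombies;     /* like eti->total_zombie_tree */
    };

    static void free_tree(struct tree *t)  /* radix-tree removal etc., elided */
    {
        (void)t;
    }

    static void destroy_tree(struct cache_info *ci, struct tree *t,
                             bool inode_linked)
    {
        if (inode_linked && t->node_cnt) {
            /* still referenced: defer to the shrinker's zombie pass */
            t->next = ci->zombies;
            ci->zombies = t;
            ci->total_zombies++;
            return;
        }
        free_tree(t);
    }
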
node_cnt); + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); } -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - if (!f2fs_may_extent_tree(inode)) - return false; - - return f2fs_lookup_extent_tree(inode, pgofs, ei); -} - -void f2fs_update_extent_cache(struct dnode_of_data *dn) +void f2fs_destroy_extent_tree(struct inode *inode) { - pgoff_t fofs; - block_t blkaddr; - - if (!f2fs_may_extent_tree(dn->inode)) - return; - - if (dn->data_blkaddr == NEW_ADDR) - blkaddr = NULL_ADDR; - else - blkaddr = dn->data_blkaddr; - - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); + __destroy_extent_tree(inode, EX_READ); + __destroy_extent_tree(inode, EX_BLOCK_AGE); } -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len) - +static void __init_extent_tree_info(struct extent_tree_info *eti) { - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); } void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - mutex_init(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - atomic_set(&sbi->total_ext_tree, 0); - INIT_LIST_HEAD(&sbi->zombie_list); - atomic_set(&sbi->total_zombie_tree, 0); - atomic_set(&sbi->total_ext_node, 0); + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); + __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]); + + /* initialize for block age extents */ + atomic64_set(&sbi->allocated_data_blocks, 0); + sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; + sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3c7cdb70fe2e..e8953c3dc81a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -60,6 +60,7 @@ enum { FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, + FAULT_BLKADDR, FAULT_MAX, }; @@ -91,7 +92,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_USRQUOTA 0x00080000 @@ -106,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -202,10 +204,6 @@ struct f2fs_mount_info { #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) -#define F2FS_SET_FEATURE(sbi, mask) \ - (sbi->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sbi, mask) \ - 
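
f2fs_init_extent_cache_info() above seeds the block-age defaults. The thresholds are counted in blocks, so with f2fs's fixed 4 KiB block size the defaults line up with the 1 GB / 10 GB split described in the f2fs.h comment further below — a quick standalone check:

    #include <assert.h>
    #include <stdio.h>

    #define F2FS_BLKSIZE                 4096ULL  /* f2fs block size */
    #define DEF_HOT_DATA_AGE_THRESHOLD   262144ULL
    #define DEF_WARM_DATA_AGE_THRESHOLD  2621440ULL

    int main(void)
    {
        /* the age thresholds are block counts, not bytes */
        assert(DEF_HOT_DATA_AGE_THRESHOLD * F2FS_BLKSIZE == 1ULL << 30);
        assert(DEF_WARM_DATA_AGE_THRESHOLD * F2FS_BLKSIZE == 10ULL << 30);
        printf("hot: < 1 GiB of allocations, warm: < 10 GiB\n");
        return 0;
    }
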
(sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* * Default values for user and/or group using reserved blocks @@ -266,6 +264,10 @@ enum { * condition of read on truncated area * by extent_cache */ + DATA_GENERIC_ENHANCE_UPDATE, /* + * strong check on range and segment + * bitmap for update case + */ META_GENERIC, }; @@ -274,7 +276,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ - TRANS_DIR_INO, /* for trasactions dir ino list */ + TRANS_DIR_INO, /* for transactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; @@ -324,8 +326,12 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* minimum discard granularity, unit: block count */ +#define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 +/* default maximum discard granularity of ordered discard, unit: block count */ +#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* max discard pend list number */ #define MAX_PLIST_NUM 512 @@ -404,7 +410,9 @@ struct discard_cmd_control { unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. interval between discard issue */ + unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ + unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ @@ -589,16 +597,35 @@ enum { /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ -#define EXTENT_CACHE_SHRINK_NUMBER 128 +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 -#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS -#define RECOVERY_MIN_RA_BLOCKS 1 +/* number of age extent info in extent cache we try to shrink */ +#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 +#define LAST_AGE_WEIGHT 30 +#define SAME_AGE_REGION 1024 -#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ +/* + * Define data block with age less than 1GB as hot data + * define data block with age less than 10GB but more than 1GB as warm data + */ +#define DEF_HOT_DATA_AGE_THRESHOLD 262144 +#define DEF_WARM_DATA_AGE_THRESHOLD 2621440 + +/* extent cache type */ +enum extent_type { + EX_READ, + EX_BLOCK_AGE, + NR_EXTENT_CACHES, +}; struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ @@ -614,10 +641,24 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - u32 blk; /* start block address of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int c_len; /* physical extent length of compressed blocks */ + /* physical extent length of compressed blocks */ + unsigned int c_len; #endif + }; 
+ /* block age extent_cache */ + struct { + /* block age of the extent */ + unsigned long long age; + /* last total blocks allocated */ + unsigned long long last_blocks; + }; + }; }; struct extent_node { @@ -629,13 +670,25 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ - struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ }; /* @@ -760,6 +813,8 @@ enum { FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_MAX, /* max flag, never be used */ }; @@ -782,6 +837,7 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ struct task_struct *cp_task; /* separate cp/wb IO stats*/ + struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ spinlock_t i_size_lock; /* protect last_disk_size */ @@ -795,7 +851,8 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct task_struct *atomic_write_task; /* store atomic write task */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ @@ -817,9 +874,10 @@ struct f2fs_inode_info { unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ }; -static inline void get_extent_info(struct extent_info *ext, +static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); @@ -827,7 +885,7 @@ static inline void get_extent_info(struct extent_info *ext, ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); @@ -835,17 +893,6 @@ static inline void set_raw_extent(struct extent_info *ext, i_ext->len = cpu_to_le32(ext->len); } -static inline void set_extent_info(struct extent_info *ei, unsigned int 
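
With the change above, extent_info overlays the EX_READ payload (blk, plus c_len for compression) and the EX_BLOCK_AGE payload (age, last_blocks) in an anonymous union; which view is live is decided by the owning tree's type field rather than a tag in the struct itself. A compact userspace model of that layout — field widths are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    enum extent_type { EX_READ, EX_BLOCK_AGE };

    struct extent_info {
        uint32_t fofs, len;
        union {
            struct {              /* EX_READ view */
                uint32_t blk;
                uint32_t c_len;
            };
            struct {              /* EX_BLOCK_AGE view */
                uint64_t age;
                uint64_t last_blocks;
            };
        };
    };

    static void show(const struct extent_info *ei, enum extent_type t)
    {
        if (t == EX_READ)
            printf("[%u,+%u) -> blk %u\n", ei->fofs, ei->len, ei->blk);
        else
            printf("[%u,+%u) age %llu\n", ei->fofs, ei->len,
                   (unsigned long long)ei->age);
    }

    int main(void)
    {
        struct extent_info r = { .fofs = 0, .len = 8, .blk = 4096 };
        struct extent_info a = { .fofs = 0, .len = 8, .age = 42 };

        show(&r, EX_READ);
        show(&a, EX_BLOCK_AGE);
        return 0;
    }
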
fofs, - u32 blk, unsigned int len) -{ - ei->fofs = fofs; - ei->blk = blk; - ei->len = len; -#ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; -#endif -} - static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { @@ -865,41 +912,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, return __is_discard_mergeable(cur, front, max_len); } -static inline bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) -{ -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; -#endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); -} - -static inline bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) -{ - return __is_extent_mergeable(back, cur); -} - -static inline bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) -{ - return __is_extent_mergeable(cur, front); -} - -extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) -{ - if (en->ei.len > et->largest.len) { - et->largest = en->ei; - et->largest_updated = true; - } -} - /* * For free nid management */ @@ -1057,9 +1069,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ @@ -1158,7 +1167,10 @@ enum iostat_type { APP_BUFFERED_IO, /* app buffered write IOs */ APP_WRITE_IO, /* app write IOs */ APP_MAPPED_IO, /* app mapped IOs */ + APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ + APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ FS_META_IO, /* meta IOs from kworker/reclaimer */ FS_GC_DATA_IO, /* data IOs from forground gc */ @@ -1172,6 +1184,8 @@ enum iostat_type { APP_BUFFERED_READ_IO, /* app buffered read IOs */ APP_READ_IO, /* app read IOs */ APP_MAPPED_READ_IO, /* app mapped read IOs */ + APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ + APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ FS_DATA_READ_IO, /* data read IOs */ FS_GDATA_READ_IO, /* data read IOs from background gc */ FS_CDATA_READ_IO, /* compressed data read IOs */ @@ -1247,7 +1261,6 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ - ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1309,6 +1322,7 @@ enum { MAX_TIME, }; +/* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, @@ -1659,14 +1673,12 @@ struct f2fs_sb_info { struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ - struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct mutex extent_tree_lock; /* locking extent radix tree */ - struct list_head extent_list; /* lru list for shrinker */ - spinlock_t extent_lock; /* locking extent lru list */ - atomic_t 
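
The __is_extent_mergeable() helpers removed above are relocated rather than dropped — this series moves the merge logic into extent_cache.c with per-type handling. For read extents, two neighbors merge only when both the file-offset range and the block-address range are contiguous; the core predicate, with illustrative types:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct ext { uint32_t fofs, len, blk; };

    /* back and front merge only if both the file-offset range and the
     * on-disk block range are contiguous */
    static bool mergeable(const struct ext *back, const struct ext *front)
    {
        return back->fofs + back->len == front->fofs &&
               back->blk + back->len == front->blk;
    }

    int main(void)
    {
        struct ext a = { .fofs = 0, .len = 4, .blk = 100 };
        struct ext b = { .fofs = 4, .len = 2, .blk = 104 };

        printf("mergeable: %d\n", mergeable(&a, &b));  /* prints 1 */
        return 0;
    }
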
total_ext_tree; /* extent tree count */ - struct list_head zombie_list; /* extent zombie tree list */ - atomic_t total_zombie_tree; /* extent zombie tree count */ - atomic_t total_ext_node; /* extent info count */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + atomic64_t allocated_data_blocks; /* for block age extent_cache */ + + /* The threshold used for hot and warm data separation */ + unsigned int hot_data_age_threshold; + unsigned int warm_data_age_threshold; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1684,7 +1696,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ - int readdir_ra; /* readahead inode in readdir */ + bool readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ @@ -1725,12 +1737,11 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ - spinlock_t gc_urgent_high_lock; - bool gc_urgent_high_limited; /* indicates having limited trial count */ - unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ + spinlock_t gc_remaining_trials_lock; + /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ + unsigned int gc_remaining_trials; /* for skip statistic */ - unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -1752,15 +1763,21 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic64_t total_hit_ext; /* # of lookup extent cache */ - atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic64_t read_hit_largest; /* # of hit largest extent node */ - atomic64_t read_hit_cached; /* # of hit cached extent node */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ + atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ @@ -1806,6 +1823,10 @@ struct f2fs_sb_info { struct workqueue_struct *post_read_wq; /* post read workqueue */ + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + spinlock_t error_lock; /* protect errors array */ + bool error_dirty; /* errors of sb is dirty */ + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ @@ -2525,7 +2546,7 @@ static inline void
*__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) - return &ckpt->sit_nat_version_bitmap; + return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { @@ -2563,6 +2584,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { @@ -2961,7 +2983,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) + F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ @@ -3059,6 +3081,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode, set_inode_flag(inode, FI_AUTO_RECOVER); } +static inline bool f2fs_is_atomic_file(struct inode *inode); + static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); @@ -3068,6 +3092,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); @@ -3547,6 +3575,8 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3706,7 +3736,9 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); @@ -3736,7 +3768,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); void f2fs_remove_dirty_inode(struct inode *inode); -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3778,8 +3811,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct 
dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, @@ -3838,9 +3872,19 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - unsigned long long hit_largest, hit_cached, hit_rbtree; - unsigned long long hit_total, total_ext; - int ext_tree, zombie_tree, ext_node; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; + /* for block age extent cache */ + unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -3858,7 +3902,7 @@ struct f2fs_stat_info { int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode; + int compr_inode, swapfile_inode; unsigned long long compr_blocks; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; @@ -3899,10 +3943,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) -#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -3947,6 +3991,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) 
\ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3966,7 +4018,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = F2FS_I_SB(inode)->atomic_files; \ + int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ @@ -4017,10 +4069,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sbi) do { } while (0) -#define stat_inc_rbtree_node_hit(sbi) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) -#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) @@ -4031,6 +4083,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) +#define stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) @@ -4122,20 +4178,34 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); +void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); -unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei); -void f2fs_update_extent_cache(struct dnode_of_data *dn); -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); +/* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + +/* block age extent cache ops */ +void f2fs_init_age_extent_tree(struct inode *inode); +bool 
f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_age_extent_cache(struct dnode_of_data *dn); +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len); +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ @@ -4205,9 +4275,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); -void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); @@ -4288,9 +4358,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) -static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len) { } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) @@ -4341,7 +4412,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } @@ -4361,26 +4432,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. 
- */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) @@ -4471,17 +4522,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode, f2fs_mark_inode_dirty_sync(inode, true); } -static inline int block_unaligned_IO(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned int blocksize_mask = (1 << i_blkbits) - 1; - loff_t offset = iocb->ki_pos; - unsigned long align = offset | iov_iter_alignment(iter); - - return align & blocksize_mask; -} - static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, int flag) { @@ -4492,35 +4532,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, return sbi->aligned_blksize; } -static inline bool f2fs_force_buffered_io(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - if (!fscrypt_dio_supported(iocb, iter)) - return true; - if (fsverity_active(inode)) - return true; - if (f2fs_compressed_file(inode)) - return true; - - /* disallow direct IO if any of devices has unaligned blksize */ - if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) - return true; - - if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { - if (block_unaligned_IO(inode, iocb, iter)) - return true; - if (F2FS_IO_ALIGNED(sbi)) - return true; - } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) - return true; - - return false; -} - static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && @@ -4573,6 +4584,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, } } +static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce4905a073b3..a6c401279886 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -43,8 +43,8 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) ret = filemap_fault(vmf); if (!ret) - f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, - F2FS_BLKSIZE); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_MAPPED_READ_IO, F2FS_BLKSIZE); trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); @@ -154,7 +154,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) SetPageUptodate(page); - f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); trace_f2fs_vm_page_mkwrite(page, DATA); @@ -571,7 +571,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - /* Assumption: truncateion starts with cluster */ + /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); @@ -618,7 +618,8 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) */ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; - f2fs_update_extent_cache_range(dn, fofs, 0, len); + f2fs_update_read_extent_cache_range(dn, fofs, 0, 
len); + f2fs_update_age_extent_cache_range(dn, fofs, nr_free); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -808,6 +809,34 @@ int f2fs_truncate(struct inode *inode) return 0; } +static bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!fscrypt_dio_supported(inode)) + return true; + if (fsverity_active(inode)) + return true; + if (f2fs_compressed_file(inode)) + return true; + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + return true; + if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) + return true; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + return true; + + return false; +} + int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -824,6 +853,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. + * + * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN + * cannot represent that, so in that case we report no DIO support. + */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + unsigned int bsize = i_blocksize(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (!f2fs_force_buffered_io(inode, WRITE)) { + stat->dio_mem_align = bsize; + stat->dio_offset_align = bsize; + } + } + flags = fi->i_flags; if (flags & F2FS_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; @@ -871,9 +918,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(vfsgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -978,7 +1026,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, __setattr_copy(mnt_userns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -999,7 +1047,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, @@ -1155,6 +1203,7 @@ next_dnode: !f2fs_is_valid_blkaddr(sbi, *blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -1439,6 +1488,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, 
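
The f2fs_getattr() hunk above reports direct-I/O alignment through STATX_DIOALIGN and deliberately reports zeroes when f2fs would force buffered I/O. From userspace the result can be queried as below; this assumes kernel/glibc headers new enough to define STATX_DIOALIGN (Linux 6.1+), with error handling trimmed:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
        struct statx stx;

        if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx)) {
            perror("statx");
            return 1;
        }
        /* f2fs sets the mask but leaves the values zero when DIO is off */
        if ((stx.stx_mask & STATX_DIOALIGN) && stx.stx_dio_mem_align)
            printf("DIO: mem align %u, offset align %u\n",
                   stx.stx_dio_mem_align, stx.stx_dio_offset_align);
        else
            printf("no direct I/O support reported\n");
        return 0;
    }
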
DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); break; } @@ -1447,7 +1497,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn); } - f2fs_update_extent_cache_range(dn, start, 0, index - start); + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -1866,6 +1916,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } else { + /* try to convert inline_data to support compression */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (!f2fs_may_compress(inode)) return -EINVAL; if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) @@ -1981,13 +2035,14 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) return put_user(inode->i_generation, (int __user *)arg); } -static int f2fs_ioc_start_atomic_write(struct file *filp) +static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); struct user_namespace *mnt_userns = file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; + loff_t isize; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2046,15 +2101,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } - f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files++; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + f2fs_write_inode(inode, NULL); + + stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + + isize = i_size_read(inode); + fi->original_i_size = isize; + if (truncate) { + set_inode_flag(inode, FI_ATOMIC_REPLACE); + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, 0); + isize = 0; + } + f2fs_i_size_write(fi->cow_inode, isize); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); f2fs_update_time(sbi, REQ_TIME); @@ -2086,16 +2151,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); - if (ret) - goto unlock_out; - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_abort_atomic_write(inode, false); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2144,7 +2207,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) { if (ret == -EROFS) { ret = 0; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); trace_f2fs_shutdown(sbi, in, ret); } @@ -2157,7 +2221,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = freeze_bdev(sb->s_bdev); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev); break; @@ -2166,16 +2230,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = f2fs_sync_fs(sb, 1); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); 
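
F2FS_IOC_START_ATOMIC_REPLACE, wired up above, is the truncate-first variant of the atomic-write ioctl: the old contents are dropped up front and nothing becomes visible until commit. A usage sketch, assuming the uapi <linux/f2fs.h> from a kernel with this series applied; error handling is condensed and replace_file_atomically() is an illustrative name:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/f2fs.h>

    /* Replace path's contents with buf[0..len) as a single atomic unit. */
    int replace_file_atomically(const char *path, const void *buf, size_t len)
    {
        int fd = open(path, O_RDWR);

        if (fd < 0)
            return -1;
        if (ioctl(fd, F2FS_IOC_START_ATOMIC_REPLACE) ||
            pwrite(fd, buf, len, 0) != (ssize_t)len ||
            ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE)) {
            ioctl(fd, F2FS_IOC_ABORT_ATOMIC_WRITE);  /* roll back staging */
            close(fd);
            return -1;
        }
        return close(fd);
    }
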
set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: @@ -2495,7 +2559,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2527,7 +2591,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * lookup mapping info in extent cache, skip defragmenting if physical * block addresses are continuous. */ - if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { if (ei.fofs + ei.len >= pg_end) goto out; } @@ -3321,8 +3385,10 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3483,8 +3549,10 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3756,6 +3824,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -4077,7 +4147,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: - return f2fs_ioc_start_atomic_write(filp); + return f2fs_ioc_start_atomic_write(filp, false); + case F2FS_IOC_START_ATOMIC_REPLACE: + return f2fs_ioc_start_atomic_write(filp, true); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_ABORT_ATOMIC_WRITE: @@ -4182,7 +4254,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, if (!(iocb->ki_flags & IOCB_DIRECT)) return false; - if (f2fs_force_buffered_io(inode, iocb, iter)) + if (f2fs_force_buffered_io(inode, iov_iter_rw(iter))) return false; /* @@ -4212,7 +4284,7 @@ static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_READ); if (error) return error; - f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size); return 0; } @@ -4301,7 +4373,8 @@ skip_read_trace: } else { ret = filemap_read(iocb, to, 0); if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); } if (trace_f2fs_dataread_end_enabled()) trace_f2fs_dataread_end(inode, pos, ret); @@ -4418,7 +4491,8 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb 
*iocb, if (ret > 0) { iocb->ki_pos += ret; - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_IO, ret); } return ret; } @@ -4431,7 +4505,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_WRITE); if (error) return error; - f2fs_update_iostat(sbi, APP_DIRECT_IO, size); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); return 0; } @@ -4619,7 +4693,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) skip_write_trace: /* Do the actual write. */ ret = dio ? - f2fs_dio_write_iter(iocb, from, &may_need_sync): + f2fs_dio_write_iter(iocb, from, &may_need_sync) : f2fs_buffered_write_iter(iocb, from); if (trace_f2fs_datawrite_end_enabled()) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6da21d405ce1..6e2cae3d2e71 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -74,7 +74,8 @@ static int gc_thread_func(void *data) if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); } if (!sb_start_write_trylock(sbi->sb)) { @@ -95,20 +96,6 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. */ - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_limited) { - if (!sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_limited = false; - spin_unlock(&sbi->gc_urgent_high_lock); - sbi->gc_mode = GC_NORMAL; - continue; - } - sbi->gc_urgent_high_remaining--; - } - spin_unlock(&sbi->gc_urgent_high_lock); - } - if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; @@ -154,6 +141,10 @@ do_gc: /* don't bother wait_ms by foreground gc */ if (!foreground) wait_ms = gc_th->no_gc_sleep_time; + } else { + /* reset wait_ms to default sleep time */ + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->min_sleep_time; } if (foreground) @@ -165,6 +156,15 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_remaining_trials_lock); + } sb_end_write(sbi->sb); } while (!kthread_should_stop()); @@ -175,13 +175,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0; gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; - } + if (!gc_th) + return -ENOMEM; gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; @@ -196,12 +193,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + + return 0; } void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) @@ -285,7 +284,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* let's select beginning hot/small 
space first in no_heap mode*/ if (f2fs_need_rand_seg(sbi)) - p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; @@ -1082,7 +1081,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1108,6 +1107,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); + f2fs_put_page(node_page, 1); + return false; + } + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); @@ -1136,7 +1150,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1154,11 +1168,12 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } goto got_it; @@ -1177,6 +1192,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } got_it: @@ -1206,8 +1222,8 @@ got_it: f2fs_put_page(fio.encrypted_page, 0); f2fs_put_page(page, 1); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); return 0; put_encrypted_page: @@ -1307,8 +1323,10 @@ static int move_data_block(struct inode *inode, block_t bidx, goto up_out; } - f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, + F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, + F2FS_BLKSIZE); lock_page(mpage); if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || @@ -1360,7 +1378,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } - f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -1554,8 +1572,8 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, - start_bidx, REQ_RAHEAD, true); + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, 
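
The select_policy() hunk above replaces prandom_u32() % range with get_random_u32_below(range), part of a tree-wide cleanup: the modulo form is biased whenever range does not evenly divide 2^32. The guarantee can be illustrated with classic rejection sampling; the kernel's implementation uses a faster multiply-and-reject scheme, and arc4random() (glibc 2.36+/BSD) stands in for its entropy source here:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Uniform value in [0, ceil) with no modulo bias. */
    static uint32_t random_below(uint32_t ceil)
    {
        /* largest multiple of ceil representable in 32 bits */
        uint32_t limit = UINT32_MAX - (UINT32_MAX % ceil);

        for (;;) {
            uint32_t r = arc4random();

            if (r < limit)
                return r % ceil;  /* r is uniform over a multiple of ceil */
        }
    }

    int main(void)
    {
        printf("%u\n", random_below(10));
        return 0;
    }
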
NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); @@ -1706,7 +1724,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); goto skip; } @@ -1734,8 +1753,9 @@ freed: get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - if (__is_large_section(sbi) && segno + 1 < end_segno) - sbi->next_victim_seg[gc_type] = segno + 1; + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO; skip: f2fs_put_page(sum_page, 0); } @@ -1888,9 +1908,7 @@ int __init f2fs_create_garbage_collection_cache(void) { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); - if (!victim_entry_slab) - return -ENOMEM; - return 0; + return victim_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_garbage_collection_cache(void) @@ -2123,8 +2141,6 @@ out_unlock: if (err) return err; - set_sbi_flag(sbi, SBI_IS_RESIZEFS); - freeze_super(sbi->sb); f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); @@ -2140,6 +2156,7 @@ out_unlock: if (err) goto out_err; + set_sbi_flag(sbi, SBI_IS_RESIZEFS); err = free_segment_range(sbi, secs, false); if (err) goto recover_out; @@ -2163,6 +2180,7 @@ out_unlock: f2fs_commit_super(sbi, false); } recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); @@ -2175,6 +2193,5 @@ out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); - clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bf46a7dfbea2..21a495234ffd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -64,7 +64,6 @@ bool f2fs_may_inline_dentry(struct inode *inode) void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; - void *src_addr, *dst_addr; if (PageUptodate(page)) return; @@ -74,11 +73,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage) zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(inode, ipage); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - flush_dcache_page(page); - kunmap_atomic(dst_addr); + memcpy_to_page(page, 0, inline_data_addr(inode, ipage), + MAX_INLINE_DATA(inode)); if (!PageUptodate(page)) SetPageUptodate(page); } @@ -164,6 +160,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_sbi_flag(fio.sbi, SBI_NEED_FSCK); f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -246,7 +243,6 @@ out: int f2fs_write_inline_data(struct inode *inode, struct page *page) { - void *src_addr, *dst_addr; struct dnode_of_data dn; int err; @@ -263,10 +259,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); - src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(inode, dn.inode_page); - memcpy(dst_addr, src_addr, 
MAX_INLINE_DATA(inode));
- kunmap_atomic(src_addr);
+ memcpy_from_page(inline_data_addr(inode, dn.inode_page),
+ page, 0, MAX_INLINE_DATA(inode));
set_page_dirty(dn.inode_page);
f2fs_clear_page_cache_dirty_tag(page);
@@ -419,6 +413,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
__func__, dir->i_ino, dn.data_blkaddr);
+ f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR);
err = -EFSCORRUPTED;
goto out;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 6d11c365d7b4..ff6cf66ed46b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -81,8 +81,10 @@ static int __written_first_block(struct f2fs_sb_info *sbi,
if (!__is_valid_data_blkaddr(addr))
return 1;
- if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE))
+ if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) {
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -260,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
- if (fi->extent_tree) {
- struct extent_info *ei = &fi->extent_tree->largest;
+ if (fi->extent_tree[EX_READ]) {
+ struct extent_info *ei = &fi->extent_tree[EX_READ]->largest;
if (ei->len &&
(!f2fs_is_valid_blkaddr(sbi, ei->blk,
@@ -333,6 +335,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return true;
}
+static void init_idisk_time(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ fi->i_disk_time[0] = inode->i_atime;
+ fi->i_disk_time[1] = inode->i_ctime;
+ fi->i_disk_time[2] = inode->i_mtime;
+ fi->i_disk_time[3] = fi->i_crtime;
+}
+
static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -380,8 +392,6 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- f2fs_init_extent_tree(inode, node_page);
-
get_inline_info(inode, ri);
fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
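Across these inode.c hunks, the single fi->extent_tree pointer becomes an array indexed by extent type: the existing read cache lives at EX_READ and the new block-age cache at EX_BLOCK_AGE (both names appear later in this series). A minimal sketch of the shape of that change; the surrounding struct and function are illustrative stand-ins, and NR_EXTENT_CACHES is an assumed array terminator not shown in these hunks:

enum extent_type {
	EX_READ,
	EX_BLOCK_AGE,
	NR_EXTENT_CACHES,	/* assumed terminator, for sizing only */
};

struct extent_tree;	/* opaque here */

/* Illustrative reduction of struct f2fs_inode_info: one tree per type. */
struct inode_info_model {
	struct extent_tree *extent_tree[NR_EXTENT_CACHES];
};

static struct extent_tree *read_tree(struct inode_info_model *fi)
{
	return fi->extent_tree[EX_READ];	/* was: fi->extent_tree */
}

Indexing by type lets the later hunks (shrinker, memory accounting, sysfs) treat both caches with one code path instead of duplicating the read-cache plumbing.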
@@ -405,6 +415,7 @@ static int do_read_inode(struct inode *inode) if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } @@ -465,10 +476,12 @@ static int do_read_inode(struct inode *inode) } } - fi->i_disk_time[0] = inode->i_atime; - fi->i_disk_time[1] = inode->i_ctime; - fi->i_disk_time[2] = inode->i_mtime; - fi->i_disk_time[3] = fi->i_crtime; + init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_age_extent_tree(inode); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -480,6 +493,12 @@ static int do_read_inode(struct inode *inode) return 0; } +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -491,16 +510,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return ERR_PTR(ret); + } + trace_f2fs_iget(inode); return inode; } - if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) - goto make_now; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (ino == F2FS_COMPRESS_INO(sbi)) + if (is_meta_ino(sbi, ino)) goto make_now; -#endif ret = do_read_inode(inode); if (ret) @@ -585,7 +610,7 @@ retry: void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); @@ -599,12 +624,15 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); - ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + if (et) { read_lock(&et->lock); - set_raw_extent(&et->largest, &ri->i_ext); + set_raw_read_extent(&et->largest, &ri->i_ext); read_unlock(&et->lock); } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); @@ -676,11 +704,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_page_private_inline(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; - + init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); #endif @@ -699,7 +723,8 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_UPDATE_INODE); } return; } diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index d84c5f6cc09d..3166a8939ed4 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -31,55 +31,65 @@ int 
__maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) /* print app write IOs */ seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_IO]); /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", + seq_printf(seq, "fs gc node: %-16llu\n", sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", + seq_printf(seq, "fs cp data: %-16llu\n", sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", + seq_printf(seq, "fs cp node: %-16llu\n", sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", + seq_printf(seq, "fs cp meta: %-16llu\n", sbi->rw_iostat[FS_CP_META_IO]); /* print app read IOs */ seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_READ_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs compr_data: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_READ_IO]); /* print other IOs */ seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", + seq_printf(seq, "fs discard: %-16llu\n", sbi->rw_iostat[FS_DISCARD]); return 0; @@ -159,7 +169,7 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) spin_unlock_irq(&sbi->iostat_lat_lock); } -void f2fs_update_iostat(struct f2fs_sb_info *sbi, +void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) { unsigned long flags; @@ -176,6 +186,28 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (type == APP_BUFFERED_READ_IO || type == 
APP_DIRECT_READ_IO) sbi->rw_iostat[APP_READ_IO] += io_bytes; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (inode && f2fs_compressed_file(inode)) { + if (type == APP_BUFFERED_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_READ_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + + if (type == FS_DATA_READ_IO) + sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + + if (type == FS_DATA_IO) + sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + } +#endif + spin_unlock_irqrestore(&sbi->iostat_lock, flags); f2fs_record_iostat(sbi); diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 22a2d01f57ef..2c048307b6e0 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -31,7 +31,7 @@ struct iostat_lat_info { extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset); extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); -extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes); struct bio_iostat_ctx { @@ -65,7 +65,7 @@ extern void f2fs_destroy_iostat_processing(void); extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) {} static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index bf00d5057abb..6032589099ce 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,137 +22,6 @@ #include "acl.h" #include <trace/events/f2fs.h> -static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, - struct inode *dir, umode_t mode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - nid_t ino; - struct inode *inode; - bool nid_free = false; - bool encrypt = false; - int xattr_size = 0; - int err; - - inode = new_inode(dir->i_sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - if (!f2fs_alloc_nid(sbi, &ino)) { - err = -ENOSPC; - goto fail; - } - - nid_free = true; - - inode_init_owner(mnt_userns, inode, dir, mode); - - inode->i_ino = ino; - inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = prandom_u32(); - - if (S_ISDIR(inode->i_mode)) - F2FS_I(inode)->i_current_depth = 1; - - err = insert_inode_locked(inode); - if (err) { - err = -EINVAL; - goto fail; - } - - if (f2fs_sb_has_project_quota(sbi) && - (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) - F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; - else - F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, - F2FS_DEF_PROJID); - - err = fscrypt_prepare_new_inode(dir, inode, &encrypt); - if (err) - goto fail_drop; - - err = f2fs_dquot_initialize(inode); - if (err) - goto fail_drop; - - set_inode_flag(inode, FI_NEW_INODE); - - if (encrypt) - f2fs_set_encrypted_inode(inode); - - if (f2fs_sb_has_extra_attr(sbi)) { - set_inode_flag(inode, FI_EXTRA_ATTR); - F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; - } - - if (test_opt(sbi, INLINE_XATTR)) - set_inode_flag(inode, 
FI_INLINE_XATTR); - - if (f2fs_may_inline_dentry(inode)) - set_inode_flag(inode, FI_INLINE_DENTRY); - - if (f2fs_sb_has_flexible_inline_xattr(sbi)) { - f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); - if (f2fs_has_inline_xattr(inode)) - xattr_size = F2FS_OPTION(sbi).inline_xattr_size; - /* Otherwise, will be 0 */ - } else if (f2fs_has_inline_xattr(inode) || - f2fs_has_inline_dentry(inode)) { - xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - } - F2FS_I(inode)->i_inline_xattr_size = xattr_size; - - f2fs_init_extent_tree(inode, NULL); - - F2FS_I(inode)->i_flags = - f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); - - if (S_ISDIR(inode->i_mode)) - F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; - - if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) - set_inode_flag(inode, FI_PROJ_INHERIT); - - if (f2fs_sb_has_compression(sbi)) { - /* Inherit the compression flag in directory */ - if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && - f2fs_may_compress(inode)) - set_compress_context(inode); - } - - /* Should enable inline_data after compression set */ - if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(inode, FI_INLINE_DATA); - - stat_inc_inline_xattr(inode); - stat_inc_inline_inode(inode); - stat_inc_inline_dir(inode); - - f2fs_set_inode_flags(inode); - - trace_f2fs_new_inode(inode, 0); - return inode; - -fail: - trace_f2fs_new_inode(inode, err); - make_bad_inode(inode); - if (nid_free) - set_inode_flag(inode, FI_FREE_NID); - iput(inode); - return ERR_PTR(err); -fail_drop: - trace_f2fs_new_inode(inode, err); - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; - if (nid_free) - set_inode_flag(inode, FI_FREE_NID); - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - return ERR_PTR(err); -} - static inline int is_extension_exist(const unsigned char *s, const char *sub, bool tmp_ext) { @@ -187,36 +56,6 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub, return 0; } -/* - * Set file's temperature for hot/cold data separation - */ -static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, cold_count, hot_count; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], true)) - break; - } - - f2fs_up_read(&sbi->sb_lock); - - if (i == cold_count + hot_count) - return; - - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); -} - int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set) { @@ -283,56 +122,215 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, return 0; } -static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, const unsigned char *name) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; int i, 
cold_count, hot_count; - if (!f2fs_sb_has_compression(sbi) || - F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode) || - (!ext_cnt && !noext_cnt)) + if (!f2fs_sb_has_compression(sbi)) return; - f2fs_down_read(&sbi->sb_lock); + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + + /* This name comes only from normal files. */ + if (!name) + return; + /* Don't compress hot files. */ + f2fs_down_read(&sbi->sb_lock); cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], false)) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) + return; - for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], false)) { - f2fs_up_read(&sbi->sb_lock); + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_extension_exist(name, noext[i], false)) + return; + + /* Compress wanting extension. */ + for (i = 0; i < ext_cnt; i++) { + if (is_extension_exist(name, ext[i], false)) { + set_compress_context(inode); return; } } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + +/* + * Set file's temperature for hot/cold data separation + */ +static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = 0; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], true)) + break; f2fs_up_read(&sbi->sb_lock); - for (i = 0; i < noext_cnt; i++) { - if (is_extension_exist(name, noext[i], false)) { - f2fs_disable_compressed_file(inode); - return; - } + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + +static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, + struct inode *dir, umode_t mode, + const char *name) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + nid_t ino; + struct inode *inode; + bool nid_free = false; + bool encrypt = false; + int xattr_size = 0; + int err; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!f2fs_alloc_nid(sbi, &ino)) { + err = -ENOSPC; + goto fail; } - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return; + nid_free = true; - for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i], false)) - continue; + inode_init_owner(mnt_userns, inode, dir, mode); - /* Do not use inline_data with compression */ - stat_dec_inline_inode(inode); - clear_inode_flag(inode, FI_INLINE_DATA); - set_compress_context(inode); - return; + inode->i_ino = ino; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + F2FS_I(inode)->i_crtime = inode->i_mtime; + inode->i_generation = get_random_u32(); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_current_depth = 1; + + err = insert_inode_locked(inode); + if (err) { + err = -EINVAL; + goto fail; + } + + if (f2fs_sb_has_project_quota(sbi) && + (F2FS_I(dir)->i_flags & 
F2FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, + F2FS_DEF_PROJID); + + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto fail_drop; + + err = f2fs_dquot_initialize(inode); + if (err) + goto fail_drop; + + set_inode_flag(inode, FI_NEW_INODE); + + if (encrypt) + f2fs_set_encrypted_inode(inode); + + if (f2fs_sb_has_extra_attr(sbi)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + + if (f2fs_may_inline_dentry(inode)) + set_inode_flag(inode, FI_INLINE_DENTRY); + + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + /* Check compression first. */ + set_compress_new_inode(sbi, dir, inode, name); + + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, name); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + + f2fs_set_inode_flags(inode); + + f2fs_init_extent_tree(inode); + + trace_f2fs_new_inode(inode, 0); + return inode; + +fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + iput(inode); + return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, @@ -352,15 +350,10 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); - if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_file_temperature(sbi, inode, dentry->d_name.name); - - set_compress_inode(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -632,6 +625,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. 
Eventually we'll want avoid @@ -642,8 +637,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - f2fs_unlock_op(sbi); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -689,7 +682,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -760,7 +753,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode); + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -817,7 +810,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -845,7 +838,7 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, bool is_whiteout, + struct file *file, umode_t mode, bool is_whiteout, struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -856,7 +849,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -892,8 +885,8 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } else { - if (dentry) - d_tmpfile(dentry, inode); + if (file) + d_tmpfile(file, inode); else f2fs_i_links_write(inode, false); } @@ -915,16 +908,19 @@ out: } static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); + err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); + + return finish_open_simple(file, err); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, @@ -1376,7 +1372,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, @@ -1394,7 +1390,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e06a0c478b39..dde4c0458704 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } return 0; @@ -59,7 +60,7 @@ bool 
f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
avail_ram = val.totalram - val.totalhigh;
/*
- * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+ * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively
*/
if (type == FREE_NIDS) {
mem_size = (nm_i->nid_cnt[FREE_NID] *
@@ -84,12 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct ino_entry);
mem_size >>= PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
- } else if (type == EXTENT_CACHE) {
- mem_size = (atomic_read(&sbi->total_ext_tree) *
+ } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) {
+ enum extent_type etype = type == READ_EXTENT_CACHE ?
+ EX_READ : EX_BLOCK_AGE;
+ struct extent_tree_info *eti = &sbi->extent_tree[etype];
+
+ mem_size = (atomic_read(&eti->total_ext_tree) *
sizeof(struct extent_tree) +
- atomic_read(&sbi->total_ext_node) *
+ atomic_read(&eti->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_SHIFT;
- res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DISCARD_CACHE) {
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
@@ -585,7 +590,7 @@ retry:
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- up_read(&curseg->journal_rwsem);
+ up_read(&curseg->journal_rwsem);
if (i >= 0) {
f2fs_up_read(&nm_i->nat_tree_lock);
goto cache;
@@ -858,7 +863,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + 1);
- f2fs_update_extent_tree_range_compressed(dn->inode,
+ f2fs_update_read_extent_tree_range_compressed(dn->inode,
index, blkaddr,
F2FS_I(dn->inode)->i_cluster_size,
c_len);
@@ -1295,6 +1300,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
err = -EFSCORRUPTED;
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
}
#endif
@@ -1358,8 +1364,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
return err;
/* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
- if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
- is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
+ if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) {
ClearPageUptodate(page);
return -ENOENT;
}
@@ -1369,7 +1374,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
err = f2fs_submit_page_bio(&fio);
if (!err)
- f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE);
return err;
}
@@ -2147,8 +2152,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
if (IS_INODE(&folio->page))
f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
#endif
- if (!folio_test_dirty(folio)) {
- filemap_dirty_folio(mapping, folio);
+ if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
set_page_private_reference(&folio->page);
return true;
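In the f2fs_available_free_memory() hunk above, the single EXTENT_CACHE type was allowed half of the ram_thresh allowance (>> 1); the split READ_EXTENT_CACHE and AGE_EXTENT_CACHE types each get a quarter (>> 2), so the two caches together still fit the old envelope. A rough userspace model of that check; names and the page-shift value are illustrative, not f2fs code:

#include <stdbool.h>
#include <stddef.h>

#define MODEL_PAGE_SHIFT 12

/*
 * Returns true if a cache currently using `bytes` may keep growing.
 * shift == 1 models the old single extent cache, shift == 2 models
 * each of the split read/age caches.
 */
static bool cache_within_budget(size_t bytes, size_t avail_ram_pages,
				unsigned int ram_thresh, unsigned int shift)
{
	size_t used_pages = bytes >> MODEL_PAGE_SHIFT;

	return used_pages < ((avail_ram_pages * ram_thresh / 100) >> shift);
}

With shift = 2 applied to both new types, the worst-case combined footprint matches the old shift = 1 limit.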
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3c09cae058b0..99454d46a939 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -146,7 +146,8 @@ enum mem_type {
NAT_ENTRIES, /* indicates the cached nat entry */
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
- EXTENT_CACHE, /* indicates extent cache */
+ READ_EXTENT_CACHE, /* indicates read extent cache */
+ AGE_EXTENT_CACHE, /* indicates age extent cache */
DISCARD_CACHE, /* indicates memory of cached discard cmds */
COMPRESS_PAGE, /* indicates memory of cached compressed pages */
BASE_CHECK, /* check kernel status */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index dcd0a1e35095..77fd453949b1 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -474,7 +474,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
- unsigned int offset;
+ unsigned int offset, ofs_in_node, max_addrs;
block_t bidx;
int i;
@@ -501,15 +501,25 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
+ ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+
+ max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode);
+ if (ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u",
+ ofs_in_node, dn->inode->i_ino, nid, max_addrs);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY);
+ return -EFSCORRUPTED;
+ }
+
if (dn->inode->i_ino == nid) {
tdn.nid = nid;
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
tdn.node_page = dn->inode_page;
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
} else if (dn->nid == nid) {
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
}
@@ -628,6 +638,7 @@ retry_dn:
inode->i_ino, ofs_of_node(dn.node_page),
ofs_of_node(page));
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
goto err;
}
@@ -640,12 +651,14 @@ retry_dn:
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
if (__is_valid_data_blkaddr(dest) &&
!f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
@@ -698,6 +711,16 @@ retry_prev:
goto err;
}
+ if (f2fs_is_valid_blkaddr(sbi, dest,
+ DATA_GENERIC_ENHANCE_UPDATE)) {
+ f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u",
+ dest, inode->i_ino, dn.ofs_in_node);
+ err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi,
+ ERROR_INVALID_BLKADDR);
+ goto err;
+ }
+
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
ni.version, false, false);
@@ -900,9 +923,7 @@ int __init f2fs_create_recovery_cache(void)
{
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
- if (!fsync_entry_slab)
- return -ENOMEM;
- return 0;
+ return fsync_entry_slab ?
0 : -ENOMEM; } void f2fs_destroy_recovery_cache(void) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0de21f82d7bc..25ddea478fc1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -187,23 +187,24 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) return; - if (clean) - truncate_inode_pages_final(inode->i_mapping); clear_inode_flag(fi->cow_inode, FI_COW_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_inode(inode); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files--; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, fi->original_i_size); + } } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -261,14 +262,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { if (revoke) __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, 0, false); } static int __f2fs_commit_atomic_write(struct inode *inode) @@ -312,6 +318,8 @@ static int __f2fs_commit_atomic_write(struct inode *inode) DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -337,10 +345,12 @@ next: } out: - if (ret) + if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; - else + } else { sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + } __complete_revoke_list(inode, &revoke_list, ret ? 
true : false); @@ -376,7 +386,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); } /* balance_fs_bg is able to be pending */ @@ -439,8 +449,14 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) return; /* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); + + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) @@ -476,12 +492,12 @@ do_sync: mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); - f2fs_sync_dirty_inodes(sbi, FILE_INODE); + f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); } - f2fs_sync_fs(sbi->sb, true); + f2fs_sync_fs(sbi->sb, 1); stat_inc_bg_cp_count(sbi->stat_info); } @@ -622,12 +638,11 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err = 0; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) - return err; + return 0; goto init_thread; } @@ -640,19 +655,20 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) - return err; + return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { - err = PTR_ERR(fcc->f2fs_issue_flush); + int err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; } - return err; + return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) @@ -694,7 +710,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) } while (ret && --count); if (ret) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); break; } @@ -857,7 +874,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) } mutex_unlock(&dirty_i->seglist_lock); - unusable = holes[DATA] > holes[NODE] ? 
holes[DATA] : holes[NODE]; + unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; @@ -1053,8 +1070,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; + if (utilization(sbi) > dcc->discard_urgent_util) { + dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; @@ -1069,7 +1086,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } @@ -1127,13 +1144,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; - goto submit; - } - err = __blkdev_issue_discard(bdev, + } else { + err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); -submit: + } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) @@ -1171,7 +1187,7 @@ submit: atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); lstart += len; start += len; @@ -1343,13 +1359,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } -static int __queue_discard_cmd(struct f2fs_sb_info *sbi, +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) - return 0; + return; trace_f2fs_queue_discard(bdev, blkstart, blklen); @@ -1361,7 +1377,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); - return 0; } static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, @@ -1449,7 +1464,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; @@ -1646,6 +1661,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; + if (!atomic_read(&dcc->discard_cmd_cnt)) + return false; + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); @@ -1670,6 +1688,11 @@ static int issue_discard_thread(void *data) set_freezable(); do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); @@ -1677,14 +1700,6 @@ static int issue_discard_thread(void *data) __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); - if (!atomic_read(&dcc->discard_cmd_cnt)) - wait_ms = dpolicy.max_interval; - - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, - 
msecs_to_jiffies(wait_ms)); - if (dcc->discard_wake) dcc->discard_wake = 0; @@ -1698,12 +1713,11 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } - if (!atomic_read(&dcc->discard_cmd_cnt)) - continue; sb_start_intwrite(sbi->sb); @@ -1718,6 +1732,8 @@ static int issue_discard_thread(void *data) } else { wait_ms = dpolicy.max_interval; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); @@ -1761,7 +1777,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } /* For conventional zones, use regular discard if supported */ - return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; } #endif @@ -1772,7 +1789,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __queue_discard_cmd(sbi, bdev, blkstart, blklen); + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -2026,8 +2044,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) + if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + } return err; } @@ -2047,6 +2067,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) @@ -2067,6 +2088,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; @@ -2097,8 +2119,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ - if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; @@ -2535,7 +2556,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ if (__is_large_section(sbi)) @@ -2589,7 +2610,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - prandom_u32() % sbi->max_fragment_chunk + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2626,9 +2647,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - prandom_u32() % sbi->max_fragment_chunk + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += - prandom_u32() % sbi->max_fragment_hole + 1; + get_random_u32_inclusive(1, sbi->max_fragment_hole); } } } @@ -2643,7 +2664,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2651,9 +2672,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) struct f2fs_summary_block *sum_node; struct page *sum_page; - if (flush) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -2692,7 +2711,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; @@ -2836,31 +2855,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, return 0; } -/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) - new_curseg(sbi, type, true); - else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - curseg->seg_type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && - is_next_segment_free(sbi, curseg, type) && - likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && - get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); - - stat_inc_seg_type(sbi, curseg); + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; } void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, @@ -2878,7 +2886,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, goto unlock; if 
(f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, true); @@ -2913,7 +2921,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + new_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } @@ -2944,10 +2953,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; - bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { @@ -3153,10 +3158,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) } } +static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_info ei; + + if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { + if (!ei.age) + return NO_CHECK_TYPE; + if (ei.age <= sbi->hot_data_age_threshold) + return CURSEG_HOT_DATA; + if (ei.age <= sbi->warm_data_age_threshold) + return CURSEG_WARM_DATA; + return CURSEG_COLD_DATA; + } + return NO_CHECK_TYPE; +} + static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; + int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; @@ -3171,6 +3194,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->page->index); + if (type != NO_CHECK_TYPE) + return type; + if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) @@ -3267,11 +3295,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, update_sit_entry(sbi, old_blkaddr, -1); if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + /* + * Flush out current segment and replace it with new segment. 
+ */ + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - sit_i->s_ops->allocate_segment(sbi, type, false); + } else { + if (need_new_seg(sbi, type)) + new_curseg(sbi, type, false); + else + change_curseg(sbi, type); + stat_inc_seg_type(sbi, curseg); + } } /* * segment dirty status should be updated after segment allocation, @@ -3281,6 +3317,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + if (IS_DATASEG(type)) + atomic64_inc(&sbi->allocated_data_blocks); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { @@ -3388,7 +3427,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); - f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -3398,7 +3437,7 @@ void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, @@ -3408,11 +3447,13 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); - f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) @@ -3432,6 +3473,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } @@ -3453,7 +3495,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->page->mapping->host, + fio->io_type, F2FS_BLKSIZE); } return err; @@ -3530,7 +3573,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3558,7 +3601,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; @@ -4255,9 +4298,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; - sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; @@ -4379,6 +4419,8 @@ static int build_sit_entries(struct 
f2fs_sb_info *sbi) if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } @@ -4415,6 +4457,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } @@ -4434,6 +4477,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } @@ -4465,6 +4509,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } @@ -4473,6 +4518,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } @@ -4623,6 +4669,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } @@ -4640,6 +4687,7 @@ out: "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d1d63766f2c7..3ad1b7b6fa94 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -222,10 +222,6 @@ struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; -struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); -}; - #define MAX_SKIP_GC_COUNT 16 struct revoke_entry { @@ -235,8 +231,6 @@ struct revoke_entry { }; struct sit_info { - const struct segment_allocation *s_ops; - block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ @@ -753,6 +747,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } @@ -767,6 +762,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index dd3c3c7a90ec..83d6fb97dcae 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) return count > 0 ? 
count : 0; } -static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) { - return atomic_read(&sbi->total_zombie_tree) + - atomic_read(&sbi->total_ext_node); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, @@ -53,8 +56,11 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* count extent cache entries */ - count += __count_extent_cache(sbi); + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); + + /* count block age extent cache entries */ + count += __count_extent_cache(sbi, EX_BLOCK_AGE); /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -100,7 +106,10 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; /* shrink extent cache entries */ - freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2); + + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2); /* shrink clean nat cache entries */ if (freed < nr) @@ -130,7 +139,9 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { - f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + f2fs_shrink_age_extent_tree(sbi, + __count_extent_cache(sbi, EX_BLOCK_AGE)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2451623c05a7..1f812b9ce985 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -61,6 +61,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR] = "invalid blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -110,6 +111,7 @@ enum { Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, + Opt_barrier, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, @@ -161,6 +163,7 @@ enum { Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, + Opt_age_extent_cache, Opt_err, }; @@ -186,6 +189,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, @@ -238,6 +242,7 @@ static match_table_t f2fs_tokens = { {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, + {Opt_age_extent_cache, "age_extent_cache"}, {Opt_err, NULL}, }; @@ -285,9 +290,7 @@ static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", F2FS_NAME_LEN); - if (!f2fs_cf_name_slab) - return -ENOMEM; - return 0; + return f2fs_cf_name_slab ? 
0 : -ENOMEM; } static void f2fs_destroy_casefold_cache(void) @@ -301,10 +304,10 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count << 1) / 1000, + block_t limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); - /* limit is 0.2% */ + /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && F2FS_OPTION(sbi).root_reserved_blocks > limit) { F2FS_OPTION(sbi).root_reserved_blocks = limit; @@ -806,14 +809,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; + case Opt_barrier: + clear_opt(sbi, NOBARRIER); + break; case Opt_fastboot: set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - clear_opt(sbi, EXTENT_CACHE); + clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); @@ -1253,6 +1259,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_age_extent_cache: + set_opt(sbi, AGE_EXTENT_CACHE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1342,6 +1351,16 @@ default_check: return -EINVAL; } + if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with ATGC"); + return -EINVAL; + } + + if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { + f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1562,8 +1581,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_issue_discard_timeout(sbi); - if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && - !sbi->discard_blks && !dropped) { + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -1666,9 +1684,8 @@ static int f2fs_freeze(struct super_block *sb) if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; - /* ensure no checkpoint required */ - if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) - return -EINVAL; + /* Let's flush checkpoints and stop the thread. 
*/ + f2fs_flush_ckpt_thread(F2FS_SB(sb)); /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); @@ -1931,16 +1948,22 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_dentry"); else seq_puts(seq, ",noinline_dentry"); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + if (test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + else + seq_puts(seq, ",noflush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); + else + seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); - if (test_opt(sbi, EXTENT_CACHE)) + if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, AGE_EXTENT_CACHE)) + seq_puts(seq, ",age_extent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); @@ -2039,7 +2062,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= + SMALL_VOLUME_SEGMENTS) + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + else + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); @@ -2055,13 +2082,14 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - set_opt(sbi, FLUSH_MERGE); + if (!f2fs_is_readonly(sbi)) + set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { @@ -2181,6 +2209,9 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); + + /* Let's ensure there's no pending checkpoint anymore */ + f2fs_flush_ckpt_thread(sbi); } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -2193,14 +2224,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; - bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); - struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2283,11 +2314,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } /* disallow enable/disable extent_cache dynamically */ - if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { 
err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; } + /* disallow enable/disable age extent_cache dynamically */ + if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); + goto restore_opts; + } if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { err = -EINVAL; @@ -2346,6 +2383,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_stop_ckpt_thread(sbi); need_restart_ckpt = true; } else { + /* Flush the previous checkpoint, if it exists. */ + f2fs_flush_ckpt_thread(sbi); + err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, @@ -2378,10 +2418,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_flush; need_stop_discard = true; } else { - dcc = SM_I(sbi)->dcc_info; f2fs_stop_discard_thread(sbi); - if (atomic_read(&dcc->discard_cmd_cnt)) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); need_restart_discard = true; } } @@ -2465,7 +2503,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t toread; loff_t i_size = i_size_read(inode); struct page *page; - char *kaddr; if (off > i_size) return 0; @@ -2498,9 +2535,7 @@ repeat: return -EIO; } - kaddr = kmap_atomic(page); - memcpy(data, kaddr + offset, tocopy); - kunmap_atomic(kaddr); + memcpy_from_page(data, page, offset, tocopy); f2fs_put_page(page, 1); offset = 0; @@ -2522,7 +2557,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct page *page; void *fsdata = NULL; - char *kaddr; int err = 0; int tocopy; @@ -2541,10 +2575,7 @@ retry: break; } - kaddr = kmap_atomic(page); - memcpy(kaddr + offset, data, tocopy); - kunmap_atomic(kaddr); - flush_dcache_page(page); + memcpy_to_page(page, offset, data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, page, fsdata); @@ -3039,23 +3070,24 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, *lblk_bits_ret = 8 * sizeof(block_t); } -static int f2fs_get_num_devices(struct super_block *sb) +static struct block_device **f2fs_get_devices(struct super_block *sb, + unsigned int *num_devs) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct block_device **devs; + int i; - if (f2fs_is_multi_device(sbi)) - return sbi->s_ndevs; - return 1; -} + if (!f2fs_is_multi_device(sbi)) + return NULL; -static void f2fs_get_devices(struct super_block *sb, - struct request_queue **devs) -{ - struct f2fs_sb_info *sbi = F2FS_SB(sb); - int i; + devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); for (i = 0; i < sbi->s_ndevs; i++) - devs[i] = bdev_get_queue(FDEV(i).bdev); + devs[i] = FDEV(i).bdev; + *num_devs = sbi->s_ndevs; + return devs; } static const struct fscrypt_operations f2fs_cryptops = { @@ -3066,7 +3098,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .empty_dir = f2fs_empty_dir, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, - .get_num_devices = f2fs_get_num_devices, .get_devices = f2fs_get_devices, }; #endif @@ -3613,7 +3644,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; - spin_lock_init(&sbi->gc_urgent_high_lock); + spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; @@ -3843,6
+3874,68 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int err; + + f2fs_down_write(&sbi->sb_lock); + + if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", + reason, err); + f2fs_up_write(&sbi->sb_lock); +} + +static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + spin_lock(&sbi->error_lock); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); + sbi->error_dirty = true; + } + spin_unlock(&sbi->error_lock); +} + +static bool f2fs_update_errors(struct f2fs_sb_info *sbi) +{ + bool need_update = false; + + spin_lock(&sbi->error_lock); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + need_update = true; + } + spin_unlock(&sbi->error_lock); + + return need_update; +} + +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + int err; + + f2fs_save_errors(sbi, error); + + f2fs_down_write(&sbi->sb_lock); + + if (!f2fs_update_errors(sbi)) + goto out_unlock; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); +out_unlock: + f2fs_up_write(&sbi->sb_lock); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -3991,18 +4084,16 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_i = SM_I(sbi); - /* adjust parameters according to the volume size */ - if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + SM_I(sbi)->dcc_info->discard_granularity = + MIN_DISCARD_GRANULARITY; + SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } - sbi->readdir_ra = 1; + sbi->readdir_ra = true; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -4030,6 +4121,24 @@ try_onemore: sbi->sb = sb; + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -4053,6 +4162,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -4109,23 +4220,14 @@ try_onemore: /* init 
f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_f2fs_rwsem(&sbi->gc_lock); - mutex_init(&sbi->writepages); - init_f2fs_rwsem(&sbi->cp_global_sem); - init_f2fs_rwsem(&sbi->node_write); - init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); - spin_lock_init(&sbi->stat_lock); err = f2fs_init_write_merge_io(sbi); if (err) goto free_bio_info; - init_f2fs_rwsem(&sbi->cp_rwsem); - init_f2fs_rwsem(&sbi->quota_sem); - init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); err = f2fs_init_iostat(sbi); @@ -4203,12 +4305,6 @@ try_onemore: limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); - for (i = 0; i < NR_INODE_TYPE; i++) { - INIT_LIST_HEAD(&sbi->inode_list[i]); - spin_lock_init(&sbi->inode_lock[i]); - } - mutex_init(&sbi->flush_lock); - f2fs_init_extent_cache_info(sbi); f2fs_init_ino_entry_info(sbi); @@ -4455,9 +4551,9 @@ free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); - f2fs_destroy_post_read_wq(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); @@ -4558,9 +4654,7 @@ static int __init init_inodecache(void) f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); - if (!f2fs_inode_cachep) - return -ENOMEM; - return 0; + return f2fs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) @@ -4625,7 +4719,7 @@ static int __init init_f2fs_fs(void) goto free_iostat; err = f2fs_init_bioset(); if (err) - goto free_bio_enrty_cache; + goto free_bio_entry_cache; err = f2fs_init_compress_mempool(); if (err) goto free_bioset; @@ -4642,7 +4736,7 @@ free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); -free_bio_enrty_cache: +free_bio_entry_cache: f2fs_destroy_bio_entry_cache(); free_iostat: f2fs_destroy_iostat_processing(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index eba5fb1629d7..83a366f3ee80 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = { struct f2fs_attr { struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); + ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); + ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, + const char *buf, size_t len); int struct_type; int offset; int id; @@ -95,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(dirty_segments(sbi))); } static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(free_segments(sbi))); } static ssize_t ovp_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(overprovision_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + 
((f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1))); @@ -125,7 +125,13 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, static ssize_t sb_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%lx\n", sbi->s_flag); + return sysfs_emit(buf, "%lx\n", sbi->s_flag); +} + +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); } static ssize_t pending_discard_show(struct f2fs_attr *a, @@ -133,10 +139,16 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, { if (!SM_I(sbi)->dcc_info) return -EINVAL; - return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } +static ssize_t gc_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -193,7 +205,7 @@ static ssize_t features_show(struct f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%u\n", sbi->current_reserved_blocks); + return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks); } static ssize_t unusable_show(struct f2fs_attr *a, @@ -205,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, unusable = sbi->unusable_block_count; else unusable = f2fs_get_unusable_blocks(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)unusable); + return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable); } static ssize_t encoding_show(struct f2fs_attr *a, @@ -220,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sprintf(buf, "(none)"); + return sysfs_emit(buf, "(none)\n"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS @@ -235,7 +247,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->tot_blks - (si->bg_data_blks + si->bg_node_blks))); } @@ -245,7 +257,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); } @@ -256,7 +268,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, si->dirty_count = dirty_segments(sbi); f2fs_update_sit_info(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); } #endif @@ -326,13 +338,8 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif - if (!strcmp(a->attr.name, "gc_urgent")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_mode]); - if (!strcmp(a->attr.name, "gc_segment_mode")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_segment_mode]); + return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); if (!strcmp(a->attr.name, 
"gc_reclaimed_segments")) { return sysfs_emit(buf, "%u\n", @@ -356,7 +363,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, ui = (unsigned int *)(ptr + a->offset); - return sprintf(buf, "%u\n", *ui); + return sysfs_emit(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -477,14 +484,27 @@ out: return count; } + if (!strcmp(a->attr.name, "max_ordered_discard")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "discard_urgent_util")) { + if (t > 100) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; } - if (!strcmp(a->attr.name, "trim_sections")) - return -EINVAL; - if (!strcmp(a->attr.name, "gc_urgent")) { if (t == 0) { sbi->gc_mode = GC_NORMAL; @@ -525,11 +545,10 @@ out: return count; } - if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { - spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_limited = t != 0; - sbi->gc_urgent_high_remaining = t; - spin_unlock(&sbi->gc_urgent_high_lock); + if (!strcmp(a->attr.name, "gc_remaining_trials")) { + spin_lock(&sbi->gc_remaining_trials_lock); + sbi->gc_remaining_trials = t; + spin_unlock(&sbi->gc_remaining_trials_lock); return count; } @@ -644,6 +663,29 @@ out: return count; } + if (!strcmp(a->attr.name, "readdir_ra")) { + sbi->readdir_ra = !!t; + return count; + } + + if (!strcmp(a->attr.name, "hot_data_age_threshold")) { + if (t == 0 || t >= sbi->warm_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "warm_data_age_threshold")) { + if (t == 0 || t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -716,7 +758,7 @@ static void f2fs_sb_release(struct kobject *kobj) static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "supported\n"); + return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ @@ -729,8 +771,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (F2FS_HAS_FEATURE(sbi, a->id)) - return sprintf(buf, "supported\n"); - return sprintf(buf, "unsupported\n"); + return sysfs_emit(buf, "supported\n"); + return sysfs_emit(buf, "unsupported\n"); } #define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ @@ -783,9 +825,10 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, 
min_fsync_blocks, min_fsync_blocks); @@ -820,7 +863,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -833,6 +876,7 @@ F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); +F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -897,6 +941,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); +/* For block age extent cache */ +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -912,9 +960,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), + ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), - ATTR_LIST(batched_trim_sections), + ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), @@ -947,7 +997,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), - ATTR_LIST(gc_urgent_high_remaining), + ATTR_LIST(gc_remaining_trials), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), @@ -990,6 +1040,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(peak_atomic_write), ATTR_LIST(committed_atomic_block), ATTR_LIST(revoked_atomic_block), + ATTR_LIST(hot_data_age_threshold), + ATTR_LIST(warm_data_age_threshold), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -1030,8 +1082,10 @@ static struct attribute *f2fs_feat_attrs[] = { ATTRIBUTE_GROUPS(f2fs_feat); F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), + ATTR_LIST(cp_status), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); @@ -1236,6 +1290,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i, count; + + seq_puts(seq, "Discard pend list(Show discard_cmd count on each entry, .:not exist):\n"); + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + if (dcc) { + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + + if (i % 8 == 0) + seq_printf(seq, " %-3d", i); + 
count = 0; + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + count++; + if (count) + seq_printf(seq, " %7d", count); + else + seq_puts(seq, " ."); + if (i % 8 == 7) + seq_putc(seq, '\n'); + } + seq_putc(seq, '\n'); + mutex_unlock(&dcc->cmd_lock); + } + + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -1306,6 +1398,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) #endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + discard_plist_seq_show, sb); } return 0; put_feature_list_kobj: @@ -1329,6 +1423,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry("discard_plist_info", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 7b8f2b41c29b..c352fff88a5e 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -47,16 +47,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); @@ -85,16 +82,13 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + memcpy_to_page(page, offset_in_page(pos), buf, n); res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); if (res < 0) @@ -246,6 +240,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); return -EFSCORRUPTED; } if (buf_size) { @@ -262,13 +258,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c76c15086e5f..dc2e8637189e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -367,6 +367,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto out; } check: @@ -583,6 +585,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + 
f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto cleanup; } @@ -658,6 +662,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } @@ -684,6 +690,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino, ENTRY_SIZE(last)); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } last = XATTR_NEXT_ENTRY(last); diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 249825017da7..00235b8a1823 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -705,7 +705,7 @@ static int fat_readdir(struct file *file, struct dir_context *ctx) } #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ -static int func(struct dir_context *ctx, const char *name, int name_len, \ +static bool func(struct dir_context *ctx, const char *name, int name_len, \ loff_t offset, u64 ino, unsigned int d_type) \ { \ struct fat_ioctl_filldir_callback *buf = \ @@ -714,7 +714,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ struct dirent_type __user *d2 = d1 + 1; \ \ if (buf->result) \ - return -EINVAL; \ + return false; \ buf->result++; \ \ if (name != NULL) { \ @@ -750,10 +750,10 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ put_user(short_len, &d1->d_reclen)) \ goto efault; \ } \ - return 0; \ + return true; \ efault: \ buf->result = -EFAULT; \ - return -EFAULT; \ + return false; \ } FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) diff --git a/fs/fat/file.c b/fs/fat/file.c index 3e4eb3467cb4..8a6b493b5b5f 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -461,8 +461,9 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns, { umode_t allow_utime = sbi->options.allow_utime; - if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { - if (in_group_p(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + current_fsuid())) { + if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a38238d75c08..d99b8549ec8f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -194,11 +194,6 @@ static int fat_get_block(struct inode *inode, sector_t iblock, return 0; } -static int fat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, fat_get_block, wbc); -} - static int fat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -346,12 +341,12 @@ static const struct address_space_operations fat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = fat_read_folio, .readahead = fat_readahead, - .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, .write_end = fat_write_end, .direct_IO = fat_direct_IO, - .bmap = _fat_bmap + .bmap = _fat_bmap, + .migrate_folio = buffer_migrate_folio, }; /* @@ -523,7 +518,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { inode->i_generation &= ~1; diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index af191371c352..3626eb585a98 100644 
--- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -17,7 +17,7 @@ struct fat_fid { #define FAT_FID_SIZE_WITHOUT_PARENT 3 #define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) -/** +/* * Look up a directory inode given its starting cluster. */ static struct inode *fat_dget(struct super_block *sb, int i_logstart) @@ -135,7 +135,7 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, return type; } -/** +/* * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. */ diff --git a/fs/fhandle.c b/fs/fhandle.c index 6630c69c23a2..f2bc27d1975e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -14,7 +14,7 @@ #include "internal.h" #include "mount.h" -static long do_sys_name_to_handle(struct path *path, +static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, int __user *mnt_id) { diff --git a/fs/file.c b/fs/file.c index 3bcc1ecc314a..c942c89ca4cd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -980,6 +980,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret *ret_fd = fd; return file; } +EXPORT_SYMBOL(task_lookup_next_fd_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1002,7 +1003,16 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) struct files_struct *files = current->files; struct file *file; - if (atomic_read(&files->count) == 1) { + /* + * If another thread is concurrently calling close_fd() followed + * by put_files_struct(), we must not observe the old table + * entry combined with the new refcount - otherwise we could + * return a file that is concurrently being freed. + * + * atomic_read_acquire() pairs with atomic_dec_and_test() in + * put_files_struct(). + */ + if (atomic_read_acquire(&files->count) == 1) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; diff --git a/fs/file_table.c b/fs/file_table.c index 99c6796c9f28..dd88701e54a9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -324,12 +324,7 @@ static void __fput(struct file *file) } fops_put(file->f_op); put_pid(file->f_owner.pid); - if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_dec(inode); - if (mode & FMODE_WRITER) { - put_write_access(inode); - __mnt_drop_write(mnt); - } + put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 08a1993ab7fd..6fba5a52127b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -121,6 +121,7 @@ static bool inode_io_list_move_locked(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); list_move(&inode->i_io_list, head); @@ -280,6 +281,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; if (wb != &wb->bdi->wb) @@ -1129,6 +1131,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); @@ -1294,6 +1297,17 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC_QUEUED; + /* + * When the inode is being freed just 
don't bother with dirty list + tracking. Flush worker will ignore this inode anyway and it will + trigger assertions in inode_io_list_move_locked(). + */ + if (inode->i_state & I_FREEING) { + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); + return; + } if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1302,7 +1316,6 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); - inode->i_state &= ~I_SYNC_QUEUED; } static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) @@ -1345,8 +1358,6 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) return ret; } -#define EXPIRE_DIRTY_ATIME 0x0001 - /* * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. @@ -1712,15 +1723,28 @@ static int writeback_single_inode(struct inode *inode, wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* - * If the inode is now fully clean, then it can be safely removed from - * its writeback list (if any). Otherwise the flusher threads are - * responsible for the writeback lists. + * If the inode is being freed, its i_io_list shouldn't be updated + * as it can be finally deleted at this moment. */ - if (!(inode->i_state & I_DIRTY_ALL)) - inode_cgwb_move_to_attached(inode, wb); - else if (!(inode->i_state & I_SYNC_QUEUED) && - (inode->i_state & I_DIRTY)) - redirty_tail_locked(inode, wb); + if (!(inode->i_state & I_FREEING)) { + /* + * If the inode is now fully clean, then it can be safely + * removed from its writeback list (if any). Otherwise the + * flusher threads are responsible for the writeback lists. + */ + if (!(inode->i_state & I_DIRTY_ALL)) + inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED)) { + if ((inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + inode_io_list_move_locked(inode, + wb, + &wb->b_dirty_time); + } + } + } spin_unlock(&wb->list_lock); inode_sync_complete(inode); @@ -2370,6 +2394,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (flags & I_DIRTY_INODE) { /* + * Inode timestamp update will piggyback on this dirtying. + * We tell ->dirty_inode callback that timestamps need to + * be updated by setting I_DIRTY_TIME in flags. + */ + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + if (inode->i_state & I_DIRTY_TIME) { + inode->i_state &= ~I_DIRTY_TIME; + flags |= I_DIRTY_TIME; + } + spin_unlock(&inode->i_lock); + } + + /* * Notify the filesystem about the inode being dirtied, so that * (if needed) it can update on-disk fields and journal the * inode. This is only needed when the inode itself is being @@ -2378,7 +2416,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE); + sb->s_op->dirty_inode(inode, + flags & (I_DIRTY_INODE | I_DIRTY_TIME)); trace_writeback_dirty_inode(inode, flags); /* I_DIRTY_INODE supersedes I_DIRTY_TIME.
*/ @@ -2399,21 +2438,15 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if (((inode->i_state & flags) == flags) || - (dirtytime && (inode->i_state & I_DIRTY_INODE))) + if ((inode->i_state & flags) == flags) return; spin_lock(&inode->i_lock); - if (dirtytime && (inode->i_state & I_DIRTY_INODE)) - goto out_unlock_inode; if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; inode_attach_wb(inode, NULL); - /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ - if (flags & I_DIRTY_INODE) - inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; /* @@ -2486,7 +2519,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) out_unlock: if (wb) spin_unlock(&wb->list_lock); -out_unlock_inode: spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/fs/fs_parser.c b/fs/fs_parser.c index ed40ce5742fd..edb3712dcfa5 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -138,15 +138,16 @@ EXPORT_SYMBOL(__fs_parse); * @fc: The filesystem context to log errors through. * @param: The parameter. * @want_bdev: T if want a blockdev + * @flags: Pathwalk flags passed to filename_lookup() * @_path: The result of the lookup */ int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, + unsigned int flags, struct path *_path) { struct filename *f; - unsigned int flags = 0; bool put_f; int ret; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 451d8a077e12..bce2492186d0 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -605,6 +605,14 @@ again: set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); queue = true; } + /* + * We could race with cookie_lru which may set LRU_DISCARD bit + * but has yet to run the cookie state machine. If this happens + * and another thread tries to use the cookie, clear LRU_DISCARD + * so we don't end up withdrawing the cookie while in use. + */ + if (test_and_clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) + fscache_see_cookie(cookie, fscache_cookie_see_lru_discard_clear); break; case FSCACHE_COOKIE_STATE_FAILED: diff --git a/fs/fscache/io.c b/fs/fscache/io.c index 3af3b08a9bb3..0d2b8dec8f82 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -286,7 +286,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, * taken into account. */ - iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); fscache_write(cres, start, &iter, fscache_wreq_done, wreq); return; diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index a058e0136bfe..ab8ceddf9efa 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -203,7 +203,11 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key, struct fscache_volume *volume; struct fscache_cache *cache; size_t klen, hlen; - char *key; + u8 *key; + + klen = strlen(volume_key); + if (klen > NAME_MAX) + return NULL; if (!coherency_data) coherency_len = 0; @@ -229,7 +233,6 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key, /* Stick the length on the front of the key and pad it out to make * hashing easier. 
*/ - klen = strlen(volume_key); hlen = round_up(1 + klen + 1, sizeof(__le32)); key = kzalloc(hlen, GFP_KERNEL); if (!key) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..a4850aee2639 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -53,9 +53,10 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) return acl; } -int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); const char *name; int ret; @@ -98,7 +99,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index c7d882a9fe33..a06fbb1a8a5b 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) { struct fuse_dev *fud = file->private_data; struct cuse_conn *cc = fc_to_cc(fud->fc); - int rc; /* remove from the conntbl, no more access from this point on */ mutex_lock(&cuse_lock); @@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file) cdev_del(cc->cdev); } - rc = fuse_dev_release(inode, file); /* puts the base reference */ - - return rc; + return fuse_dev_release(inode, file); } static struct file_operations cuse_channel_fops; /* initialized during init */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 51897427a534..e8b60ce72c9a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -764,11 +764,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) return ncpy; } -static int fuse_check_page(struct page *page) +static int fuse_check_folio(struct folio *folio) { - if (page_mapcount(page) || - page->mapping != NULL || - (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + if (folio_mapped(folio) || + folio->mapping != NULL || + (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_uptodate | @@ -776,8 +776,9 @@ static int fuse_check_page(struct page *page) 1 << PG_active | 1 << PG_workingset | 1 << PG_reclaim | - 1 << PG_waiters))) { - dump_page(page, "fuse: trying to steal weird page"); + 1 << PG_waiters | + LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(&folio->page, "fuse: trying to steal weird page"); return 1; } return 0; @@ -786,11 +787,11 @@ static int fuse_check_page(struct page *page) static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; - struct page *oldpage = *pagep; - struct page *newpage; + struct folio *oldfolio = page_folio(*pagep); + struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; - get_page(oldpage); + folio_get(oldfolio); err = unlock_request(cs->req); if (err) goto out_put_old; @@ -813,35 +814,36 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (!pipe_buf_try_steal(cs->pipe, buf)) goto out_fallback; - newpage = buf->page; + newfolio = page_folio(buf->page); - if (!PageUptodate(newpage)) - SetPageUptodate(newpage); + if (!folio_test_uptodate(newfolio)) + folio_mark_uptodate(newfolio); - ClearPageMappedToDisk(newpage); + folio_clear_mappedtodisk(newfolio); - if (fuse_check_page(newpage) != 0) + if 
(fuse_check_folio(newfolio) != 0) goto out_fallback_unlock; /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it */ - if (WARN_ON(page_mapped(oldpage))) + if (WARN_ON(folio_mapped(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(page_has_private(oldpage))) + if (WARN_ON(folio_has_private(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + if (WARN_ON(folio_test_dirty(oldfolio) || + folio_test_writeback(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageMlocked(oldpage))) + if (WARN_ON(folio_test_mlocked(oldfolio))) goto out_fallback_unlock; - replace_page_cache_page(oldpage, newpage); + replace_page_cache_folio(oldfolio, newfolio); - get_page(newpage); + folio_get(newfolio); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(newpage); + folio_add_lru(newfolio); /* * Release while we have extra ref on stolen page. Otherwise @@ -854,28 +856,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = newpage; + *pagep = &newfolio->page; spin_unlock(&cs->req->waitq.lock); if (err) { - unlock_page(newpage); - put_page(newpage); + folio_unlock(newfolio); + folio_put(newfolio); goto out_put_old; } - unlock_page(oldpage); + folio_unlock(oldfolio); /* Drop ref for ap->pages[] array */ - put_page(oldpage); + folio_put(oldfolio); cs->len = 0; err = 0; out_put_old: /* Drop ref obtained in this function */ - put_page(oldpage); + folio_put(oldfolio); return err; out_fallback_unlock: - unlock_page(newpage); + folio_unlock(newfolio); out_fallback: cs->pg = buf->page; cs->offset = buf->offset; @@ -1497,7 +1499,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); kfree(buf); return err; @@ -1545,7 +1547,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); kfree(buf); return err; @@ -2266,8 +2268,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, * Check against file->f_op because CUSE * uses the same ioctl handler. 
*/ - if (old->f_op == file->f_op && - old->f_cred->user_ns == file->f_cred->user_ns) + if (old->f_op == file->f_op) fud = fuse_get_dev(old); if (fud) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b585b04e815e..cd1a071b625a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; @@ -529,7 +529,7 @@ out_err: */ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, - umode_t mode) + umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -573,7 +573,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = FUSE_CREATE; + args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); @@ -676,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode); + err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -802,6 +802,23 @@ static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } +static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + int err; + + if (fc->no_tmpfile) + return -EOPNOTSUPP; + + err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + if (err == -ENOSYS) { + fc->no_tmpfile = 1; + err = -EOPNOTSUPP; + } + return err; +} + static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -1153,7 +1170,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name) + u64 child_nodeid, struct qstr *name, u32 flags) { int err = -ENOTDIR; struct inode *parent; @@ -1180,7 +1197,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, goto unlock; fuse_dir_changed(parent); - fuse_invalidate_entry(entry); + if (!(flags & FUSE_EXPIRE_ONLY)) + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); @@ -1218,6 +1237,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, return err; } +static inline bool fuse_permissible_uidgid(struct fuse_conn *fc) +{ + const struct cred *cred = current_cred(); + + return (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)); +} + /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This @@ -1231,26 +1262,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. 
This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_current_process(struct fuse_conn *fc) +bool fuse_allow_current_process(struct fuse_conn *fc) { - const struct cred *cred; - - if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) - return 1; + bool allow; if (fc->allow_other) - return current_in_userns(fc->user_ns); + allow = current_in_userns(fc->user_ns); + else + allow = fuse_permissible_uidgid(fc); - cred = current_cred(); - if (uid_eq(cred->euid, fc->user_id) && - uid_eq(cred->suid, fc->user_id) && - uid_eq(cred->uid, fc->user_id) && - gid_eq(cred->egid, fc->group_id) && - gid_eq(cred->sgid, fc->group_id) && - gid_eq(cred->gid, fc->group_id)) - return 1; + if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + allow = true; - return 0; + return allow; } static int fuse_access(struct inode *inode, int mask) @@ -1913,11 +1937,12 @@ static const struct inode_operations fuse_dir_inode_operations = { .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, + .tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_acl = fuse_get_acl, + .get_inode_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, @@ -1939,7 +1964,7 @@ static const struct inode_operations fuse_common_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_acl = fuse_get_acl, + .get_inode_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1a3afd469e3a..875314ee6f59 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1313,7 +1313,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - should_remove_suid(file_dentry(file))) { + setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { goto writethrough; } @@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; + bool exclusive_lock = + !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || + iocb->ki_flags & IOCB_APPEND || + fuse_direct_write_extending_i_size(iocb, from); + + /* + * Take exclusive lock if + * - Parallel direct writes are disabled - a user space decision + * - Parallel direct writes are enabled and i_size is being extended. + * This might not be needed at all, but needs further investigation. + */ + if (exclusive_lock) + inode_lock(inode); + else { + inode_lock_shared(inode); + + /* A race with truncate might have come up as the decision for + * the lock type was done without holding the lock, check again. 
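[Aside] Because the shared/exclusive decision in fuse_direct_write_iter() is made before the inode lock is held, the code re-checks under the shared lock and upgrades by dropping and retaking it; there is no atomic upgrade. A pthread sketch of the same re-check pattern, with a POSIX rwlock standing in for inode_lock()/inode_lock_shared() (names invented):

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static long i_size = 4096;

static bool write_extends(long pos, long len) { return pos + len > i_size; }

static void locked_write(long pos, long len)
{
	/* Decision made without holding the lock... */
	bool exclusive = write_extends(pos, len);

	if (exclusive)
		pthread_rwlock_wrlock(&lock);
	else {
		pthread_rwlock_rdlock(&lock);
		/* ...so re-check after acquiring it: a racing truncate
		 * may have shrunk the file in the meantime. */
		if (write_extends(pos, len)) {
			pthread_rwlock_unlock(&lock);
			pthread_rwlock_wrlock(&lock);
			exclusive = true;
		}
	}

	if (exclusive && write_extends(pos, len))
		i_size = pos + len;	/* extend under the exclusive lock */

	pthread_rwlock_unlock(&lock);
}

int main(void) { locked_write(4000, 200); return 0; }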
+ */ + if (fuse_direct_write_extending_i_size(iocb, from)) { + inode_unlock_shared(inode); + inode_lock(inode); + exclusive_lock = true; + } + } - /* Don't allow parallel writes to the same file */ - inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - inode_unlock(inode); + if (exclusive_lock) + inode_unlock(inode); + else + inode_unlock_shared(inode); return res; } @@ -2931,6 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { fuse_write_update_attr(inode, pos, ret); + /* For extending writes we already hold exclusive lock */ if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } @@ -2963,11 +3000,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, .mode = mode }; int err; - bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || - (mode & (FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_ZERO_RANGE)); - - bool block_faults = FUSE_IS_DAX(inode) && lock_inode; + bool block_faults = FUSE_IS_DAX(inode) && + (!(mode & FALLOC_FL_KEEP_SIZE) || + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) @@ -2976,22 +3011,20 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fm->fc->no_fallocate) return -EOPNOTSUPP; - if (lock_inode) { - inode_lock(inode); - if (block_faults) { - filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); - if (err) - goto out; - } + inode_lock(inode); + if (block_faults) { + filemap_invalidate_lock(inode->i_mapping); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { - loff_t endbyte = offset + length - 1; + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { + loff_t endbyte = offset + length - 1; - err = fuse_writeback_range(inode, offset, endbyte); - if (err) - goto out; - } + err = fuse_writeback_range(inode, offset, endbyte); + if (err) + goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && @@ -3001,6 +3034,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; } + err = file_modified(file); + if (err) + goto out; + if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); @@ -3035,8 +3072,7 @@ out: if (block_faults) filemap_invalidate_unlock(inode->i_mapping); - if (lock_inode) - inode_unlock(inode); + inode_unlock(inode); fuse_flush_time_update(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 488b460e046f..c673faefdcb9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -784,6 +784,9 @@ struct fuse_conn { /* Does the filesystem support per inode DAX? */ unsigned int inode_dax:1; + /* Is tmpfile not implemented by fs? */ + unsigned int no_tmpfile:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -1176,7 +1179,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr); /** * Is current process allowed to perform filesystem operation? 
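[Aside] The rewritten fuse_allow_current_process() compares all three uid and gid triples against the mount owner, with allow_other and CAP_SYS_ADMIN as separate escapes. The core check can be sketched in userspace with getresuid()/getresgid() standing in for the kernel cred accessors (owner ids are parameters here; the capability and allow_other branches are omitted):

#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static bool permissible_uidgid(uid_t owner_uid, gid_t owner_gid)
{
	uid_t ruid, euid, suid;
	gid_t rgid, egid, sgid;

	getresuid(&ruid, &euid, &suid);
	getresgid(&rgid, &egid, &sgid);

	/* A suid/sgid process has a differing saved or effective id and
	 * is rejected even when started by the mount owner. */
	return ruid == owner_uid && euid == owner_uid && suid == owner_uid &&
	       rgid == owner_gid && egid == owner_gid && sgid == owner_gid;
}

int main(void)
{
	printf("%d\n", permissible_uidgid(getuid(), getgid()));
	return 0;
}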
*/ -int fuse_allow_current_process(struct fuse_conn *fc); +bool fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); @@ -1217,7 +1220,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, * then the dentry is unhashed (d_delete()). */ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name); + u64 child_nodeid, struct qstr *name, u32 flags); int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); @@ -1266,7 +1269,7 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); -int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); /* readdir.c */ diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 61d8afcb10a3..fcce94ace2c2 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -255,7 +255,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.in_pages = true; err = -EFAULT; - iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) @@ -324,7 +324,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; err = -EFAULT; - iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index b4e565711045..dc603479b30e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -77,8 +77,10 @@ static void fuse_add_dirent_to_cache(struct file *file, goto unlock; addr = kmap_local_page(page); - if (!offset) + if (!offset) { clear_page(addr); + SetPageUptodate(page); + } memcpy(addr + offset, dirent, reclen); kunmap_local(addr); fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; @@ -516,6 +518,12 @@ retry_locked: page = find_get_page_flags(file->f_mapping, index, FGP_ACCESSED | FGP_LOCK); + /* Page gone missing, then re-added to cache, but not initialized? */ + if (page && !PageUptodate(page)) { + unlock_page(page); + put_page(page); + page = NULL; + } spin_lock(&fi->rdc.lock); if (!page) { /* @@ -539,9 +547,9 @@ retry_locked: * Contents of the page are now protected against changing by holding * the page lock. 
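[Aside] The readdir cache fixes above pair an uptodate flag with the page lock: the writer publishes contents before setting the flag, and the reader treats a present-but-never-filled page as a miss. A toy cache slot showing the same discipline (a pthread mutex in place of the page lock; names invented):

#include <pthread.h>
#include <stdbool.h>
#include <string.h>

struct cache_slot {
	pthread_mutex_t lock;
	bool uptodate;		/* set only after contents are written */
	char data[64];
};

/* Writer: fill the slot, then publish it as uptodate. */
static void slot_fill(struct cache_slot *s, const char *src)
{
	pthread_mutex_lock(&s->lock);
	strncpy(s->data, src, sizeof(s->data) - 1);
	s->uptodate = true;
	pthread_mutex_unlock(&s->lock);
}

/* Reader: a slot that exists but was never filled counts as a miss,
 * mirroring the !PageUptodate() check added to the readdir cache. */
static bool slot_read(struct cache_slot *s, char *dst, size_t len)
{
	bool hit;

	pthread_mutex_lock(&s->lock);
	hit = s->uptodate;
	if (hit)
		strncpy(dst, s->data, len);
	pthread_mutex_unlock(&s->lock);
	return hit;
}

int main(void)
{
	struct cache_slot s = { PTHREAD_MUTEX_INITIALIZER, false, "" };
	char buf[64];

	if (!slot_read(&s, buf, sizeof(buf)))	/* miss: never filled */
		slot_fill(&s, "cached dirents");
	return slot_read(&s, buf, sizeof(buf)) ? 0 : 1;
}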
*/ - addr = kmap(page); + addr = kmap_local_page(page); res = fuse_parse_cache(ff, addr, size, ctx); - kunmap(page); + kunmap_local(addr); unlock_page(page); put_page(page); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 734d1f05d823..3dcde4912413 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -109,9 +109,10 @@ out: return error; } -int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; bool need_unlock = false; diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index cd180ca7c959..b8de8c148f5c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,7 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 05bee80ac7de..e782b4f1d104 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -427,8 +427,6 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) return error; kaddr = kmap_atomic(page); - if (dsize > gfs2_max_stuffed_size(ip)) - dsize = gfs2_max_stuffed_size(ip); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, PAGE_SIZE - dsize); kunmap_atomic(kaddr); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3bdb2c668a71..e7537fd305dd 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -61,9 +61,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, void *kaddr = kmap(page); u64 dsize = i_size_read(inode); - if (dsize > gfs2_max_stuffed_size(ip)) - dsize = gfs2_max_stuffed_size(ip); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, PAGE_SIZE - dsize); kunmap(page); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 756d05779200..cf40895233f5 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -66,7 +66,7 @@ struct get_name_filldir { char *name; }; -static int get_name_filldir(struct dir_context *ctx, const char *name, +static bool get_name_filldir(struct dir_context *ctx, const char *name, int length, loff_t offset, u64 inum, unsigned int type) { @@ -74,12 +74,12 @@ static int get_name_filldir(struct dir_context *ctx, const char *name, container_of(ctx, struct get_name_filldir, ctx); if (inum != gnfd->inum.no_addr) - return 0; + return true; memcpy(gnfd->name, name, length); gnfd->name[length] = 0; - return 1; + return false; } static int gfs2_get_name(struct dentry *parent, char *name, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 892006fbbb09..eea5be4fbf0e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1443,6 +1443,21 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); } +static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) +{ + struct gfs2_glock *gl = gfs2_glock_hold(fl_gh->gh_gl); + + /* + * Make sure gfs2_glock_put() won't sleep under the file->f_lock + * spinlock. 
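[Aside] __flock_holder_uninit() pins the glock with an extra reference so the put done under file->f_lock can never be the final, possibly sleeping one; the potentially-final put happens after the spinlock is dropped. The generic shape of that trick, sketched in userspace (non-atomic refcount, single-threaded demo only; names invented):

#include <pthread.h>
#include <stdlib.h>

struct obj { int refcount; };

static void obj_get(struct obj *o) { o->refcount++; }

static void obj_put(struct obj *o)
{
	if (--o->refcount == 0)
		free(o);		/* may block (free, I/O, ...) */
}

static void detach_under_lock(struct obj *o, pthread_spinlock_t *guard)
{
	obj_get(o);			/* pin: the put below can't be final */
	pthread_spin_lock(guard);
	obj_put(o);			/* safe: never reaches zero here */
	pthread_spin_unlock(guard);
	obj_put(o);			/* possibly-final put, lock dropped */
}

int main(void)
{
	pthread_spinlock_t guard;
	struct obj *o = malloc(sizeof(*o));

	pthread_spin_init(&guard, PTHREAD_PROCESS_PRIVATE);
	o->refcount = 1;
	detach_under_lock(o, &guard);
	return 0;
}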
+ */ + + spin_lock(&file->f_lock); + gfs2_holder_uninit(fl_gh); + spin_unlock(&file->f_lock); + gfs2_glock_put(gl); +} + static int do_flock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_file *fp = file->private_data; @@ -1455,7 +1470,9 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int sleeptime; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT; + flags = GL_EXACT | GL_NOPID; + if (!IS_SETLKW(cmd)) + flags |= LM_FLAG_TRY_1CB; mutex_lock(&fp->f_fl_mutex); @@ -1474,18 +1491,21 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) &gfs2_flock_glops, CREATE, &gl); if (error) goto out; + spin_lock(&file->f_lock); gfs2_holder_init(gl, state, flags, fl_gh); + spin_unlock(&file->f_lock); gfs2_glock_put(gl); } for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { error = gfs2_glock_nq(fl_gh); if (error != GLR_TRYFAILED) break; - fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; + fl_gh->gh_flags &= ~LM_FLAG_TRY_1CB; + fl_gh->gh_flags |= LM_FLAG_TRY; msleep(sleeptime); } if (error) { - gfs2_holder_uninit(fl_gh); + __flock_holder_uninit(file, fl_gh); if (error == GLR_TRYFAILED) error = -EAGAIN; } else { @@ -1507,7 +1527,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) locks_lock_file_wait(file, fl); if (gfs2_holder_initialized(fl_gh)) { gfs2_glock_dq(fl_gh); - gfs2_holder_uninit(fl_gh); + __flock_holder_uninit(file, fl_gh); } mutex_unlock(&fp->f_fl_mutex); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 41b6c89e4bf7..524f3c96b9a4 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -33,6 +33,9 @@ #include <linux/list_sort.h> #include <linux/lockref.h> #include <linux/rhashtable.h> +#include <linux/pid_namespace.h> +#include <linux/fdtable.h> +#include <linux/file.h> #include "gfs2.h" #include "incore.h" @@ -59,6 +62,8 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static void __gfs2_glock_dq(struct gfs2_holder *gh); +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay, bool remote); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; @@ -181,10 +186,11 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -void gfs2_glock_hold(struct gfs2_glock *gl) +struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); + return gl; } /** @@ -200,12 +206,6 @@ static int demote_ok(const struct gfs2_glock *gl) if (gl->gl_state == LM_ST_UNLOCKED) return 0; - /* - * Note that demote_ok is used for the lru process of disposing of - * glocks. For this purpose, we don't care if the glock's holders - * have the HIF_MAY_DEMOTE flag set or not. If someone is using - * them, don't demote. 
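[Aside] do_flock() retries a trylock-style request with doubling sleeps, downgrading from LM_FLAG_TRY_1CB to LM_FLAG_TRY after the first attempt. A compact userspace analogue of the backoff loop (pthread_mutex_trylock in place of gfs2_glock_nq; the flag downgrade is noted here but not modelled):

#include <errno.h>
#include <pthread.h>
#include <time.h>

/* Try a few times with doubling sleeps (1ms, 2ms, 4ms), like the
 * sleeptime <<= 1 loop in do_flock(); give up with -EAGAIN after that. */
static int lock_with_backoff(pthread_mutex_t *m)
{
	for (int sleep_ms = 1; sleep_ms <= 4; sleep_ms <<= 1) {
		if (pthread_mutex_trylock(m) == 0)
			return 0;
		struct timespec ts = { 0, sleep_ms * 1000000L };
		nanosleep(&ts, NULL);
	}
	return -EAGAIN;
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	return lock_with_backoff(&m);
}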
- */ if (!list_empty(&gl->gl_holders)) return 0; if (glops->go_demote_ok) @@ -388,7 +388,7 @@ static void do_error(struct gfs2_glock *gl, const int ret) struct gfs2_holder *gh, *tmp; list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - if (!test_bit(HIF_WAIT, &gh->gh_iflags)) + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (ret & LM_OUT_ERROR) gh->gh_error = -EIO; @@ -403,45 +403,6 @@ static void do_error(struct gfs2_glock *gl, const int ret) } /** - * demote_incompat_holders - demote incompatible demoteable holders - * @gl: the glock we want to promote - * @current_gh: the newly promoted holder - * - * We're passing the newly promoted holder in @current_gh, but actually, any of - * the strong holders would do. - */ -static void demote_incompat_holders(struct gfs2_glock *gl, - struct gfs2_holder *current_gh) -{ - struct gfs2_holder *gh, *tmp; - - /* - * Demote incompatible holders before we make ourselves eligible. - * (This holder may or may not allow auto-demoting, but we don't want - * to demote the new holder before it's even granted.) - */ - list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - /* - * Since holders are at the front of the list, we stop when we - * find the first non-holder. - */ - if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) - return; - if (gh == current_gh) - continue; - if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) && - !may_grant(gl, current_gh, gh)) { - /* - * We should not recurse into do_promote because - * __gfs2_glock_dq only calls handle_callback, - * gfs2_glock_add_to_lru and __gfs2_glock_queue_work. - */ - __gfs2_glock_dq(gh); - } - } -} - -/** * find_first_holder - find the first "holder" gh * @gl: the glock */ @@ -459,26 +420,6 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) return NULL; } -/** - * find_first_strong_holder - find the first non-demoteable holder - * @gl: the glock - * - * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set. 
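[Aside] With HIF_MAY_DEMOTE gone, the holder-list invariant is simple again: granted holders sit at the front and waiters follow, so do_promote() only needs the first granted holder to decide compatibility. A heavily simplified grant pass over such a list (two lock modes only; instantiation, LM_FLAG_* handling, and error paths omitted):

#include <stdbool.h>
#include <stddef.h>

enum mode { UNLOCKED, SHARED, EXCLUSIVE };

struct holder {
	enum mode want;
	bool granted;			/* HIF_HOLDER stand-in */
	struct holder *next;
};

static bool compatible(enum mode a, enum mode b)
{
	return a == SHARED && b == SHARED;
}

/* One promotion pass: skip current holders, grant each waiter that is
 * compatible with the first granted holder, stop at the first conflict. */
static void promote(struct holder *head)
{
	struct holder *current = (head && head->granted) ? head : NULL;

	for (struct holder *h = head; h; h = h->next) {
		if (h->granted)
			continue;
		if (current && !compatible(current->want, h->want))
			return;
		h->granted = true;
		if (!current)
			current = h;
	}
}

int main(void)
{
	struct holder c = { SHARED, false, NULL };
	struct holder b = { SHARED, false, &c };
	struct holder a = { SHARED, true, &b };

	promote(&a);
	return b.granted && c.granted ? 0 : 1;
}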
- */ -static inline struct gfs2_holder * -find_first_strong_holder(struct gfs2_glock *gl) -{ - struct gfs2_holder *gh; - - list_for_each_entry(gh, &gl->gl_holders, gh_list) { - if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) - return NULL; - if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) - return gh; - } - return NULL; -} - /* * gfs2_instantiate - Call the glops instantiate function * @gh: The glock holder @@ -535,9 +476,8 @@ done: static int do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; - bool incompat_holders_demoted = false; - current_gh = find_first_strong_holder(gl); + current_gh = find_first_holder(gl); list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; @@ -556,11 +496,8 @@ static int do_promote(struct gfs2_glock *gl) set_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_promote(gh); gfs2_holder_wake(gh); - if (!incompat_holders_demoted) { + if (!current_gh) current_gh = gh; - demote_incompat_holders(gl, current_gh); - incompat_holders_demoted = true; - } } return 0; } @@ -730,7 +667,8 @@ static bool is_system_glock(struct gfs2_glock *gl) * */ -static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, + unsigned int target) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { @@ -741,7 +679,8 @@ __acquires(&gl->gl_lockref.lock) if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) - return; + goto skip_inval; + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); GLOCK_BUG_ON(gl, gl->gl_state == target); @@ -826,6 +765,20 @@ skip_inval: (target != LM_ST_UNLOCKED || test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { if (!is_system_glock(gl)) { + handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */ + /* + * Ordinarily, we would call dlm and its callback would call + * finish_xmote, which would call state_change() to the new state. + * Since we withdrew, we won't call dlm, so call state_change + * manually, but to the UNLOCKED state we desire. + */ + state_change(gl, LM_ST_UNLOCKED); + /* + * We skip telling dlm to do the locking, so we won't get a + * reply that would otherwise clear GLF_LOCK. So we clear it here. 
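[Aside] The withdraw branch of do_xmote() bypasses dlm entirely, so nothing will ever run the reply path; the code therefore performs the state change inline and clears the in-progress bits itself. The same "backend down, complete inline" shape in miniature (all names invented):

#include <stdbool.h>

enum state { LOCKED, UNLOCKED };

struct lock {
	enum state state;
	bool in_progress;	/* GLF_LOCK stand-in, cleared on reply */
};

/* Normally the backend's reply runs this completion for us. */
static void completion(struct lock *l, enum state new_state)
{
	l->state = new_state;
	l->in_progress = false;
}

static void demote(struct lock *l, bool backend_down)
{
	l->in_progress = true;
	if (backend_down) {
		/* No reply will ever come, so run the completion inline
		 * with the state we wanted, as do_xmote() does on
		 * withdraw. */
		completion(l, UNLOCKED);
		return;
	}
	/* Otherwise the request would be submitted to the backend and
	 * its reply path would invoke completion() asynchronously. */
}

int main(void)
{
	struct lock l = { LOCKED, false };

	demote(&l, true);
	return l.state == UNLOCKED && !l.in_progress ? 0 : 1;
}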
+ */ + clear_bit(GLF_LOCK, &gl->gl_flags); + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); goto out; } else { @@ -906,6 +859,48 @@ out_unlock: return; } +/** + * glock_set_object - set the gl_object field of a glock + * @gl: the glock + * @object: the object + */ +void glock_set_object(struct gfs2_glock *gl, void *object) +{ + void *prev_object; + + spin_lock(&gl->gl_lockref.lock); + prev_object = gl->gl_object; + gl->gl_object = object; + spin_unlock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) { + pr_warn("glock=%u/%llx\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + gfs2_dump_glock(NULL, gl, true); + } +} + +/** + * glock_clear_object - clear the gl_object field of a glock + * @gl: the glock + */ +void glock_clear_object(struct gfs2_glock *gl, void *object) +{ + void *prev_object; + + spin_lock(&gl->gl_lockref.lock); + prev_object = gl->gl_object; + gl->gl_object = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, + prev_object == object || prev_object == NULL)) { + pr_warn("glock=%u/%llx\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + gfs2_dump_glock(NULL, gl, true); + } +} + void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) { struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; @@ -959,8 +954,6 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { - struct gfs2_glock *inode_gl = NULL; - gl->gl_no_formal_ino = ip->i_no_formal_ino; set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); d_prune_aliases(&ip->i_inode); @@ -970,14 +963,14 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; if (ip) { - inode_gl = ip->i_gl; - lockref_get(&inode_gl->gl_lockref); clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + if (!igrab(&ip->i_inode)) + ip = NULL; } spin_unlock(&gl->gl_lockref.lock); - if (inode_gl) { - gfs2_glock_poke(inode_gl); - gfs2_glock_put(inode_gl); + if (ip) { + gfs2_glock_poke(ip->i_gl); + iput(&ip->i_inode); } evicted = !ip; } @@ -1023,7 +1016,11 @@ static void delete_work_func(struct work_struct *work) inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, GFS2_BLKST_UNLINKED); - if (!IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -EAGAIN && + (gfs2_queue_delete_work(gl, 5 * HZ))) + return; + } else { d_prune_aliases(inode); iput(inode); } @@ -1233,13 +1230,12 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh, unsigned long ip) { INIT_LIST_HEAD(&gh->gh_list); - gh->gh_gl = gl; + gh->gh_gl = gfs2_glock_hold(gl); gh->gh_ip = ip; gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; - gfs2_glock_hold(gl); } /** @@ -1436,6 +1432,15 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 
va_end(args); } +static inline bool pid_is_meaningful(const struct gfs2_holder *gh) +{ + if (!(gh->gh_flags & GL_NOPID)) + return true; + if (gh->gh_state == LM_ST_UNLOCKED) + return true; + return false; +} + /** * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add @@ -1464,7 +1469,7 @@ __acquires(&gl->gl_lockref.lock) if (test_bit(GLF_LOCK, &gl->gl_flags)) { struct gfs2_holder *current_gh; - current_gh = find_first_strong_holder(gl); + current_gh = find_first_holder(gl); try_futile = !may_grant(gl, current_gh, gh); } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) @@ -1472,10 +1477,15 @@ __acquires(&gl->gl_lockref.lock) } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { - if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && - (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) && - !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))) - goto trap_recursive; + if (likely(gh2->gh_owner_pid != gh->gh_owner_pid)) + continue; + if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) + continue; + if (!pid_is_meaningful(gh2)) + continue; + goto trap_recursive; + } + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (try_futile && !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { fail: @@ -1580,69 +1590,28 @@ static inline bool needs_demote(struct gfs2_glock *gl) static void __gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned delay = 0; int fast_path = 0; /* - * This while loop is similar to function demote_incompat_holders: - * If the glock is due to be demoted (which may be from another node - * or even if this holder is GL_NOCACHE), the weak holders are - * demoted as well, allowing the glock to be demoted. + * This holder should not be cached, so mark it for demote. + * Note: this should be done before the check for needs_demote + * below. */ - while (gh) { - /* - * If we're in the process of file system withdraw, we cannot - * just dequeue any glocks until our journal is recovered, lest - * we introduce file system corruption. We need two exceptions - * to this rule: We need to allow unlocking of nondisk glocks - * and the glock for our own journal that needs recovery. - */ - if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && - glock_blocked_by_withdraw(gl) && - gh->gh_gl != sdp->sd_jinode_gl) { - sdp->sd_glock_dqs_held++; - spin_unlock(&gl->gl_lockref.lock); - might_sleep(); - wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, - TASK_UNINTERRUPTIBLE); - spin_lock(&gl->gl_lockref.lock); - } - - /* - * This holder should not be cached, so mark it for demote. - * Note: this should be done before the check for needs_demote - * below. - */ - if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0, false); - - list_del_init(&gh->gh_list); - clear_bit(HIF_HOLDER, &gh->gh_iflags); - trace_gfs2_glock_queue(gh, 0); + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); - /* - * If there hasn't been a demote request we are done. - * (Let the remaining holders, if any, keep holding it.) - */ - if (!needs_demote(gl)) { - if (list_empty(&gl->gl_holders)) - fast_path = 1; - break; - } - /* - * If we have another strong holder (we cannot auto-demote) - * we are done. It keeps holding it until it is done. 
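[Aside] The reworked add_to_queue() loop traps a process queueing a second holder for a glock it already holds, skipping flocks and GL_NOPID holders whose pid carries no meaning. A debug-lock sketch of the same owner check using thread identity (assert in place of the kernel's BUG path; not race-free, illustration only):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

struct dbg_lock {
	pthread_mutex_t mutex;
	pthread_t owner;
	bool owned;
};

/* Trap recursive acquisition by the same thread, like the
 * gh_owner_pid comparison in add_to_queue(). */
static void dbg_lock_acquire(struct dbg_lock *l)
{
	assert(!(l->owned && pthread_equal(l->owner, pthread_self())));
	pthread_mutex_lock(&l->mutex);
	l->owner = pthread_self();
	l->owned = true;
}

static void dbg_lock_release(struct dbg_lock *l)
{
	l->owned = false;
	pthread_mutex_unlock(&l->mutex);
}

int main(void)
{
	struct dbg_lock l = { PTHREAD_MUTEX_INITIALIZER, 0, false };

	dbg_lock_acquire(&l);
	dbg_lock_release(&l);
	return 0;
}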
- */ - if (find_first_strong_holder(gl)) - break; + list_del_init(&gh->gh_list); + clear_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_glock_queue(gh, 0); - /* - * If we have a weak holder at the head of the list, it - * (and all others like it) must be auto-demoted. If there - * are no more weak holders, we exit the while loop. - */ - gh = find_first_holder(gl); + /* + * If there hasn't been a demote request we are done. + * (Let the remaining holders, if any, keep holding it.) + */ + if (!needs_demote(gl)) { + if (list_empty(&gl->gl_holders)) + fast_path = 1; } if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) @@ -1666,8 +1635,17 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh) void gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; spin_lock(&gl->gl_lockref.lock); + if (!gfs2_holder_queued(gh)) { + /* + * May have already been dequeued because the locking request + * was GL_ASYNC and it has failed in the meantime. + */ + goto out; + } + if (list_is_first(&gh->gh_list, &gl->gl_holders) && !test_bit(HIF_HOLDER, &gh->gh_iflags)) { spin_unlock(&gl->gl_lockref.lock); @@ -1676,7 +1654,26 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_lock(&gl->gl_lockref.lock); } + /* + * If we're in the process of file system withdraw, we cannot just + * dequeue any glocks until our journal is recovered, lest we introduce + * file system corruption. We need two exceptions to this rule: We need + * to allow unlocking of nondisk glocks and the glock for our own + * journal that needs recovery. + */ + if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && + glock_blocked_by_withdraw(gl) && + gh->gh_gl != sdp->sd_jinode_gl) { + sdp->sd_glock_dqs_held++; + spin_unlock(&gl->gl_lockref.lock); + might_sleep(); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, + TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + __gfs2_glock_dq(gh); +out: spin_unlock(&gl->gl_lockref.lock); } @@ -1849,33 +1846,6 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) delay = gl->gl_hold_time; } - /* - * Note 1: We cannot call demote_incompat_holders from handle_callback - * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq -> - * handle_callback -> demote_incompat_holders -> gfs2_glock_dq - * Plus, we only want to demote the holders if the request comes from - * a remote cluster node because local holder conflicts are resolved - * elsewhere. - * - * Note 2: if a remote node wants this glock in EX mode, lock_dlm will - * request that we set our state to UNLOCKED. Here we mock up a holder - * to make it look like someone wants the lock EX locally. Any SH - * and DF requests should be able to share the lock without demoting. - * - * Note 3: We only want to demote the demoteable holders when there - * are no more strong holders. The demoteable holders might as well - * keep the glock until the last strong holder is done with it. - */ - if (!find_first_strong_holder(gl)) { - struct gfs2_holder mock_gh = { - .gh_gl = gl, - .gh_state = (state == LM_ST_UNLOCKED) ? 
- LM_ST_EXCLUSIVE : state, - .gh_iflags = BIT(HIF_HOLDER) - }; - - demote_incompat_holders(gl, &mock_gh); - } handle_callback(gl, state, delay, true); __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); @@ -2194,6 +2164,20 @@ static void dump_glock_func(struct gfs2_glock *gl) dump_glock(NULL, gl, true); } +static void withdraw_dq(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_lockref.lock); + if (!__lockref_is_dead(&gl->gl_lockref) && + glock_blocked_by_withdraw(gl)) + do_error(gl, LM_OUT_ERROR); /* remove pending waiters */ + spin_unlock(&gl->gl_lockref.lock); +} + +void gfs2_gl_dq_holders(struct gfs2_sbd *sdp) +{ + glock_hash_walk(withdraw_dq, sdp); +} + /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem @@ -2253,8 +2237,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; - if (test_bit(HIF_MAY_DEMOTE, &iflags)) - *p++ = 'D'; if (flags & GL_SKIP) *p++ = 's'; *p = 0; @@ -2272,19 +2254,24 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh, const char *fs_id_buf) { - struct task_struct *gh_owner = NULL; + const char *comm = "(none)"; + pid_t owner_pid = 0; char flags_buf[32]; rcu_read_lock(); - if (gh->gh_owner_pid) + if (pid_is_meaningful(gh)) { + struct task_struct *gh_owner; + + comm = "(ended)"; + owner_pid = pid_nr(gh->gh_owner_pid); gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); + if (gh_owner) + comm = gh_owner->comm; + } gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n", fs_id_buf, state2str(gh->gh_state), hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), - gh->gh_error, - gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, - gh_owner ? 
gh_owner->comm : "(ended)", - (void *)gh->gh_ip); + gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip); rcu_read_unlock(); } @@ -2699,6 +2686,172 @@ static const struct file_operations gfs2_glstats_fops = { .release = gfs2_glocks_release, }; +struct gfs2_glockfd_iter { + struct super_block *sb; + unsigned int tgid; + struct task_struct *task; + unsigned int fd; + struct file *file; +}; + +static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct pid *pid; + + if (i->task) + put_task_struct(i->task); + + rcu_read_lock(); +retry: + i->task = NULL; + pid = find_ge_pid(i->tgid, ns); + if (pid) { + i->tgid = pid_nr_ns(pid, ns); + i->task = pid_task(pid, PIDTYPE_TGID); + if (!i->task) { + i->tgid++; + goto retry; + } + get_task_struct(i->task); + } + rcu_read_unlock(); + return i->task; +} + +static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) +{ + if (i->file) { + fput(i->file); + i->file = NULL; + } + + rcu_read_lock(); + for(;; i->fd++) { + struct inode *inode; + + i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + if (!i->file) { + i->fd = 0; + break; + } + inode = file_inode(i->file); + if (inode->i_sb != i->sb) + continue; + if (get_file_rcu(i->file)) + break; + } + rcu_read_unlock(); + return i->file; +} + +static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (*pos) + return NULL; + while (gfs2_glockfd_next_task(i)) { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } + return NULL; +} + +static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + (*pos)++; + i->fd++; + do { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } while (gfs2_glockfd_next_task(i)); + return NULL; +} + +static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (i->file) + fput(i->file); + if (i->task) + put_task_struct(i->task); +} + +static void gfs2_glockfd_seq_show_flock(struct seq_file *seq, + struct gfs2_glockfd_iter *i) +{ + struct gfs2_file *fp = i->file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED }; + + if (!READ_ONCE(fl_gh->gh_gl)) + return; + + spin_lock(&i->file->f_lock); + if (gfs2_holder_initialized(fl_gh)) + gl_name = fl_gh->gh_gl->gl_name; + spin_unlock(&i->file->f_lock); + + if (gl_name.ln_type != LM_TYPE_RESERVED) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl_name.ln_type, + (unsigned long long)gl_name.ln_number); + } +} + +static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + struct inode *inode = file_inode(i->file); + struct gfs2_glock *gl; + + inode_lock_shared(inode); + gl = GFS2_I(inode)->i_iopen_gh.gh_gl; + if (gl) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + } + gfs2_glockfd_seq_show_flock(seq, i); + inode_unlock_shared(inode); + return 0; +} + +static const struct seq_operations gfs2_glockfd_seq_ops = { + .start = gfs2_glockfd_seq_start, + .next = gfs2_glockfd_seq_next, + .stop = gfs2_glockfd_seq_stop, + .show = gfs2_glockfd_seq_show, +}; + +static int gfs2_glockfd_open(struct inode *inode, struct file *file) +{ + struct gfs2_glockfd_iter *i; + struct gfs2_sbd *sdp = inode->i_private; + + i = 
__seq_open_private(file, &gfs2_glockfd_seq_ops, + sizeof(struct gfs2_glockfd_iter)); + if (!i) + return -ENOMEM; + i->sb = sdp->sd_vfs; + return 0; +} + +static const struct file_operations gfs2_glockfd_fops = { + .owner = THIS_MODULE, + .open = gfs2_glockfd_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) @@ -2708,6 +2861,9 @@ void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glocks_fops); + debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glockfd_fops); + debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glstats_fops); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 5aed8b500cf5..f37ac087e2c1 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -91,6 +91,7 @@ enum { #define GL_ASYNC 0x0040 #define GL_EXACT 0x0080 #define GL_SKIP 0x0100 +#define GL_NOPID 0x0200 #define GL_NOCACHE 0x0400 /* @@ -155,8 +156,6 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) break; - if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) - continue; if (gh->gh_owner_pid == pid) goto out; } @@ -195,7 +194,7 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); -extern void gfs2_glock_hold(struct gfs2_glock *gl); +extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); extern void gfs2_glock_queue_put(struct gfs2_glock *gl); @@ -274,6 +273,7 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); extern void gfs2_glock_free(struct gfs2_glock *gl); @@ -286,6 +286,9 @@ extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); extern void gfs2_register_debugfs(void); extern void gfs2_unregister_debugfs(void); +extern void glock_set_object(struct gfs2_glock *gl, void *object); +extern void glock_clear_object(struct gfs2_glock *gl, void *object); + extern const struct lm_lockops gfs2_dlm_ops; static inline void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh) @@ -303,64 +306,6 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh) return !list_empty(&gh->gh_list); } -/** - * glock_set_object - set the gl_object field of a glock - * @gl: the glock - * @object: the object - */ -static inline void glock_set_object(struct gfs2_glock *gl, void *object) -{ - spin_lock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL)) - gfs2_dump_glock(NULL, gl, true); - gl->gl_object = object; - spin_unlock(&gl->gl_lockref.lock); -} - -/** - * glock_clear_object - clear the gl_object field of a glock - * @gl: the glock - * @object: the object - * - * I'd love to similarly add this: - * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object)) - * gfs2_dump_glock(NULL, gl, true); - * Unfortunately, that's not possible 
because as soon as gfs2_delete_inode - * frees the block in the rgrp, another process can reassign it for an I_NEW - * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget. - * That means gfs2_delete_inode may subsequently try to call this function - * for a glock that's already pointing to a brand new inode. If we clear the - * new inode's gl_object, we'll introduce metadata corruption. Function - * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also - * tries to clear gl_object, so it's more than just gfs2_delete_inode. - * - */ -static inline void glock_clear_object(struct gfs2_glock *gl, void *object) -{ - spin_lock(&gl->gl_lockref.lock); - if (gl->gl_object == object) - gl->gl_object = NULL; - spin_unlock(&gl->gl_lockref.lock); -} - -static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - spin_lock(&gl->gl_lockref.lock); - set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); - spin_unlock(&gl->gl_lockref.lock); -} - -static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - spin_lock(&gl->gl_lockref.lock); - clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); - spin_unlock(&gl->gl_lockref.lock); -} - extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 49210a2e7ce7..d78b61ecc1cd 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -397,38 +397,39 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) struct timespec64 atime; u16 height, depth; umode_t mode = be32_to_cpu(str->di_mode); - bool is_new = ip->i_inode.i_state & I_NEW; + struct inode *inode = &ip->i_inode; + bool is_new = inode->i_state & I_NEW; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) goto corrupt; - if (unlikely(!is_new && inode_wrong_type(&ip->i_inode, mode))) + if (unlikely(!is_new && inode_wrong_type(inode, mode))) goto corrupt; ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); - ip->i_inode.i_mode = mode; + inode->i_mode = mode; if (is_new) { - ip->i_inode.i_rdev = 0; + inode->i_rdev = 0; switch (mode & S_IFMT) { case S_IFBLK: case S_IFCHR: - ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), - be32_to_cpu(str->di_minor)); + inode->i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); break; } } - i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); - i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); - set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); - i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); - gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); + i_uid_write(inode, be32_to_cpu(str->di_uid)); + i_gid_write(inode, be32_to_cpu(str->di_gid)); + set_nlink(inode, be32_to_cpu(str->di_nlink)); + i_size_write(inode, be64_to_cpu(str->di_size)); + gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); - if (timespec64_compare(&ip->i_inode.i_atime, &atime) < 0) - ip->i_inode.i_atime = atime; - ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); - ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); - ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); - ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + if (timespec64_compare(&inode->i_atime, &atime) < 0) + inode->i_atime = atime; + inode->i_mtime.tv_sec = 
be64_to_cpu(str->di_mtime); + inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime); + inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); ip->i_goal = be64_to_cpu(str->di_goal_meta); ip->i_generation = be64_to_cpu(str->di_generation); @@ -436,7 +437,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_diskflags = be32_to_cpu(str->di_flags); ip->i_eattr = be64_to_cpu(str->di_eattr); /* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */ - gfs2_set_inode_flags(&ip->i_inode); + gfs2_set_inode_flags(inode); height = be16_to_cpu(str->di_height); if (unlikely(height > GFS2_MAX_META_HEIGHT)) goto corrupt; @@ -448,8 +449,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_depth = (u8)depth; ip->i_entries = be32_to_cpu(str->di_entries); - if (S_ISREG(ip->i_inode.i_mode)) - gfs2_set_aops(&ip->i_inode); + if (gfs2_is_stuffed(ip) && inode->i_size > gfs2_max_stuffed_size(ip)) + goto corrupt; + + if (S_ISREG(inode->i_mode)) + gfs2_set_aops(inode); return 0; corrupt: diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d09d9892cd05..c26765080f28 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -252,7 +252,6 @@ struct gfs2_lkstats { enum { /* States */ - HIF_MAY_DEMOTE = 1, HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ HIF_WAIT = 10, }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c8ec876f33ea..614db3055c02 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -130,6 +130,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; + int extra_flags = 0; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); @@ -141,9 +142,17 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; - if (blktype != GFS2_BLKST_UNLINKED) + /* + * The only caller that sets @blktype to GFS2_BLKST_UNLINKED is + * delete_work_func(). Make sure not to cancel the delete work + * from within itself here. + */ + if (blktype == GFS2_BLKST_UNLINKED) + extra_flags |= LM_FLAG_TRY; + else gfs2_cancel_delete_work(io_gl); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, + GL_EXACT | GL_NOPID | extra_flags, &ip->i_iopen_gh); gfs2_glock_put(io_gl); if (unlikely(error)) @@ -210,6 +219,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail: + if (error == GLR_TRYFAILED) + error = -EAGAIN; if (gfs2_holder_initialized(&ip->i_iopen_gh)) gfs2_glock_dq_uninit(&ip->i_iopen_gh); if (gfs2_holder_initialized(&i_gh)) @@ -397,12 +408,17 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) goto out_ipreserv; error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); + if (error) + goto out_trans_end; + ip->i_no_formal_ino = ip->i_generation; ip->i_inode.i_ino = ip->i_no_addr; ip->i_goal = ip->i_no_addr; + if (*dblocks > 1) + ip->i_eattr = ip->i_no_addr + 1; +out_trans_end: gfs2_trans_end(sdp); - out_ipreserv: gfs2_inplace_release(ip); out_quota: @@ -580,6 +596,12 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, * @size: The initial size of the inode (ignored for directories) * @excl: Force fail if inode exists * + * FIXME: Change to allocate the disk blocks and write them out in the same + * transaction. 
That way, we can no longer end up in a situation in which an + * inode is allocated, the node crashes, and the block looks like a valid + * inode. (With atomic creates in place, we will also no longer need to zero + * the link count and dirty the inode here on failure.) + * * Returns: 0 on success, or error code */ @@ -590,12 +612,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, { const struct qstr *name = &dentry->d_name; struct posix_acl *default_acl, *acl; - struct gfs2_holder ghs[2]; + struct gfs2_holder d_gh, gh; struct inode *inode = NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; - int error, free_vfs_inode = 1; + int error; u32 aflags = 0; unsigned blocks = 1; struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; @@ -611,10 +633,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail; - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh); if (error) goto fail; - gfs2_holder_mark_uninitialized(ghs + 1); + gfs2_holder_mark_uninitialized(&gh); error = create_ok(dip, name, mode); if (error) @@ -636,7 +658,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, else error = finish_no_open(file, NULL); } - gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(&d_gh); goto fail; } else if (error != -ENOENT) { goto fail_gunlock; @@ -650,12 +672,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = -ENOMEM; if (!inode) goto fail_gunlock; + ip = GFS2_I(inode); error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) goto fail_gunlock; - ip = GFS2_I(inode); error = gfs2_qa_get(ip); if (error) goto fail_free_acls; @@ -717,14 +739,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_free_inode; gfs2_cancel_delete_work(io_gl); +retry: error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); - BUG_ON(error); + if (error == -EBUSY) + goto retry; + if (error) + goto fail_gunlock2; - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID, + &ip->i_iopen_gh); if (error) goto fail_gunlock2; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (error) goto fail_gunlock3; @@ -732,10 +759,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - if (blocks > 1) { - ip->i_eattr = ip->i_no_addr + 1; + if (blocks > 1) gfs2_init_xattr(ip); - } init_dinode(dip, ip, symname); gfs2_trans_end(sdp); @@ -743,9 +768,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, glock_set_object(io_gl, ip); gfs2_set_iop(inode); - free_vfs_inode = 0; /* After this point, the inode is no longer - considered free. Any failures need to undo - the gfs2 structures. 
*/ if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) @@ -778,9 +800,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, file->f_mode |= FMODE_CREATED; error = finish_open(file, dentry, gfs2_open_common); } - gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(&d_gh); gfs2_qa_put(ip); - gfs2_glock_dq_uninit(ghs + 1); + gfs2_glock_dq_uninit(&gh); gfs2_glock_put(io_gl); gfs2_qa_put(dip); unlock_new_inode(inode); @@ -794,10 +816,6 @@ fail_gunlock3: fail_gunlock2: gfs2_glock_put(io_gl); fail_free_inode: - if (ip->i_gl) { - if (free_vfs_inode) /* else evict will do the put for us */ - gfs2_glock_put(ip->i_gl); - } gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: @@ -805,20 +823,19 @@ fail_free_acls: posix_acl_release(acl); fail_gunlock: gfs2_dir_no_add(&da); - gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(&d_gh); if (!IS_ERR_OR_NULL(inode)) { + set_bit(GIF_ALLOC_FAILED, &ip->i_flags); clear_nlink(inode); - if (!free_vfs_inode) + if (ip->i_no_addr) mark_inode_dirty(inode); - set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED, - &GFS2_I(inode)->i_flags); if (inode->i_state & I_NEW) iget_failed(inode); else iput(inode); } - if (gfs2_holder_initialized(ghs + 1)) - gfs2_glock_dq_uninit(ghs + 1); + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); fail: gfs2_qa_put(dip); return error; @@ -1990,7 +2007,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); } @@ -2142,7 +2159,7 @@ static const struct inode_operations gfs2_file_iops = { .getattr = gfs2_getattr, .listxattr = gfs2_listxattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, + .get_inode_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .fileattr_get = gfs2_fileattr_get, @@ -2164,7 +2181,7 @@ static const struct inode_operations gfs2_dir_iops = { .getattr = gfs2_getattr, .listxattr = gfs2_listxattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, + .get_inode_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .atomic_open = gfs2_atomic_open, diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 6ce369b096d4..71911bf9ab34 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1302,7 +1302,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) memcpy(cluster, table, strlen(table) - strlen(fsname)); fsname++; - flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + flags = DLM_LSFL_NEWEXCL; /* * create/join lockspace diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 14ae9de76277..afcb32854f14 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -151,14 +151,6 @@ static int __init init_gfs2_fs(void) if (error) goto fail_shrinker; - error = register_filesystem(&gfs2_fs_type); - if (error) - goto fail_fs1; - - error = register_filesystem(&gfs2meta_fs_type); - if (error) - goto fail_fs2; - error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); @@ -180,11 +172,23 @@ static int __init init_gfs2_fs(void) goto fail_mempool; gfs2_register_debugfs(); + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail_fs1; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_fs2; + pr_info("GFS2 installed\n"); return 0; +fail_fs2: + unregister_filesystem(&gfs2_fs_type); +fail_fs1: + 
mempool_destroy(gfs2_page_pool); fail_mempool: destroy_workqueue(gfs2_freeze_wq); fail_wq3: @@ -192,10 +196,6 @@ fail_wq3: fail_wq2: destroy_workqueue(gfs_recovery_wq); fail_wq1: - unregister_filesystem(&gfs2meta_fs_type); -fail_fs2: - unregister_filesystem(&gfs2_fs_type); -fail_fs1: unregister_shrinker(&gfs2_qd_shrinker); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 7e70e0ba5a6c..3c41b864ee5b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -442,6 +442,12 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) struct buffer_head *bh; int ty; + if (!ip->i_gl) { + /* This can only happen during incomplete inode creation. */ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + gfs2_ail1_wipe(sdp, bstart, blen); while (blen) { ty = REMOVE_META; @@ -525,8 +531,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; - if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh); + bh_read_nowait(first_bh, REQ_META | REQ_PRIO); dblock++; extlen--; @@ -534,9 +539,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) while (extlen) { bh = gfs2_getbuf(gl, dblock, CREATE); - if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META | - REQ_PRIO, 1, &bh); + bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 549879929c84..c0cf1d2d0ef5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -178,7 +178,10 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) pr_warn("Invalid block size\n"); return -EINVAL; } - + if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) { + pr_warn("Invalid block size shift\n"); + return -EINVAL; + } return 0; } @@ -381,8 +384,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent) if (!table[0]) table = sdp->sd_vfs->s_id; - strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); - strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); + BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN); + + strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN); + strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN); table = sdp->sd_table_name; while ((table = strchr(table, '/'))) @@ -401,7 +406,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_nq_num(sdp, GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, - LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, mount_gh); if (error) { fs_err(sdp, "can't acquire mount glock: %d\n", error); @@ -411,7 +417,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_nq_num(sdp, GFS2_LIVE_LOCK, &gfs2_nondisk_glops, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, + LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, &sdp->sd_live_gh); if (error) { fs_err(sdp, "can't acquire live glock: %d\n", error); @@ -687,7 +693,7 @@ static int init_statfs(struct gfs2_sbd *sdp) iput(pn); pn = NULL; ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, &sdp->sd_sc_gh); if (error) { fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); @@ -776,7 +782,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, 
&gfs2_journal_glops, LM_ST_EXCLUSIVE, - LM_FLAG_NOEXP | GL_NOCACHE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, &sdp->sd_journal_gh); if (error) { fs_err(sdp, "can't acquire journal glock: %d\n", error); @@ -786,7 +792,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) ip = GFS2_I(sdp->sd_jdesc->jd_inode); sdp->sd_jinode_gl = ip->i_gl; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE, + LM_FLAG_NOEXP | GL_EXACT | + GL_NOCACHE | GL_NOPID, &sdp->sd_jinode_gh); if (error) { fs_err(sdp, "can't acquire journal inode glock: %d\n", @@ -957,7 +964,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) pn = NULL; ip = GFS2_I(sdp->sd_qc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); @@ -1439,13 +1446,13 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (o) { case Opt_lockproto: - strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); break; case Opt_locktable: - strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); break; case Opt_hostdata: - strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); break; case Opt_spectator: args->ar_spectator = 1; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f201eaf59d0d..1ed17226d9ed 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -745,12 +745,8 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, } if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - goto unlock_out; - } + if (bh_read(bh, REQ_META | REQ_PRIO) < 0) + goto unlock_out; if (gfs2_is_jdata(ip)) gfs2_trans_add_data(ip->i_gl, bh); else diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b5b0f285b27f..999cc146d708 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -346,7 +346,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) } error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, - LM_FLAG_NOEXP, &sdp->sd_freeze_gh); + LM_FLAG_NOEXP | GL_NOPID, + &sdp->sd_freeze_gh); if (error) goto out; @@ -378,6 +379,7 @@ out: void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { + const struct inode *inode = &ip->i_inode; struct gfs2_dinode *str = buf; str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); @@ -385,15 +387,15 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - str->di_mode = cpu_to_be32(ip->i_inode.i_mode); - str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); - str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); - str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); - str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); - str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); - str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); + str->di_mode = cpu_to_be32(inode->i_mode); + str->di_uid = 
cpu_to_be32(i_uid_read(inode)); + str->di_gid = cpu_to_be32(i_gid_read(inode)); + str->di_nlink = cpu_to_be32(inode->i_nlink); + str->di_size = cpu_to_be64(i_size_read(inode)); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode)); + str->di_atime = cpu_to_be64(inode->i_atime.tv_sec); + str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec); str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); @@ -401,16 +403,16 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_flags = cpu_to_be32(ip->i_diskflags); str->di_height = cpu_to_be16(ip->i_height); - str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && + str->di_payload_format = cpu_to_be32(S_ISDIR(inode->i_mode) && !(ip->i_diskflags & GFS2_DIF_EXHASH) ? GFS2_FORMAT_DE : 0); str->di_depth = cpu_to_be16(ip->i_depth); str->di_entries = cpu_to_be32(ip->i_entries); str->di_eattr = cpu_to_be64(ip->i_eattr); - str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); - str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); - str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); + str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); } /** @@ -474,6 +476,12 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) int need_endtrans = 0; int ret; + if (unlikely(!ip->i_gl)) { + /* This can only happen during incomplete inode creation. */ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + if (unlikely(gfs2_withdrawn(sdp))) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { @@ -926,8 +934,7 @@ static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && - inode->i_nlink && + if (inode->i_nlink && gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; if (test_bit(GLF_DEMOTE, &gl->gl_flags)) @@ -1075,7 +1082,13 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip) struct inode *inode = &ip->i_inode; struct gfs2_glock *gl = ip->i_gl; - truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0); + if (unlikely(!gl)) { + /* This can only happen during incomplete inode creation. */ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + + truncate_inode_pages(gfs2_glock2aspace(gl), 0); truncate_inode_pages(&inode->i_data, 0); if (atomic_read(&gl->gl_revokes) == 0) { @@ -1217,10 +1230,8 @@ static enum dinode_demise evict_should_delete(struct inode *inode, struct gfs2_sbd *sdp = sb->s_fs_info; int ret; - if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { - BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); + if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) goto should_delete; - } if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) return SHOULD_DEFER_EVICTION; @@ -1293,13 +1304,22 @@ static int evict_unlinked_inode(struct inode *inode) goto out; } - /* We're about to clear the bitmap for the dinode, but as soon as we - do, gfs2_create_inode can create another inode at the same block - location and try to set gl_object again. We clear gl_object here so - that subsequent inode creates don't see an old gl_object. 
*/ - glock_clear_object(ip->i_gl, ip); + if (ip->i_gl) + gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); + + /* + * As soon as we clear the bitmap for the dinode, gfs2_create_inode() + * can get called to recreate it, or even gfs2_inode_lookup() if the + * inode was recreated on another node in the meantime. + * + * However, inserting the new inode into the inode hash table will not + * succeed until the old inode is removed, and that only happens after + * ->evict_inode() returns. The new inode is attached to its inode and + * iopen glocks after inserting it into the inode hash table, so at + * that point we can be sure that both glocks are unused. + */ + ret = gfs2_dinode_dealloc(ip); - gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); out: return ret; } @@ -1366,12 +1386,7 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_holder gh; int ret; - if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { - clear_inode(inode); - return; - } - - if (inode->i_nlink || sb_rdonly(sb)) + if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr) goto out; gfs2_holder_mark_uninitialized(&gh); @@ -1404,12 +1419,9 @@ out: struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; glock_clear_object(gl, ip); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq(&ip->i_iopen_gh); - } gfs2_glock_hold(gl); - gfs2_holder_uninit(&ip->i_iopen_gh); + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); gfs2_glock_put_eventually(gl); } if (ip->i_gl) { @@ -1428,6 +1440,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL); if (!ip) return NULL; + ip->i_no_addr = 0; ip->i_flags = 0; ip->i_gl = NULL; gfs2_holder_mark_uninitialized(&ip->i_iopen_gh); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 8241029a2a5d..7a6aeffcdf5c 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -164,6 +164,11 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) } if (!ret) gfs2_make_fs_ro(sdp); + /* + * Dequeue any pending non-system glock holders that can no + * longer be granted because the file system is withdrawn. + */ + gfs2_gl_dq_holders(sdp); gfs2_freeze_unlock(&freeze_gh); } @@ -204,6 +209,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) * exception code in glock_dq. */ iput(inode); + sdp->sd_jdesc->jd_inode = NULL; /* * Wait until the journal inode's glock is freed. 
This allows try locks * on other nodes to be successful, otherwise we remain the owner of @@ -226,7 +232,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) */ fs_warn(sdp, "Requesting recovery of jid %d.\n", sdp->sd_lockstruct.ls_jid); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP, + gfs2_holder_reinit(LM_ST_EXCLUSIVE, + LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | GL_NOPID, &sdp->sd_live_gh); msleep(GL_GLOCK_MAX_HOLD); /* @@ -251,7 +258,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) fs_warn(sdp, "Unable to recover our journal jid %d.\n", sdp->sd_lockstruct.ls_jid); gfs2_glock_dq_wait(&sdp->sd_live_gh); - gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT, + gfs2_holder_reinit(LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, &sdp->sd_live_gh); gfs2_glock_nq(&sdp->sd_live_gh); } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f6a66050380e..518c0677e12a 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1412,11 +1412,13 @@ static int ea_dealloc_block(struct gfs2_inode *ip) ip->i_eattr = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); + if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) { + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } } gfs2_trans_end(sdp); @@ -1445,14 +1447,16 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) return error; - error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); - if (error) - goto out_quota; - - if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { - error = ea_dealloc_indirect(ip); + if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) { + error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) goto out_quota; + + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { + error = ea_dealloc_indirect(ip); + if (error) + goto out_quota; + } } error = ea_dealloc_block(ip); diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index c83fd0e8404d..2015e42e752a 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -21,7 +21,6 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) int pagenum; int bytes_read; int bytes_to_read; - void *vaddr; off += node->page_offset; pagenum = off >> PAGE_SHIFT; @@ -33,9 +32,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) page = node->page[pagenum]; bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off); - vaddr = kmap_atomic(page); - memcpy(buf + bytes_read, vaddr + off, bytes_to_read); - kunmap_atomic(vaddr); + memcpy_from_page(buf + bytes_read, page, off, bytes_to_read); pagenum++; off = 0; /* page offset only applies to the first page */ @@ -80,8 +77,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off += node->page_offset; page = node->page[0]; - memcpy(kmap(page) + off, buf, len); - kunmap(page); + memcpy_to_page(page, off, buf, len); set_page_dirty(page); } @@ -105,8 +101,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off += node->page_offset; page = node->page[0]; - memset(kmap(page) + off, 0, len); - kunmap(page); + memzero_page(page, off, len); set_page_dirty(page); } @@ -123,9 +118,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, src_page = src_node->page[0]; dst_page = dst_node->page[0]; - memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); - kunmap(src_page); - kunmap(dst_page); + 
memcpy_page(dst_page, dst, src_page, src, len); set_page_dirty(dst_page); } @@ -140,9 +133,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) src += node->page_offset; dst += node->page_offset; page = node->page[0]; - ptr = kmap(page); + ptr = kmap_local_page(page); memmove(ptr + dst, ptr + src, len); - kunmap(page); + kunmap_local(ptr); set_page_dirty(page); } @@ -346,13 +339,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -436,14 +430,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_SIZE, (int)tree->node_size)); + memzero_page(*pagep, node->page_offset, + min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 19017d296173..2fa4b1f8cc7f 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -80,7 +80,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); @@ -119,11 +120,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; @@ -169,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree) return; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); @@ -180,7 +183,7 @@ void hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); } @@ -268,7 +271,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -281,7 +284,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; 
mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -290,14 +293,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { printk(KERN_DEBUG "create new bmap node...\n"); @@ -313,7 +316,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -360,20 +363,20 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode %u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..9c329a365e75 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -173,12 +173,12 @@ const struct address_space_operations hfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, - .writepage = hfs_writepage, .write_begin = hfs_write_begin, .write_end = generic_write_end, .bmap = hfs_bmap, .direct_IO = hfs_direct_IO, .writepages = hfs_writepages, + .migrate_folio = buffer_migrate_folio, }; /* @@ -458,6 +458,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) /* panic? */ return -EIO; + if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) + return -EIO; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) /* panic? */
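The hfs and hfsplus hunks in this patch series all follow one conversion pattern: kmap()/kunmap() become kmap_local_page()/kunmap_local(), and whole-buffer copies and clears become the memcpy_from_page()/memcpy_to_page()/memzero_page() helpers. A minimal sketch of the pattern, illustrative only and not part of the patch (the copy_out_* helpers are hypothetical):

#include <linux/highmem.h>

/* old style: global mapping, unmapped by page */
static void copy_out_old(struct page *page, char *buf, int off, int len)
{
	memcpy(buf, kmap(page) + off, len);
	kunmap(page);
}

/* new style: CPU-local mapping, unmapped by the mapped address */
static void copy_out_new(struct page *page, char *buf, int off, int len)
{
	void *vaddr = kmap_local_page(page);

	memcpy(buf, vaddr + off, len);
	kunmap_local(vaddr);
}

/* for a straight copy, the one-line helper is equivalent */
static void copy_out_helper(struct page *page, char *buf, int off, int len)
{
	memcpy_from_page(buf, page, off, len);
}

kmap_local_page() mappings are thread-local and must be released in reverse order of acquisition, which is why the converted hunks pass the mapped address (e.g. kunmap_local(data)) rather than the page.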
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index 39f5e343bf4d..fdb0edb8a607 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr if (nls_io) { wchar_t ch; - while (srclen > 0) { + while (srclen > 0 && dstlen > 0) { size = nls_io->char2uni(src, srclen, &ch); if (size < 0) { ch = '?'; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index cebce0cfe340..bd8dcea85588 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; offset &= ~(PAGE_CACHE_BITS - 1); @@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, } curr++; } - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; if (offset >= size) break; @@ -84,7 +84,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - curr = pptr = kmap(page); + curr = pptr = kmap_local_page(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; else @@ -127,7 +127,7 @@ found: len -= 32; } set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -135,7 +135,7 @@ found: start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -151,7 +151,7 @@ last: done: *curr = cpu_to_be32(n); set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); @@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) page = read_mapping_page(mapping, pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; end = pptr + PAGE_CACHE_BITS / 32; len = count; @@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) break; set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); page = read_mapping_page(mapping, ++pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -231,7 +231,7 @@ done: } out: set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); sbi->free_blocks += len; hfsplus_mark_mdb_dirty(sb); mutex_unlock(&sbi->alloc_mutex); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index a5ab00e54220..87974d5e6791 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -29,14 +29,12 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(buf, kmap(*pagep) + off, l); - kunmap(*pagep); + memcpy_from_page(buf, *pagep, off, l); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(buf, kmap(*++pagep), l); - kunmap(*pagep); + memcpy_from_page(buf, *++pagep, 0, l); } } @@ -82,16 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(kmap(*pagep) + off, buf, l); + memcpy_to_page(*pagep, off, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); -
memcpy(kmap(*++pagep), buf, l); + memcpy_to_page(*++pagep, 0, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -112,15 +108,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memset(kmap(*pagep) + off, 0, l); + memzero_page(*pagep, off, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memset(kmap(*++pagep), 0, l); + memzero_page(*++pagep, 0, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -142,24 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); - kunmap(*src_page); + memcpy_page(*dst_page, src, *src_page, src, l); set_page_dirty(*dst_page); - kunmap(*dst_page); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++dst_page), kmap(*++src_page), l); - kunmap(*src_page); + memcpy_page(*++dst_page, 0, *++src_page, 0, l); set_page_dirty(*dst_page); - kunmap(*dst_page); } } else { void *src_ptr, *dst_ptr; do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; src = 0; @@ -171,9 +161,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, } l = min(len, l); memcpy(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -185,6 +175,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) { struct page **src_page, **dst_page; + void *src_ptr, *dst_ptr; int l; hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); @@ -202,27 +193,28 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { while (src < len) { - memmove(kmap(*dst_page), kmap(*src_page), src); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr, src_ptr, src); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); len -= src; src = PAGE_SIZE; src_page--; dst_page--; } src -= len; - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, len); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr + src, src_ptr + src, len); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (src < dst) { l = src; src = PAGE_SIZE; @@ -234,9 +226,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr - l, src_ptr - l, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (dst == PAGE_SIZE) dst_page--; else @@ -251,26 +243,27 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, l); - kunmap(*src_page); + + dst_ptr = kmap_local_page(*dst_page) + src; + src_ptr = 
kmap_local_page(*src_page) + src; + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memmove(kmap(*++dst_page), - kmap(*++src_page), l); - kunmap(*src_page); + dst_ptr = kmap_local_page(*++dst_page); + src_ptr = kmap_local_page(*++src_page); + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; @@ -283,9 +276,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -498,14 +491,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + - node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -589,14 +582,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min_t(int, PAGE_SIZE, tree->node_size)); + memzero_page(*pagep, node->page_offset, + min_t(int, PAGE_SIZE, tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 66774f4cb4fd..9e1732a2b92a 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -163,7 +163,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); @@ -240,11 +240,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; @@ -291,7 +292,7 @@ int hfs_btree_write(struct hfs_btree *tree) return -EIO; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); @@ -303,7 +304,7 @@ int hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); 
hfs_bnode_put(node); return 0; @@ -394,7 +395,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -407,7 +408,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -417,14 +418,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { hfs_dbg(BNODE_MOD, "create new bmap node\n"); @@ -440,7 +441,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -490,7 +491,7 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; @@ -498,13 +499,13 @@ void hfs_bmap_free(struct hfs_bnode *node) pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a5db2e3b2980..6aa919e59483 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -198,6 +198,8 @@ struct hfsplus_sb_info { #define HFSPLUS_SB_HFSX 3 #define HFSPLUS_SB_CASEFOLD 4 #define HFSPLUS_SB_NOBARRIER 5 +#define HFSPLUS_SB_UID 6 +#define HFSPLUS_SB_GID 7 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..840577a0c1e7 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -170,12 +170,12 @@ const struct address_space_operations hfsplus_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, - .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .bmap = hfsplus_bmap, .direct_IO = hfsplus_direct_IO, .writepages = hfsplus_writepages, + .migrate_folio = buffer_migrate_folio, }; const struct dentry_operations hfsplus_dentry_operations = { @@ -192,11 +192,11 @@ static void hfsplus_get_perms(struct inode *inode, mode = be16_to_cpu(perms->mode); i_uid_write(inode, be32_to_cpu(perms->owner)); - if (!i_uid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode)) inode->i_uid = sbi->uid; i_gid_write(inode, be32_to_cpu(perms->group)); - if (!i_gid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode)) inode->i_gid = sbi->gid; if (dir) { diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 047e05c57560..c94a58762ad6 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!uid_valid(sbi->uid)) { pr_err("invalid uid specified\n"); 
return 0; + } else { + set_bit(HFSPLUS_SB_UID, &sbi->flags); } break; case opt_gid: @@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!gid_valid(sbi->gid)) { pr_err("invalid gid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_GID, &sbi->flags); } break; case opt_part: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07881b76d42f..277468783fee 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -103,7 +103,7 @@ static char *__dentry_name(struct dentry *dentry, char *name) */ BUG_ON(p + strlen(p) + 1 != name + PATH_MAX); - strlcpy(name, root, PATH_MAX); + strscpy(name, root, PATH_MAX); if (len > p - name) { __putname(name); return NULL; } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index f7547a62c81f..88952d4a631e 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -163,11 +163,6 @@ static int hpfs_read_folio(struct file *file, struct folio *folio) return mpage_read_folio(folio, hpfs_get_block); } -static int hpfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, hpfs_get_block, wbc); -} - static void hpfs_readahead(struct readahead_control *rac) { mpage_readahead(rac, hpfs_get_block); } @@ -248,12 +243,12 @@ const struct address_space_operations hpfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hpfs_read_folio, - .writepage = hpfs_writepage, .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, - .bmap = _hpfs_bmap + .bmap = _hpfs_bmap, + .migrate_folio = buffer_migrate_folio, }; const struct file_operations hpfs_file_ops = diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f7a5b5124d8a..790d2727141a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + retval = -EIO; + break; + } + /* * We have the page, copy it to user space buffer. */ @@ -364,11 +370,153 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void remove_huge_page(struct page *page) +static void hugetlb_delete_from_page_cache(struct folio *folio) +{ + folio_clear_dirty(folio); + folio_clear_uptodate(folio); + filemap_remove_folio(folio); +} + +/* + * Called with i_mmap_rwsem held for inode based vma maps. This makes + * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault + * mutex for the page in the mapping. So, we can not race with page being + * faulted into the vma. + */ +static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + pte_t *ptep, pte; + + ptep = huge_pte_offset(vma->vm_mm, addr, + huge_page_size(hstate_vma(vma))); + + if (!ptep) + return false; + + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte) || !pte_present(pte)) + return false; + + if (pte_page(pte) == page) + return true; + + return false; +} + +/* + * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */
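The two helpers that follow factor out this offset arithmetic for hugetlb_unmap_file_folio() and hugetlb_vmdelete_list(). A worked example with hypothetical values, assuming 4 KiB pages (PAGE_SHIFT == 12):

/*
 *   vma->vm_pgoff = 10, vma->vm_start = 0x7f0000000000,
 *   vma->vm_end = vma->vm_start + 0x20000 (32 pages),
 *   truncation range: start = 16, end = 0 (i.e. to end of file)
 *
 *   vma_offset_start(vma, 16) -> (16 - 10) << 12 = 0x6000
 *   vma_offset_end(vma, 0)    -> vma->vm_end (end == 0 means unbounded)
 *
 * so unmap_hugepage_range() covers [vma->vm_start + 0x6000, vma->vm_end);
 * had start been <= vm_pgoff, the vma would be unmapped from offset 0.
 */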
+static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) +{ + if (vma->vm_pgoff < start) + return (start - vma->vm_pgoff) << PAGE_SHIFT; + else + return 0; +} + +static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) +{ + unsigned long t_end; + + if (!end) + return vma->vm_end; + + t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; + if (t_end > vma->vm_end) + t_end = vma->vm_end; + return t_end; +} + +/* + * Called with hugetlb fault mutex held. Therefore, no more mappings to + * this folio can be created while executing the routine. + */ +static void hugetlb_unmap_file_folio(struct hstate *h, + struct address_space *mapping, + struct folio *folio, pgoff_t index) { - ClearPageDirty(page); - ClearPageUptodate(page); - delete_from_page_cache(page); + struct rb_root_cached *root = &mapping->i_mmap; + struct hugetlb_vma_lock *vma_lock; + struct page *page = &folio->page; + struct vm_area_struct *vma; + unsigned long v_start; + unsigned long v_end; + pgoff_t start, end; + + start = index * pages_per_huge_page(h); + end = (index + 1) * pages_per_huge_page(h); + + i_mmap_lock_write(mapping); +retry: + vma_lock = NULL; + vma_interval_tree_foreach(vma, root, start, end - 1) { + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + continue; + + if (!hugetlb_vma_trylock_write(vma)) { + vma_lock = vma->vm_private_data; + /* + * If we can not get vma lock, we need to drop + * i_mmap_rwsem and take locks in order. First, + * take a ref on the vma_lock structure so that + * we can be guaranteed it will not go away when + * dropping i_mmap_rwsem. + */ + kref_get(&vma_lock->refs); + break; + } + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, ZAP_FLAG_DROP_MARKER); + hugetlb_vma_unlock_write(vma); + } + + i_mmap_unlock_write(mapping); + + if (vma_lock) { + /* + * Wait on vma_lock. We know it is still valid as we have + * a reference. We must 'open code' vma locking as we do + * not know if vma_lock is still attached to vma. + */ + down_write(&vma_lock->rw_sema); + i_mmap_lock_write(mapping); + + vma = vma_lock->vma; + if (!vma) { + /* + * If lock is no longer attached to vma, then just + * unlock, drop our reference and retry looking for + * other vmas. + */ + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + goto retry; + } + + /* + * vma_lock is still attached to vma. Check to see if vma + * still maps page and if so, unmap. + */ + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + unmap_hugepage_range(vma, vma->vm_start + v_start, + v_end, NULL, + ZAP_FLAG_DROP_MARKER); + + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + hugetlb_vma_unlock_write(vma); + + goto retry; + } } static void @@ -383,32 +531,66 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { - unsigned long v_offset; unsigned long v_end; + if (!hugetlb_vma_trylock_write(vma)) + continue; + + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, zap_flags); + /* - * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. + * Note that vma lock only exists for shared/non-private + * vmas. Therefore, lock is not held when calling + * unmap_hugepage_range for private vmas. */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (!end) - v_end = vma->vm_end; - else { - v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) - + vma->vm_start; - if (v_end > vma->vm_end) - v_end = vma->vm_end; - } + hugetlb_vma_unlock_write(vma); + } +} - unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, - NULL, zap_flags); +/* + * Called with hugetlb fault mutex held. + * Returns true if page was actually removed, false otherwise. + */ +static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, + struct address_space *mapping, + struct folio *folio, pgoff_t index, + bool truncate_op) +{ + bool ret = false; + + /* + * If folio is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) while holding + * the fault mutex. The mutex will prevent faults + * until we finish removing the folio. + */ + if (unlikely(folio_mapped(folio))) + hugetlb_unmap_file_folio(h, mapping, folio, index); + + folio_lock(folio); + /* + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. + */ + VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); + hugetlb_delete_from_page_cache(folio); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); } + + folio_unlock(folio); + return ret; } /* @@ -418,10 +600,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve - * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents - * page faults in the truncated range by checking i_size. i_size is - * modified while holding i_mmap_rwsem. + * maps and global counts. Page faults can race with truncation. + * During faults, hugetlb_no_page() checks i_size before page allocation, + * and again after obtaining page table lock. It will 'back out' + * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map @@ -451,61 +633,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash = 0; index = folio->index; - if (!truncate_op) { - /* - * Only need to hold the fault mutex in the - * hole punch case. This prevents races with - * page faults. Races are not possible in the - * case of truncation. - */ - hash = hugetlb_fault_mutex_hash(mapping, index); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - } + hash = hugetlb_fault_mutex_hash(mapping, index); + mutex_lock(&hugetlb_fault_mutex_table[hash]); /* - * If folio is mapped, it was faulted in after being - * unmapped in caller. 
Unmap (again) now after taking - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. + * Remove folio that was part of folio_batch. */ - if (unlikely(folio_mapped(folio))) { - BUG_ON(truncate_op); - - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_lock_write(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } + if (remove_inode_single_folio(h, inode, mapping, folio, + index, truncate_op)) + freed++; - folio_lock(folio); - /* - * We must free the huge page and remove from page - * cache (remove_huge_page) BEFORE removing the - * region/reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the - * region/reserve map could fail. Correspondingly, - * the subpool and global reserve usage count can need - * to be adjusted. - */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - remove_huge_page(&folio->page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, - index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } - - folio_unlock(folio); - if (!truncate_op) - mutex_unlock(&hugetlb_fault_mutex_table[hash]); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); @@ -543,8 +681,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; - i_mmap_lock_write(mapping); i_size_write(inode, offset); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); @@ -703,11 +841,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* - * fault mutex taken here, protects against fault path - * and hole punch. inode_lock previously taken protects - * against truncation. - */ + /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -737,7 +871,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(page, addr, pages_per_huge_page(h)); __SetPageUptodate(page); - error = huge_add_to_page_cache(page, mapping, index); + error = hugetlb_add_to_page_cache(page, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); @@ -749,7 +883,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, SetHPageMigratable(page); /* - * unlock_page because locked by huge_add_to_page_cache() + * unlock_page because locked by hugetlb_add_to_page_cache() * put_page() due to reference from alloc_huge_page() */ unlock_page(page); @@ -885,33 +1019,18 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. 
*/ -static int do_hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t dev, - bool tmpfile) +static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; - int error = -ENOSPC; inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); - if (inode) { - dir->i_ctime = dir->i_mtime = current_time(dir); - if (tmpfile) { - d_tmpfile(dentry, inode); - } else { - d_instantiate(dentry, inode); - dget(dentry);/* Extra count - pin the dentry in core */ - } - error = 0; - } - return error; -} - -static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) -{ - return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); + if (!inode) + return -ENOSPC; + dir->i_ctime = dir->i_mtime = current_time(dir); + d_instantiate(dentry, inode); + dget(dentry);/* Extra count - pin the dentry in core */ + return 0; } static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, @@ -932,10 +1051,17 @@ static int hugetlbfs_create(struct user_namespace *mnt_userns, } static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, - struct inode *dir, struct dentry *dentry, + struct inode *dir, struct file *file, umode_t mode) { - return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); + struct inode *inode; + + inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); + if (!inode) + return -ENOSPC; + dir->i_ctime = dir->i_mtime = current_time(dir); + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } static int hugetlbfs_symlink(struct user_namespace *mnt_userns, @@ -971,10 +1097,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(&src->page)) { - hugetlb_set_page_subpool(&dst->page, - hugetlb_page_subpool(&src->page)); - hugetlb_set_page_subpool(&src->page, NULL); + if (hugetlb_folio_subpool(src)) { + hugetlb_set_folio_subpool(dst, + hugetlb_folio_subpool(src)); + hugetlb_set_folio_subpool(src, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -991,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, static int hugetlbfs_error_remove_page(struct address_space *mapping, struct page *page) { - struct inode *inode = mapping->host; - pgoff_t index = page->index; - - remove_huge_page(page); - if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - return 0; } @@ -1160,7 +1279,7 @@ static const struct address_space_operations hugetlbfs_aops = { static void init_once(void *foo) { - struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } @@ -1258,7 +1377,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; @@ -1268,7 +1387,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; @@ -1284,7 +1403,7 @@ static int 
hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; diff --git a/fs/inode.c b/fs/inode.c index ba1de23c13c1..f453eb58fd03 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -192,8 +192,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_wb_frn_history = 0; #endif - if (security_inode_alloc(inode)) - goto out; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -228,11 +226,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; + + if (unlikely(security_inode_alloc(inode))) + return -ENOMEM; this_cpu_inc(nr_inodes); return 0; -out: - return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); @@ -1950,40 +1949,12 @@ skip_update: EXPORT_SYMBOL(touch_atime); /* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - umode_t mode = d_inode(dentry)->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - -/* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). 
*/ -int dentry_needs_remove_privs(struct dentry *dentry) +int dentry_needs_remove_privs(struct user_namespace *mnt_userns, + struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; @@ -1992,7 +1963,7 @@ int dentry_needs_remove_privs(struct dentry *dentry) if (IS_NOSEC(inode)) return 0; - mask = should_remove_suid(dentry); + mask = setattr_should_drop_suidgid(mnt_userns, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; @@ -2024,7 +1995,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; - kill = dentry_needs_remove_privs(dentry); + kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry); if (kill < 0) return kill; @@ -2072,9 +2043,6 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; - if (!sync_it) - return 0; - return sync_it; } @@ -2355,15 +2323,15 @@ EXPORT_SYMBOL(inode_init_owner); bool inode_owner_or_capable(struct user_namespace *mnt_userns, const struct inode *inode) { - kuid_t i_uid; + vfsuid_t vfsuid; struct user_namespace *ns; - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid)) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); - if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER)) + if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } @@ -2489,6 +2457,28 @@ struct timespec64 current_time(struct inode *inode) EXPORT_SYMBOL(current_time); /** + * in_group_or_capable - check whether caller is CAP_FSETID privileged + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * @vfsgid: the new/current vfsgid of @inode + * + * Check whether @vfsgid is in the caller's group list or if the caller is + * privileged with CAP_FSETID over @inode. This can be used to determine + * whether the setgid bit can be kept or must be dropped. + * + * Return: true if the caller is sufficiently privileged, false if not. + */
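A sketch of how the helper below is used by mode_strip_sgid() further down in this hunk (the mode values are hypothetical):

/*
 *   dir->i_mode = S_IFDIR | S_ISGID | 0775
 *   requested file mode (sgid inherited) = S_IFREG | S_ISGID | 0775
 *
 *   mode_strip_sgid() keeps S_ISGID only when
 *       in_group_or_capable(mnt_userns, dir,
 *                           i_gid_into_vfsgid(mnt_userns, dir))
 *   is true, i.e. the caller's fsgid is in dir's group or the caller
 *   holds CAP_FSETID over dir; otherwise it returns mode & ~S_ISGID.
 */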
+bool in_group_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode, vfsgid_t vfsgid) +{ + if (vfsgid_in_group_p(vfsgid)) + return true; + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + return true; + return false; +} + +/** * mode_strip_sgid - handle the sgid bit for non-directories * @mnt_userns: User namespace of the mount the inode was created from * @dir: parent directory inode @@ -2509,11 +2499,9 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns, return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; - if (in_group_p(i_gid_into_mnt(mnt_userns, dir))) - return mode; - if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) + if (in_group_or_capable(mnt_userns, dir, + i_gid_into_vfsgid(mnt_userns, dir))) return mode; - return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); diff --git a/fs/internal.h b/fs/internal.h index 87e96b9024ce..a803cc3cf716 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -16,6 +16,7 @@ struct shrink_control; struct fs_context; struct user_namespace; struct pipe_inode_info; +struct iov_iter; /* * block/bdev.c */ @@ -62,7 +63,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); -int may_linkat(struct user_namespace *mnt_userns, struct path *link); +int may_linkat(struct user_namespace *mnt_userns, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); @@ -101,6 +102,16 @@ extern void chroot_fs_refs(const struct path *, const struct path *); extern struct file *alloc_empty_file(int, const struct cred *); extern struct file *alloc_empty_file_noaccount(int, const struct cred *); +static inline void put_file_access(struct file *file) +{ + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_dec(file->f_inode); + } else if (file->f_mode & FMODE_WRITER) { + put_write_access(file->f_inode); + __mnt_drop_write(file->f_path.mnt); + } +} + /* * super.c */ @@ -139,7 +150,9 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -extern int dentry_needs_remove_privs(struct dentry *dentry); +int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry); +bool in_group_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode, vfsgid_t vfsgid); /* * fs-writeback.c */ @@ -214,10 +227,39 @@ struct xattr_ctx { }; -ssize_t do_getxattr(struct user_namespace *mnt_userns, +ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); -int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); +int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); + +#ifdef CONFIG_FS_POSIX_ACL +int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + const char *acl_name, const void *kvalue, size_t size); +ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, + const char *acl_name, void *kvalue, size_t size); +#else +static inline int do_set_acl(struct mnt_idmap *idmap, + struct dentry *dentry, const char *acl_name, + const void *kvalue,
size_t size) +{ + return -EOPNOTSUPP; +} +static inline ssize_t do_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, const char *acl_name, + void *kvalue, size_t size) +{ + return -EOPNOTSUPP; +} +#endif + +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); + +/* + * fs/attr.c + */ +int setattr_should_drop_sgid(struct user_namespace *mnt_userns, + const struct inode *inode); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ca5c62901541..356193e44cf0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); } -static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, +static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; @@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } + + /* + * Now we have a locked folio, before we do anything with it we need to + * check that the iomap we have cached is not stale. The inode extent + * mapping can change due to concurrent IO in flight (e.g. + * IOMAP_UNWRITTEN state can change and memory reclaim could have + * reclaimed a previously partially written page at this index after IO + * completion before this write reaches this file offset) and hence we + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. + */ + if (page_ops && page_ops->iomap_valid) { + bool iomap_valid = page_ops->iomap_valid(iter->inode, + &iter->iomap); + if (!iomap_valid) { + iter->iomap.flags |= IOMAP_F_STALE; + status = 0; + goto out_unlock; + } + } + if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; @@ -773,6 +794,8 @@ again: status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) break; + if (iter->iomap.flags & IOMAP_F_STALE) + break; page = folio_file_page(folio, pos >> PAGE_SHIFT); if (mapping_writably_mapped(mapping)) @@ -832,6 +855,231 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +/* + * Scan the data range passed to us for dirty page cache folios. If we find a + * dirty folio, punch out the preceding range and update the offset from which + * the next punch will start. + * + * We can punch out storage reservations under clean pages because they either + * contain data that has been written back - in which case the delalloc punch + * over that range is a no-op - or they have been read faulted in, in which case + * they contain zeroes and we can remove the delalloc backing range and any new + * writes to those pages will do the normal hole filling operation... + * + * This makes the logic simple: we only need to keep the delalloc extents + * over the dirty ranges of the page cache. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations. + */
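A worked pass of the scan function that follows (byte offsets hypothetical, assuming 4 KiB folios):

/*
 *   range [0x0000, 0x3000), *punch_start_byte = 0x0000,
 *   only the middle folio dirty:
 *
 *   folio 0 [0x0000, 0x1000): clean -> skip, punch range stays open
 *   folio 1 [0x1000, 0x2000): dirty -> punch(inode, 0x0000, 0x1000),
 *                                      *punch_start_byte = 0x2000
 *   folio 2 [0x2000, 0x3000): clean -> skip
 *
 * On return the caller punches the remaining [0x2000, 0x3000), so only
 * the delalloc backing under the dirty folio survives.
 */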
+static int iomap_write_delalloc_scan(struct inode *inode, + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t offset, loff_t length)) +{ + while (start_byte < end_byte) { + struct folio *folio; + + /* grab locked page */ + folio = filemap_lock_folio(inode->i_mapping, + start_byte >> PAGE_SHIFT); + if (!folio) { + start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + + PAGE_SIZE; + continue; + } + + /* if dirty, punch up to offset */ + if (folio_test_dirty(folio)) { + if (start_byte > *punch_start_byte) { + int error; + + error = punch(inode, *punch_start_byte, + start_byte - *punch_start_byte); + if (error) { + folio_unlock(folio); + folio_put(folio); + return error; + } + } + + /* + * Make sure the next punch start is correctly bound to + * the end of this data range, not the end of the folio. + */ + *punch_start_byte = min_t(loff_t, end_byte, + folio_next_index(folio) << PAGE_SHIFT); + } + + /* move offset to start of next folio in range */ + start_byte = folio_next_index(folio) << PAGE_SHIFT; + folio_unlock(folio); + folio_put(folio); + } + return 0; +} + +/* + * Punch out all the delalloc blocks in the range given except for those that + * have dirty data still pending in the page cache - those are going to be + * written and so must still retain the delalloc backing for writeback. + * + * As we are scanning the page cache for data, we don't need to reimplement the + * wheel - mapping_seek_hole_data() does exactly what we need to identify the + * start and end of data ranges correctly even for sub-folio block sizes. This + * byte range based iteration is especially convenient because it means we + * don't have to care about variable size folios, nor where the start or end of + * the data range lies within a folio, if they lie within the same folio or even + * if there are multiple discontiguous data ranges within the folio. + * + * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so + * can return data ranges that exist in the cache beyond EOF. e.g. a page fault + * spanning EOF will initialise the post-EOF data to zeroes and mark it up to + * date. A write page fault can then mark it dirty. If we then fail a write() + * beyond EOF into that up to date cached range, we allocate a delalloc block + * beyond EOF and then have to punch it out. Because the range is up to date, + * mapping_seek_hole_data() will return it, and we will skip the punch because + * the folio is dirty. This is incorrect - we always need to punch out delalloc + * beyond EOF in this case as writeback will never write back and convert that + * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, + * resulting in always punching out the range from the EOF to the end of the + * range the iomap spans. + * + * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it + * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA + * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) + * returns the end of the data range (data_end). Using closed intervals would + * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose + * the code to subtle off-by-one bugs.... + */
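The [start, end) pairing that mapping_seek_hole_data() hands back is the same contract userspace gets from lseek(2) with SEEK_DATA/SEEK_HOLE, which makes it easy to experiment with this iteration pattern outside the kernel. A rough sketch (Linux-specific; error handling reduced to the only-holes-remain case):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Print the allocated data ranges of a (possibly sparse) file. */
int main(int argc, char **argv)
{
	int fd;
	off_t pos = 0, end;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	end = lseek(fd, 0, SEEK_END);
	while (pos < end) {
		off_t data = lseek(fd, pos, SEEK_DATA);	/* start of data */
		if (data < 0)
			break;		/* ENXIO: only a hole remains */
		off_t hole = lseek(fd, data, SEEK_HOLE); /* end of that data */
		printf("data [%lld, %lld)\n", (long long)data, (long long)hole);
		pos = hole;		/* half-open: no +1/-1 games */
	}
	close(fd);
	return 0;
}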
+static int iomap_write_delalloc_release(struct inode *inode, + loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t punch_start_byte = start_byte; + loff_t scan_end_byte = min(i_size_read(inode), end_byte); + int error = 0; + + /* + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite whilst we walk the + * cache and perform delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); + while (start_byte < scan_end_byte) { + loff_t data_end; + + start_byte = mapping_seek_hole_data(inode->i_mapping, + start_byte, scan_end_byte, SEEK_DATA); + /* + * If there is no more data to scan, all that is left is to + * punch out the remaining range. + */ + if (start_byte == -ENXIO || start_byte == scan_end_byte) + break; + if (start_byte < 0) { + error = start_byte; + goto out_unlock; + } + WARN_ON_ONCE(start_byte < punch_start_byte); + WARN_ON_ONCE(start_byte > scan_end_byte); + + /* + * We find the end of this contiguous cached data range by + * seeking from start_byte to the beginning of the next hole. + */ + data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, + scan_end_byte, SEEK_HOLE); + if (data_end < 0) { + error = data_end; + goto out_unlock; + } + WARN_ON_ONCE(data_end <= start_byte); + WARN_ON_ONCE(data_end > scan_end_byte); + + error = iomap_write_delalloc_scan(inode, &punch_start_byte, + start_byte, data_end, punch); + if (error) + goto out_unlock; + + /* The next data search starts at the end of this one. */ + start_byte = data_end; + } + + if (punch_start_byte < end_byte) + error = punch(inode, punch_start_byte, + end_byte - punch_start_byte); +out_unlock: + filemap_invalidate_unlock(inode->i_mapping); + return error; +} + +/* + * When a short write occurs, the filesystem may need to remove reserved space + * that was allocated in ->iomap_begin from its ->iomap_end method. For + * filesystems that use delayed allocation, we need to punch out delalloc + * extents from the range that are not dirty in the page cache. As the write can + * race with page faults, there can be dirty pages over the delalloc extent + * outside the range of a short write but still within the delalloc extent + * allocated for this iomap. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations. + * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself.
+ * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock + */ +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, + ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t start_byte; + loff_t end_byte; + int blocksize = i_blocksize(inode); + + if (iomap->type != IOMAP_DELALLOC) + return 0; + + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (!(iomap->flags & IOMAP_F_NEW)) + return 0; + + /* + * start_byte refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in + * the range. + */ + if (unlikely(!written)) + start_byte = round_down(pos, blocksize); + else + start_byte = round_up(pos + written, blocksize); + end_byte = round_up(pos + length, blocksize); + + /* Nothing to do if we've written the entire delalloc extent */ + if (start_byte >= end_byte) + return 0; + + return iomap_write_delalloc_release(inode, start_byte, end_byte, + punch); +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); + static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; @@ -856,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; status = iomap_write_end(iter, pos, bytes, bytes, folio); if (WARN_ON_ONCE(status == 0)) @@ -911,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) @@ -1360,6 +1612,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, error = wpc->ops->map_blocks(wpc, inode, pos); if (error) break; + trace_iomap_writepage_map(inode, &wpc->iomap); if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) continue; if (wpc->iomap.type == IOMAP_HOLE) @@ -1421,7 +1674,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (!count) folio_end_writeback(folio); done: - mapping_set_error(folio->mapping, error); + mapping_set_error(inode->i_mapping, error); return error; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4eb559a16c9e..9804714b1751 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -240,7 +240,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; - unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; loff_t length = iomap_length(iter); loff_t pos = iter->pos; @@ -252,7 +251,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; - if ((pos | length) & ((1 << blkbits) - 1) || + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index a1c7592d2ade..79a0614eaab7 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,12 +7,28 @@ #include <linux/iomap.h> #include "trace.h" +/* + * Advance to the next range we need to map. 
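The IOMAP_F_STALE handling explained in the next paragraph boils down to a generation-number check: remember how the world looked when the mapping was built, and revalidate once the folio lock is held. A userspace sketch of that pattern with C11 atomics (all names here are illustrative, not the kernel interfaces):

#include <stdatomic.h>
#include <stdbool.h>

/* Bumped by whoever modifies the extent mapping. */
static atomic_uint map_generation;

struct cached_map {
	unsigned int gen;	/* generation sampled when the map was built */
	/* ... extent data ... */
};

static void build_map(struct cached_map *m)
{
	m->gen = atomic_load(&map_generation);
	/* ... look up the extent under the filesystem's own locking ... */
}

static bool map_still_valid(const struct cached_map *m)
{
	return m->gen == atomic_load(&map_generation);
}

static void write_one(struct cached_map *m)
{
	for (;;) {
		build_map(m);
		/* take the page/folio lock here */
		if (map_still_valid(m))
			break;	/* mapping unchanged: safe to use */
		/* drop the lock and remap: the IOMAP_F_STALE path */
	}
	/* ... do the modification under the lock, then unlock ... */
}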
+ * + * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully + * processed - it was aborted because the extent the iomap spanned may have been + * changed during the operation. In this case, the iteration behaviour is to + * remap the unprocessed range of the iter, and that means we may need to remap + * even when we've made no progress (i.e. iter->processed = 0). Hence the + * "finished iterating" case needs to distinguish between + * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we + * need to remap the entire remaining range. + */ static inline int iomap_iter_advance(struct iomap_iter *iter) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + /* handle the previous iteration (if any) */ if (iter->iomap.length) { - if (iter->processed <= 0) + if (iter->processed < 0) return iter->processed; + if (!iter->processed && !stale) + return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; @@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index d48868fc40d7..f6ea9540d082 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -148,6 +148,7 @@ DEFINE_EVENT(iomap_class, name, \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); +DEFINE_IOMAP_EVENT(iomap_writepage_map); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index b466172eec25..c4da3f634b92 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -67,8 +67,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, for ( i = 0 ; i < pcount ; i++ ) { if (!pages[i]) continue; - memset(page_address(pages[i]), 0, PAGE_SIZE); - flush_dcache_page(pages[i]); + memzero_page(pages[i], 0, PAGE_SIZE); SetPageUptodate(pages[i]); } return ((loff_t)pcount) << PAGE_SHIFT; @@ -82,7 +81,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, return 0; } haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); - ll_rw_block(REQ_OP_READ, haveblocks, bhs); + bh_read_batch(haveblocks, bhs); curbh = 0; curpage = 0; @@ -120,7 +119,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, zerr != Z_STREAM_END) { if (!stream.avail_out) { if (pages[curpage]) { - stream.next_out = page_address(pages[curpage]) + stream.next_out = kmap_local_page(pages[curpage]) + poffset; stream.avail_out = PAGE_SIZE - poffset; poffset = 0; @@ -176,6 +175,10 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, flush_dcache_page(pages[curpage]); SetPageUptodate(pages[curpage]); } + if (stream.next_out != (unsigned char *)zisofs_sink_page) { + kunmap_local(stream.next_out); + stream.next_out = NULL; + } curpage++; } if (!stream.avail_in) @@ -183,6 +186,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, } inflate_out: zlib_inflateEnd(&stream); + if (stream.next_out && stream.next_out != (unsigned char *)zisofs_sink_page) + kunmap_local(stream.next_out); z_eio: mutex_unlock(&zisofs_zlib_lock); @@ -283,9 +288,7 @@ static int 
zisofs_fill_pages(struct inode *inode, int full_page, int pcount, } if (poffset && *pages) { - memset(page_address(*pages) + poffset, 0, - PAGE_SIZE - poffset); - flush_dcache_page(*pages); + memzero_page(*pages, poffset, PAGE_SIZE - poffset); SetPageUptodate(*pages); } return 0; @@ -343,10 +346,8 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) for (i = 0; i < pcount; i++, index++) { if (i != full_page) pages[i] = grab_cache_page_nowait(mapping, index); - if (pages[i]) { + if (pages[i]) ClearPageError(pages[i]); - kmap(pages[i]); - } } err = zisofs_fill_pages(inode, full_page, pcount, pages); @@ -357,7 +358,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) flush_dcache_page(pages[i]); if (i == full_page && err) SetPageError(pages[i]); - kunmap(pages[i]); unlock_page(pages[i]); if (i != full_page) put_page(pages[i]); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 88bf20303466..df9d70588b60 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1277,13 +1277,11 @@ static int isofs_read_level3_size(struct inode *inode) } while (more_entries); out: kfree(tmpde); - if (bh) - brelse(bh); + brelse(bh); return 0; out_nomem: - if (bh) - brelse(bh); + brelse(bh); return -ENOMEM; out_noread: @@ -1486,8 +1484,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) ret = 0; out: kfree(tmpde); - if (bh) - brelse(bh); + brelse(bh); return ret; out_badread: diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b2b2bc9b88d9..4810438b7856 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -122,8 +122,8 @@ static int journal_submit_commit_record(journal_t *journal, { struct commit_header *tmp; struct buffer_head *bh; - int ret; struct timespec64 now; + blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC; *cbh = NULL; @@ -155,13 +155,11 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | - REQ_FUA, bh); - else - ret = submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + write_flags |= REQ_PREFLUSH | REQ_FUA; + submit_bh(write_flags, bh); *cbh = bh; - return ret; + return 0; } /* @@ -209,14 +207,13 @@ int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) } /* Send all the data buffers related to an inode */ -int jbd2_submit_inode_data(struct jbd2_inode *jinode) +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) return 0; trace_jbd2_submit_inode_data(jinode->i_vfs_inode); - return jbd2_journal_submit_inode_data_buffers(jinode); + return journal->j_submit_inode_data_buffers(jinode); } EXPORT_SYMBOL(jbd2_submit_inode_data); @@ -570,7 +567,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) journal->j_running_transaction = NULL; start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; - wake_up(&journal->j_wait_transaction_locked); + wake_up_all(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); jbd2_debug(3, "JBD2: commit phase 2a\n"); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 6350d3857c89..2696f43e7239 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -923,10 +923,16 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) { bh = journal->j_fc_wbuf[i]; wait_on_buffer(bh); + /* + * Update j_fc_off so that jbd2_fc_release_bufs() can release the + * remaining buffer heads.
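The j_fc_off adjustment this comment introduces exists so that error handling composes with a separate release path: when waiting on a batch in reverse order, record on the first failure how many slots are still owned, and a later cleanup pass releases exactly those. A reduced sketch of that shape (op_wait_ok()/op_release() are stand-ins for the wait_on_buffer()/buffer_uptodate() pair and put_bh(); not kernel APIs):

#include <stdbool.h>
#include <stddef.h>

struct batch {
	void *buf[16];
	size_t off;	/* slots [0, off) are still owned by the batch */
};

/* Stand-in: wait for the async op on @buf and report success. */
static bool op_wait_ok(void *buf) { return buf != NULL; }
static void op_release(void *buf) { (void)buf; }

/* Wait on the last @nr submitted slots, newest first; assumes nr <= off. */
static int wait_batch(struct batch *b, size_t nr)
{
	for (size_t i = b->off; i-- > b->off - nr; ) {
		if (!op_wait_ok(b->buf[i])) {
			b->off = i + 1;	/* cleanup still owns slots [0, i] */
			return -1;
		}
		op_release(b->buf[i]);
		b->buf[i] = NULL;
	}
	b->off -= nr;
	return 0;
}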
+ */ + if (unlikely(!buffer_uptodate(bh))) { + journal->j_fc_off = i + 1; + return -EIO; + } put_bh(bh); journal->j_fc_wbuf[i] = NULL; - if (unlikely(!buffer_uptodate(bh))) - return -EIO; } return 0; @@ -1606,7 +1612,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; - int ret; + int ret = 0; /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(bh)) { @@ -1636,7 +1642,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE | write_flags, bh); + submit_bh(REQ_OP_WRITE | write_flags, bh); wait_on_buffer(bh); if (buffer_write_io_error(bh)) { clear_buffer_write_io_error(bh); @@ -1644,9 +1650,8 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) ret = -EIO; } if (ret) { - printk(KERN_ERR "JBD2: Error %d detected when updating " - "journal superblock for %s.\n", ret, - journal->j_devname); + printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n", + journal->j_devname); if (!is_journal_aborted(journal)) jbd2_journal_abort(journal, ret); } @@ -1893,19 +1898,16 @@ static int journal_get_superblock(journal_t *journal) { struct buffer_head *bh; journal_superblock_t *sb; - int err = -EIO; + int err; bh = journal->j_sb_buffer; J_ASSERT(bh != NULL); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - printk(KERN_ERR - "JBD2: IO error reading journal superblock\n"); - goto out; - } + err = bh_read(bh, 0); + if (err < 0) { + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); + goto out; } if (buffer_verified(bh)) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index f548479615c6..8286a9ec122f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start) if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; if (nbufs == MAXBUF) { - ll_rw_block(REQ_OP_READ, nbufs, bufs); + bh_readahead_batch(nbufs, bufs, 0); journal_brelse_array(bufs, nbufs); nbufs = 0; } @@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start) } if (nbufs) - ll_rw_block(REQ_OP_READ, nbufs, bufs); + bh_readahead_batch(nbufs, bufs, 0); err = 0; failed: @@ -152,9 +152,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal, return -ENOMEM; if (!buffer_uptodate(bh)) { - /* If this is a brand new buffer, start readahead. - Otherwise, we assume we are already reading it. */ - if (!buffer_req(bh)) + /* + * If this is a brand new buffer, start readahead. + * Otherwise, we assume we are already reading it. 
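The policy described here - kick readahead only for a brand-new buffer, then block on the one block actually needed - has a loose userspace analogue: hint a larger window with posix_fadvise(2), then issue the blocking read. A sketch:

#define _XOPEN_SOURCE 700
#include <fcntl.h>
#include <unistd.h>

/* Read one @blksz-sized block, hinting readahead of the window after it. */
static ssize_t read_block(int fd, void *buf, size_t blksz, off_t block)
{
	off_t off = block * (off_t)blksz;

	/* Best-effort prefetch hint; failure only costs performance. */
	posix_fadvise(fd, off, 32 * blksz, POSIX_FADV_WILLNEED);

	return pread(fd, buf, blksz, off);	/* the wait_on_buffer() step */
}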
+ */ + bool need_readahead = !buffer_req(bh); + + bh_read_nowait(bh, 0); + if (need_readahead) do_readahead(journal, offset); wait_on_buffer(bh); } @@ -256,6 +261,7 @@ static int fc_do_one_pass(journal_t *journal, err = journal->j_fc_replay_callback(journal, bh, pass, next_fc_block - journal->j_fc_first, expected_commit_id); + brelse(bh); next_fc_block++; if (err < 0 || err == JBD2_FC_REPLAY_STOP) break; @@ -687,7 +693,6 @@ static int do_one_pass(journal_t *journal, mark_buffer_dirty(nbh); BUFFER_TRACE(nbh, "marking uptodate"); ++info->nr_replays; - /* ll_rw_block(WRITE, 1, &nbh); */ unlock_buffer(nbh); brelse(obh); brelse(nbh); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e1be93ccd81c..6a404ac1c178 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -168,7 +168,7 @@ static void wait_transaction_locked(journal_t *journal) int need_to_start; tid_t tid = journal->j_running_transaction->t_tid; - prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); @@ -194,7 +194,7 @@ static void wait_transaction_switching(journal_t *journal) read_unlock(&journal->j_state_lock); return; } - prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); read_unlock(&journal->j_state_lock); /* @@ -920,7 +920,7 @@ void jbd2_journal_unlock_updates (journal_t *journal) write_lock(&journal->j_state_lock); --journal->j_barrier_count; write_unlock(&journal->j_state_lock); - wake_up(&journal->j_wait_transaction_locked); + wake_up_all(&journal->j_wait_transaction_locked); } static void warn_dirty_buffer(struct buffer_head *bh) diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index e945e3484788..8bb58ce5c06c 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -229,10 +229,11 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int rc, xprefix; + struct inode *inode = d_inode(dentry); switch (type) { case ACL_TYPE_ACCESS: diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9d9fb7cf093e..ca36a6eca594 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); -int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index c0aabbcbfd58..f399b390b5f6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -62,7 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .get_acl = jffs2_get_acl, + .get_inode_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index ba86acbe12d3..3cf71befa475 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -64,7 +64,7 @@ const struct file_operations 
jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .get_acl = jffs2_get_acl, + .get_inode_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 39cec28096a7..66af51c41619 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -202,7 +202,7 @@ int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return rc; } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index c6821a509481..4061e0ba7010 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, { int i, ret; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; @@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); ops.mode = MTD_OPS_AUTO_OOB; @@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { int ret; - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); ops.mode = MTD_OPS_AUTO_OOB; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a653f34c6e26..3b667eccc73b 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -94,12 +94,13 @@ out: return rc; } -int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int rc; tid_t tid; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; tid = txBegin(inode->i_sb, 0); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 332dc9ac47a9..88663465aecd 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -123,7 +123,7 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return rc; } @@ -133,7 +133,7 @@ const struct inode_operations jfs_file_inode_operations = { .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL - .get_acl = jfs_get_acl, + .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index d1ec920aa030..8ac10e396050 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -264,11 +264,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, return rc; } -static int jfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, jfs_get_block, wbc); -} - static int jfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -355,12 +350,12 @@ const struct address_space_operations jfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = jfs_read_folio, .readahead = jfs_readahead, - .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, .write_end = jfs_write_end, .bmap = jfs_bmap, 
.direct_IO = jfs_direct_IO, + .migrate_folio = buffer_migrate_folio, }; /* diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 3de40286d31f..f0704a25835f 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -8,7 +8,7 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); -int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 6b838d3ae7c2..765838578a72 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -155,7 +155,7 @@ int dbMount(struct inode *ipbmap) struct bmap *bmp; struct dbmap_disk *dbmp_le; struct metapage *mp; - int i; + int i, err; /* * allocate/initialize the in-memory bmap descriptor @@ -170,8 +170,8 @@ int dbMount(struct inode *ipbmap) BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, PSIZE, 0); if (mp == NULL) { - kfree(bmp); - return -EIO; + err = -EIO; + goto err_kfree_bmp; } /* copy the on-disk bmap descriptor to its in-memory version. */ @@ -181,9 +181,8 @@ int dbMount(struct inode *ipbmap) bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); if (!bmp->db_numag) { - release_metapage(mp); - kfree(bmp); - return -EINVAL; + err = -EINVAL; + goto err_release_metapage; } bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); @@ -194,6 +193,16 @@ int dbMount(struct inode *ipbmap) bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) { + err = -EINVAL; + goto err_release_metapage; + } + + if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { + err = -EINVAL; + goto err_release_metapage; + } + for (i = 0; i < MAXAG; i++) bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); @@ -214,6 +223,12 @@ int dbMount(struct inode *ipbmap) BMAP_LOCK_INIT(bmp); return (0); + +err_release_metapage: + release_metapage(mp); +err_kfree_bmp: + kfree(bmp); + return err; } diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h index 1c984214e95e..a0ee4ccea66e 100644 --- a/fs/jfs/jfs_extent.h +++ b/fs/jfs/jfs_extent.h @@ -10,9 +10,7 @@ (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) extern int extAlloc(struct inode *, s64, s64, xad_t *, bool); -extern int extFill(struct inode *, xad_t *); extern int extHint(struct inode *, s64, xad_t *); -extern int extRealloc(struct inode *, s64, xad_t *, bool); extern int extRecord(struct inode *, xad_t *); #endif /* _H_JFS_EXTENT */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 799d3837e7c2..390cbfce391f 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -310,8 +310,8 @@ int diRead(struct inode *ip) iagno = INOTOIAG(ip->i_ino); /* read the iag */ - imap = JFS_IP(ipimap)->i_imap; IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + imap = JFS_IP(ipimap)->i_imap; rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) { diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 48d1f70f786c..b83aae56a1f2 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -234,11 +234,15 @@ int jfs_mount_rw(struct super_block *sb, int remount) truncate_inode_pages(sbi->ipimap->i_mapping, 0); truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + + IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP); 
diUnmount(sbi->ipimap, 1); if ((rc = diMount(sbi->ipimap))) { + IWRITE_UNLOCK(sbi->ipimap); jfs_err("jfs_mount_rw: diMount failed!"); return rc; } + IWRITE_UNLOCK(sbi->ipimap); dbUnmount(sbi->ipbmap, 1); if ((rc = dbMount(sbi->ipbmap))) { diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 3e8b13e6aa01..8ec43f53f686 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -68,7 +68,6 @@ int jfs_umount(struct super_block *sb) /* * close secondary aggregate inode allocation map */ - ipaimap2 = sbi->ipaimap2; if (ipaimap2) { diUnmount(ipaimap2, 0); diFreeSpecial(ipaimap2); @@ -78,7 +77,6 @@ int jfs_umount(struct super_block *sb) /* * close aggregate inode allocation map */ - ipaimap = sbi->ipaimap; diUnmount(ipaimap, 0); diFreeSpecial(ipaimap); sbi->ipaimap = NULL; @@ -89,7 +87,7 @@ int jfs_umount(struct super_block *sb) dbUnmount(ipbmap, 0); diFreeSpecial(ipbmap); - sbi->ipimap = NULL; + sbi->ipbmap = NULL; /* * Make sure all metadata makes it to disk before we mark diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index c50167a7bc50..0d33816d251d 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -25,7 +25,7 @@ struct jfs_ea_list { struct jfs_ea ea[]; /* Variable length list */ }; -/* Macros for defining maxiumum number of bytes supported for EAs */ +/* Macros for defining maximum number of bytes supported for EAs */ #define MAXEASIZE 65535 #define MAXEALISTSIZE MAXEASIZE diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index 142caafc73b1..ad7592191d76 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -96,12 +96,8 @@ extern int xtInsert(tid_t tid, struct inode *ip, extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, int flag); extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); -extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, - int flag); extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); -extern int xtRelocate(tid_t tid, struct inode *ip, - xad_t * oxad, s64 nxaddr, int xtype); extern int xtAppend(tid_t tid, struct inode *ip, int xflag, s64 xoff, int maxblocks, int *xlenp, s64 * xaddrp, int flag); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9db4f5789c0e..a38d14eed047 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -946,7 +946,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, if (ssize <= IDATASIZE) { ip->i_op = &jfs_fast_symlink_inode_operations; - ip->i_link = JFS_IP(ip)->i_inline; + ip->i_link = JFS_IP(ip)->i_inline_all; memcpy(ip->i_link, name, ssize); ip->i_size = ssize - 1; @@ -1525,7 +1525,7 @@ const struct inode_operations jfs_dir_inode_operations = { .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL - .get_acl = jfs_get_acl, + .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 85d4f44f2ac4..d2f82cb7db1b 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -745,8 +745,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? 
- sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); @@ -785,8 +784,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, inode_lock(inode); while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1cc88ba6de90..935ef8cb02b2 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -31,10 +31,15 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) +static bool __kernfs_active(struct kernfs_node *kn) +{ + return atomic_read(&kn->active) >= 0; +} + static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); - return atomic_read(&kn->active) >= 0; + return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) @@ -120,9 +125,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * - * [3] when @kn_to is NULL result will be "(null)" + * [3] when @kn_to is %NULL result will be "(null)" * - * Returns the length of the full path. If the full length is equal to or + * Return: the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ @@ -180,10 +185,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is - * similar to strlcpy(). It returns the length of @kn's name and if @buf - * isn't long enough, it's filled upto @buflen-1 and nul terminated. + * similar to strlcpy(). + * + * Fills buffer with "(null)" if @kn is %NULL. * - * Fills buffer with "(null)" if @kn is NULL. + * Return: the length of @kn's name and if @buf isn't long enough, + * it's filled up to @buflen-1 and nul terminated. * * This function can be called from any context. */ @@ -210,7 +217,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * Returns the length of the full path. If the full length is equal to or + * Return: the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ @@ -282,6 +289,8 @@ out: * * Determines @kn's parent, pins and returns it. This function can be * called from any context. + * + * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { @@ -297,11 +306,11 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) } /** - * kernfs_name_hash + * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * - * Returns 31 bit hash of ns + name (so it fits in an off_t ) + * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { @@ -349,8 +358,8 @@ static int kernfs_sd_compare(const struct kernfs_node *left, * Locking: * kernfs_rwsem held exclusive * - * RETURNS: - * 0 on susccess -EEXIST on failure. + * Return: + * %0 on success, -EEXIST on failure. 
*/ static int kernfs_link_sibling(struct kernfs_node *kn) { @@ -389,8 +398,10 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from - * kn->parent->dir.children. Returns %true if @kn was actually - * removed, %false if @kn wasn't on the rbtree. + * kn->parent->dir.children. + * + * Return: %true if @kn was actually removed, + * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive @@ -414,10 +425,10 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn) * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn - * is NULL. + * is %NULL. * - * RETURNS: - * Pointer to @kn on success, NULL on failure. + * Return: + * Pointer to @kn on success, %NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { @@ -437,7 +448,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn - * is NULL. + * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { @@ -459,7 +470,7 @@ void kernfs_put_active(struct kernfs_node *kn) * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * - * Drain existing usages and nuke all existing mmaps of @kn. Mutiple + * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ @@ -472,6 +483,16 @@ static void kernfs_drain(struct kernfs_node *kn) lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); + /* + * Skip draining if already fully drained. This avoids draining and its + * lockdep annotations for nodes which have never been activated + * allowing embedding kernfs_remove() in create error paths without + * worrying about draining. + */ + if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && + !kernfs_should_drain_open_files(kn)) + return; + up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { @@ -480,7 +501,6 @@ static void kernfs_drain(struct kernfs_node *kn) lock_contended(&kn->dep_map, _RET_IP_); } - /* but everyone should wait for draining */ wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); @@ -489,7 +509,8 @@ static void kernfs_drain(struct kernfs_node *kn) rwsem_release(&kn->dep_map, _RET_IP_); } - kernfs_drain_open_files(kn); + if (kernfs_should_drain_open_files(kn)) + kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } @@ -562,7 +583,7 @@ EXPORT_SYMBOL_GPL(kernfs_put); * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * - * Return the kernfs_node associated with @dentry. If @dentry is not a + * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry @@ -669,8 +690,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * - * RETURNS: - * NULL on failure. Return a kernfs node with reference counter incremented + * Return: %NULL on failure, + * otherwise a kernfs node with reference counter incremented. 
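The lookup documented here pivots on atomic_inc_not_zero(): take a reference only if the count has not already dropped to zero. For reference, the same primitive in portable C11 is a small compare-exchange loop:

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference iff the object is still live (count != 0). */
static bool ref_get_not_zero(atomic_uint *count)
{
	unsigned int old = atomic_load(count);

	while (old != 0) {
		/* On failure, 'old' is reloaded with the current value. */
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return true;
	}
	return false;	/* count hit zero: object is being torn down */
}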
*/ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) @@ -696,12 +717,11 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, } /* - * ACTIVATED is protected with kernfs_mutex but it was clear when - * @kn was added to idr and we just wanna see it set. No need to - * grab kernfs_mutex. + * We should fail if @kn has never been activated and guarantee success + * if the caller knows that @kn is active. Both can be achieved by + * __kernfs_active() which tests @kn->active without kernfs_rwsem. */ - if (unlikely(!(kn->flags & KERNFS_ACTIVATED) || - !atomic_inc_not_zero(&kn->count))) + if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; spin_unlock(&kernfs_idr_lock); @@ -719,8 +739,8 @@ err_unlock: * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already + * Return: + * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) @@ -743,10 +763,7 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; ret = -ENOENT; - if (parent->flags & KERNFS_EMPTY_DIR) - goto out_unlock; - - if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) + if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); @@ -786,8 +803,9 @@ out_unlock: * @name: name to look for * @ns: the namespace tag to use * - * Look for kernfs_node with name @name under @parent. Returns pointer to - * the found kernfs_node on success, %NULL on failure. + * Look for kernfs_node with name @name under @parent. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, @@ -860,8 +878,9 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference - * if found. This function may sleep and returns pointer to the found - * kernfs_node on success, %NULL on failure. + * if found. This function may sleep. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) @@ -885,8 +904,9 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference - * if found. This function may sleep and returns pointer to the found - * kernfs_node on success, %NULL on failure. + * if found. This function may sleep. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) @@ -908,7 +928,7 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * - * Returns the root of the new hierarchy on success, ERR_PTR() value on + * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. 
*/ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, @@ -980,6 +1000,8 @@ void kernfs_destroy_root(struct kernfs_root *root) /** * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root * @root: root to use to lookup + * + * Return: @root's kernfs_node */ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) { @@ -996,7 +1018,7 @@ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * - * Returns the created node on success, ERR_PTR() value on failure. + * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, @@ -1030,7 +1052,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, * @parent: parent in which to create a new directory * @name: name of the new directory * - * Returns the created node on success, ERR_PTR() value on failure. + * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) @@ -1072,20 +1094,30 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) /* If the kernfs parent node has changed discard and * proceed to ->lookup. + * + * There's nothing special needed here when getting the + * dentry parent, even if a concurrent rename is in + * progress. That's because the dentry is negative so + * it can only be the target of the rename and it will + * be doing a d_move() not a replace. Consequently the + * dentry d_parent won't change over the d_move(). + * + * Also kernfs negative dentries transitioning from + * negative to positive during revalidate won't happen + * because they are invalidated on containing directory + * changes and the lookup re-done so that a new positive + * dentry can be properly created. */ - spin_lock(&dentry->d_lock); + root = kernfs_root_from_sb(dentry->d_sb); + down_read(&root->kernfs_rwsem); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { - spin_unlock(&dentry->d_lock); - root = kernfs_root(parent); - down_read(&root->kernfs_rwsem); if (kernfs_dir_changed(parent, dentry)) { up_read(&root->kernfs_rwsem); return 0; } - up_read(&root->kernfs_rwsem); - } else - spin_unlock(&dentry->d_lock); + } + up_read(&root->kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. @@ -1279,6 +1311,8 @@ static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. + * + * Return: the next descendant to visit or %NULL when done. 
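The traversal contract documented here is plain iterative post-order over an n-ary tree: children first, @root last, no recursion and no allocation. A standalone version over a hypothetical node layout (first_child/next_sibling/parent are assumptions of this sketch, not the kernfs rbtree representation):

#include <stddef.h>

struct node {
	struct node *parent;
	struct node *first_child;
	struct node *next_sibling;
};

static struct node *leftmost_descendant(struct node *n)
{
	while (n->first_child)
		n = n->first_child;
	return n;
}

/* Pass NULL as @pos to start; returns NULL after @root has been visited. */
static struct node *next_post(struct node *pos, struct node *root)
{
	if (!pos)			/* first call: deepest leftmost node */
		return leftmost_descendant(root);
	if (pos == root)		/* root is always the last node */
		return NULL;
	if (pos->next_sibling)		/* descend into the next subtree */
		return leftmost_descendant(pos->next_sibling);
	return pos->parent;		/* children done: parent's turn */
}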
*/ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { @@ -1304,6 +1338,21 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return pos->parent; } +static void kernfs_activate_one(struct kernfs_node *kn) +{ + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); + + kn->flags |= KERNFS_ACTIVATED; + + if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) + return; + + WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb)); + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); +} + /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated @@ -1325,15 +1374,42 @@ void kernfs_activate(struct kernfs_node *kn) down_write(&root->kernfs_rwsem); pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (pos->flags & KERNFS_ACTIVATED) - continue; + while ((pos = kernfs_next_descendant_post(pos, kn))) + kernfs_activate_one(pos); + + up_write(&root->kernfs_rwsem); +} + +/** + * kernfs_show - show or hide a node + * @kn: kernfs_node to show or hide + * @show: whether to show or hide + * + * If @show is %false, @kn is marked hidden and deactivated. A hidden node is + * ignored in future activations. If %true, the mark is removed and activation + * state is restored. This function won't implicitly activate a new node in a + * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. + * + * To avoid recursion complexities, directories aren't supported for now. + */ +void kernfs_show(struct kernfs_node *kn, bool show) +{ + struct kernfs_root *root = kernfs_root(kn); + + if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) + return; - WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); - WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); + down_write(&root->kernfs_rwsem); - atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); - pos->flags |= KERNFS_ACTIVATED; + if (show) { + kn->flags &= ~KERNFS_HIDDEN; + if (kn->flags & KERNFS_ACTIVATED) + kernfs_activate_one(kn); + } else { + kn->flags |= KERNFS_HIDDEN; + if (kernfs_active(kn)) + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + kernfs_drain(kn); } up_write(&root->kernfs_rwsem); @@ -1358,34 +1434,27 @@ static void __kernfs_remove(struct kernfs_node *kn) pr_debug("kernfs %s: removing\n", kn->name); - /* prevent any new usage under @kn by deactivating all nodes */ + /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) + while ((pos = kernfs_next_descendant_post(pos, kn))) { + pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* - * kernfs_drain() drops kernfs_rwsem temporarily and @pos's + * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); - /* - * Drain iff @kn was activated. This avoids draining and - * its lockdep annotations for nodes which have never been - * activated and allows embedding kernfs_remove() in create - * error paths without worrying about draining.
- */ - if (kn->flags & KERNFS_ACTIVATED) - kernfs_drain(pos); - else - WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + kernfs_drain(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it @@ -1507,6 +1576,8 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn) * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. + * + * Return: %true if @kn is removed by this call, otherwise %false. */ bool kernfs_remove_self(struct kernfs_node *kn) { @@ -1567,7 +1638,8 @@ bool kernfs_remove_self(struct kernfs_node *kn) * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. - * Returns 0 on success, -ENOENT if such entry doesn't exist. + * + * Return: %0 on success, -ENOENT if such entry doesn't exist. */ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) @@ -1585,8 +1657,11 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); - if (kn) + if (kn) { + kernfs_get(kn); __kernfs_remove(kn); + kernfs_put(kn); + } up_write(&root->kernfs_rwsem); @@ -1602,6 +1677,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag + * + * Return: %0 on success, -errno on failure. */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b3ec34386b43..e4a50e4ff0d2 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -23,6 +23,8 @@ struct kernfs_open_node { atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ + unsigned int nr_mmapped; + unsigned int nr_to_release; }; /* @@ -31,7 +33,7 @@ struct kernfs_open_node { * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next - * pointer for NULL. + * pointer for %NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) @@ -57,31 +59,19 @@ static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) } /** - * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn. + * of_on - Get the kernfs_open_node of the specified kernfs_open_file + * @of: target kernfs_open_file * - * @of: associated kernfs_open_file instance. - * @kn: target kernfs_node. - * - * Fetch and return ->attr.open of @kn if @of->list is non empty. - * If @of->list is not empty we can safely assume that @of is on - * @kn->attr.open->files list and this guarantees that @kn->attr.open - * will not vanish i.e. dereferencing outside RCU read-side critical - * section is safe here. - * - * The caller needs to make sure that @of->list is not empty. 
+ * Return: the kernfs_open_node of the kernfs_open_file */ -static struct kernfs_open_node * -kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn) +static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { - struct kernfs_open_node *on; - - on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list)); - - return on; + return rcu_dereference_protected(of->kn->attr.open, + !list_empty(&of->list)); } /** - * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn + * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * @@ -94,9 +84,11 @@ kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn) * outside RCU read-side critical section. * * The caller needs to make sure that kernfs_open_file_mutex is held. + * + * Return: @kn->attr.open when kernfs_open_file_mutex is held. */ static struct kernfs_open_node * -kernfs_deref_open_node_protected(struct kernfs_node *kn) +kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); @@ -207,12 +199,8 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v) static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; - struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn); - - if (!on) - return -EINVAL; - of->event = atomic_read(&on->event); + of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } @@ -235,7 +223,6 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; - struct kernfs_open_node *on; char *buf; buf = of->prealloc_buf; @@ -257,14 +244,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_free; } - on = kernfs_deref_open_node(of, of->kn); - if (!on) { - len = -EINVAL; - mutex_unlock(&of->mutex); - goto out_free; - } - - of->event = atomic_read(&on->event); + of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) @@ -553,6 +533,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) rc = 0; of->mmapped = true; + of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: @@ -571,40 +552,39 @@ out_unlock: * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * - * LOCKING: + * Locking: * Kernel thread context (may sleep). * - * RETURNS: - * 0 on success, -errno on failure. + * Return: + * %0 on success, -errno on failure. 
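kernfs_get_open_node(), whose doc block closes here, is the classic get-or-create pattern: take the mutex, re-check for an existing node, allocate on first open, then link the opener. Reduced to its skeleton with pthreads (the names and the single global node are illustrative only):

#include <pthread.h>
#include <stdlib.h>

struct open_node {
	int nr_users;	/* stands in for the files list and counters */
};

static pthread_mutex_t open_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct open_node *the_node;	/* stands in for kn->attr.open */

static int get_open_node(void)
{
	pthread_mutex_lock(&open_mutex);
	if (!the_node) {	/* first opener creates it */
		the_node = calloc(1, sizeof(*the_node));
		if (!the_node) {
			pthread_mutex_unlock(&open_mutex);
			return -1;
		}
	}
	the_node->nr_users++;	/* link this opener */
	pthread_mutex_unlock(&open_mutex);
	return 0;
}

static void put_open_node(void)
{
	pthread_mutex_lock(&open_mutex);
	if (--the_node->nr_users == 0) {	/* last opener tears it down */
		free(the_node);
		the_node = NULL;
	}
	pthread_mutex_unlock(&open_mutex);
}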
*/ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { - struct kernfs_open_node *on, *new_on = NULL; - struct mutex *mutex = NULL; + struct kernfs_open_node *on; + struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); - if (on) { - list_add_tail(&of->list, &on->files); - mutex_unlock(mutex); - return 0; - } else { + if (!on) { /* not there, initialize a new one */ - new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); - if (!new_on) { + on = kzalloc(sizeof(*on), GFP_KERNEL); + if (!on) { mutex_unlock(mutex); return -ENOMEM; } - atomic_set(&new_on->event, 1); - init_waitqueue_head(&new_on->poll); - INIT_LIST_HEAD(&new_on->files); - list_add_tail(&of->list, &new_on->files); - rcu_assign_pointer(kn->attr.open, new_on); + atomic_set(&on->event, 1); + init_waitqueue_head(&on->poll); + INIT_LIST_HEAD(&on->files); + rcu_assign_pointer(kn->attr.open, on); } - mutex_unlock(mutex); + list_add_tail(&of->list, &on->files); + if (kn->flags & KERNFS_HAS_RELEASE) + on->nr_to_release++; + + mutex_unlock(mutex); return 0; } @@ -613,6 +593,7 @@ static int kernfs_get_open_node(struct kernfs_node *kn, * * @kn: target kernfs_node * @of: associated kernfs_open_file + * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free @@ -622,21 +603,30 @@ static int kernfs_get_open_node(struct kernfs_node *kn, * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, - struct kernfs_open_file *of) + struct kernfs_open_file *of, + bool open_failed) { struct kernfs_open_node *on; - struct mutex *mutex = NULL; + struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } - if (of) + if (of) { + if (kn->flags & KERNFS_HAS_RELEASE) { + WARN_ON_ONCE(of->released == open_failed); + if (open_failed) + on->nr_to_release--; + } + if (of->mmapped) + on->nr_mmapped--; list_del(&of->list); + } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); @@ -763,7 +753,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) return 0; err_put_node: - kernfs_unlink_open_file(kn, of); + kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: @@ -795,6 +785,7 @@ static void kernfs_release_file(struct kernfs_node *kn, */ kn->attr.ops->release(of); of->released = true; + of_on(of)->nr_to_release--; } } @@ -802,15 +793,16 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); - struct mutex *mutex = NULL; if (kn->flags & KERNFS_HAS_RELEASE) { + struct mutex *mutex; + mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } - kernfs_unlink_open_file(kn, of); + kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); @@ -818,28 +810,33 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) return 0; } -void kernfs_drain_open_files(struct kernfs_node *kn) +bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; - struct kernfs_open_file *of; - struct mutex *mutex = NULL; - - if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE))) - return; + bool ret; /* - 
* lockless opportunistic check is safe below because no one is adding to - * ->attr.open at this point of time. This check allows early bail out - * if ->attr.open is already NULL. kernfs_unlink_open_file makes - * ->attr.open NULL only while holding kernfs_open_file_mutex so below - * check under kernfs_open_file_mutex_ptr(kn) will ensure bailing out if - * ->attr.open became NULL while waiting for the mutex. + * @kn being deactivated guarantees that @kn->attr.open can't change + * beneath us making the lockless test below safe. */ - if (!rcu_access_pointer(kn->attr.open)) - return; + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + rcu_read_lock(); + on = rcu_dereference(kn->attr.open); + ret = on && (on->nr_mmapped || on->nr_to_release); + rcu_read_unlock(); + + return ret; +} + +void kernfs_drain_open_files(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; @@ -848,13 +845,17 @@ void kernfs_drain_open_files(struct kernfs_node *kn) list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); - if (kn->flags & KERNFS_HAS_MMAP) + if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); + of->mmapped = false; + on->nr_mmapped--; + } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } + WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } @@ -874,11 +875,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { - struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); - struct kernfs_open_node *on = kernfs_deref_open_node(of, kn); - - if (!on) - return EPOLLERR; + struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); @@ -1031,7 +1028,7 @@ const struct file_operations kernfs_file_fops = { * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * - * Returns the created node on success, ERR_PTR() value on error. + * Return: the created node on success, ERR_PTR() value on error. */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 3d783d80f5da..eac0f210299a 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -94,7 +94,7 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) * @kn: target node * @iattr: iattr to set * - * Returns 0 on success, -errno on failure. + * Return: %0 on success, -errno on failure. */ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { @@ -190,10 +190,8 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, struct kernfs_root *root = kernfs_root(kn); down_read(&root->kernfs_rwsem); - spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); generic_fillattr(&init_user_ns, inode, stat); - spin_unlock(&inode->i_lock); up_read(&root->kernfs_rwsem); return 0; @@ -241,11 +239,11 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) * allocated and basics are initialized. New inode is returned * locked. * - * LOCKING: + * Locking: * Kernel thread context (may sleep). * - * RETURNS: - * Pointer to allocated inode on success, NULL on failure. + * Return: + * Pointer to allocated inode on success, %NULL on failure. 
*/ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { @@ -288,10 +286,8 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, root = kernfs_root(kn); down_read(&root->kernfs_rwsem); - spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); ret = generic_permission(&init_user_ns, inode, mask); - spin_unlock(&inode->i_lock); up_read(&root->kernfs_rwsem); return ret; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 3ae214d02d44..9046d9f39e63 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -58,7 +58,7 @@ struct kernfs_root { * kernfs_root - find out the kernfs_root a kernfs_node belongs to * @kn: kernfs_node of interest * - * Return the kernfs_root @kn belongs to. + * Return: the kernfs_root @kn belongs to. */ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) { @@ -157,6 +157,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, */ extern const struct file_operations kernfs_file_fops; +bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d0859f72d2d6..e08e8d999807 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -153,7 +153,7 @@ static const struct export_operations kernfs_export_ops = { * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question * - * Return the kernfs_root associated with @sb. If @sb is not a kernfs one, + * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one, * %NULL is returned. */ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) @@ -167,7 +167,7 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) * find the next ancestor in the path down to @child, where @parent was the * ancestor whose descendant we want to find. * - * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root + * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root * node. If @parent is b, then we return the node for c. * Passing in d as @parent is not ok. */ @@ -192,6 +192,8 @@ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, * kernfs_node_dentry - get a dentry for the given kernfs_node * @kn: kernfs_node for which a dentry is needed * @sb: the kernfs super_block + * + * Return: the dentry pointer */ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb) @@ -296,7 +298,7 @@ static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) * kernfs_super_ns - determine the namespace tag of a kernfs super_block * @sb: super_block of interest * - * Return the namespace tag associated with kernfs super_block @sb. + * Return: the namespace tag associated with kernfs super_block @sb. */ const void *kernfs_super_ns(struct super_block *sb) { @@ -313,6 +315,8 @@ const void *kernfs_super_ns(struct super_block *sb) * implementation, which should set the specified ->@fs_type and ->@flags, and * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, * respectively. + * + * Return: %0 on success, -errno on failure. 
*/ int kernfs_get_tree(struct fs_context *fc) { diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 0ab13824822f..45371a70caa7 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -19,7 +19,7 @@ * @name: name of the symlink * @target: target node for the symlink to point to * - * Returns the created node on success, ERR_PTR() value on error. + * Return: the created node on success, ERR_PTR() value on error. * Ownership of the link matches ownership of the target. */ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index c5a5c7b90d72..2a39ffb8423b 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -424,6 +424,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, NTLMSSP_NEGOTIATE_56); } + if (cflags & NTLMSSP_NEGOTIATE_SEAL && smb3_encryption_negotiated(conn)) + flags |= NTLMSSP_NEGOTIATE_SEAL; + if (cflags & NTLMSSP_NEGOTIATE_ALWAYS_SIGN) flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; @@ -984,13 +987,16 @@ out: return rc; } -static int ksmbd_get_encryption_key(struct ksmbd_conn *conn, __u64 ses_id, +static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, int enc, u8 *key) { struct ksmbd_session *sess; u8 *ses_enc_key; - sess = ksmbd_session_lookup_all(conn, ses_id); + if (enc) + sess = work->sess; + else + sess = ksmbd_session_lookup_all(work->conn, ses_id); if (!sess) return -EINVAL; @@ -1078,9 +1084,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, return sg; } -int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, +int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov, unsigned int nvec, int enc) { + struct ksmbd_conn *conn = work->conn; struct smb2_transform_hdr *tr_hdr = smb2_get_msg(iov[0].iov_base); unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; int rc; @@ -1094,7 +1101,7 @@ int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); struct ksmbd_crypto_ctx *ctx; - rc = ksmbd_get_encryption_key(conn, + rc = ksmbd_get_encryption_key(work, le64_to_cpu(tr_hdr->SessionId), enc, key); diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h index 25b772653de0..362b6159a6cf 100644 --- a/fs/ksmbd/auth.h +++ b/fs/ksmbd/auth.h @@ -33,9 +33,10 @@ struct ksmbd_session; struct ksmbd_conn; +struct ksmbd_work; struct kvec; -int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, +int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov, unsigned int nvec, int enc); void ksmbd_copy_gss_neg_header(void *buf); int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess, diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 756ad631c019..12be8386446a 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -60,6 +60,12 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->local_nls = load_nls("utf8"); if (!conn->local_nls) conn->local_nls = load_nls_default(); + if (IS_ENABLED(CONFIG_UNICODE)) + conn->um = utf8_load(UNICODE_AGE(12, 1, 0)); + else + conn->um = ERR_PTR(-EOPNOTSUPP); + if (IS_ERR(conn->um)) + conn->um = NULL; atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; @@ -350,6 +356,8 @@ out: wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0); + if (IS_ENABLED(CONFIG_UNICODE)) + utf8_unload(conn->um); unload_nls(conn->local_nls); if (default_conn_ops.terminate_fn) default_conn_ops.terminate_fn(conn); diff --git a/fs/ksmbd/connection.h 
b/fs/ksmbd/connection.h index e7f7d5707951..3643354a3fa7 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -14,6 +14,7 @@ #include <net/request_sock.h> #include <linux/kthread.h> #include <linux/nls.h> +#include <linux/unicode.h> #include "smb_common.h" #include "ksmbd_work.h" @@ -46,6 +47,7 @@ struct ksmbd_conn { char *request_buf; struct ksmbd_transport *transport; struct nls_table *local_nls; + struct unicode_map *um; struct list_head conns_list; /* smb session 1 per user */ struct xarray sessions; diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index e0cbcfa98c7e..b6bd8311e6b4 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -74,6 +74,7 @@ struct ksmbd_heartbeat { #define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0) #define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1) #define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2) +#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3) /* * IPC request for ksmbd server startup @@ -163,7 +164,8 @@ struct ksmbd_share_config_response { __u16 force_directory_mode; __u16 force_uid; __u16 force_gid; - __u32 reserved[128]; /* Reserved room */ + __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; + __u32 reserved[112]; /* Reserved room */ __u32 veto_list_sz; __s8 ____payload[]; }; diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c index c9bca1c2c834..328a412259dc 100644 --- a/fs/ksmbd/mgmt/share_config.c +++ b/fs/ksmbd/mgmt/share_config.c @@ -16,6 +16,7 @@ #include "user_config.h" #include "user_session.h" #include "../transport_ipc.h" +#include "../misc.h" #define SHARE_HASH_BITS 3 static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS); @@ -26,7 +27,7 @@ struct ksmbd_veto_pattern { struct list_head list; }; -static unsigned int share_name_hash(char *name) +static unsigned int share_name_hash(const char *name) { return jhash(name, strlen(name), 0); } @@ -72,7 +73,7 @@ __get_share_config(struct ksmbd_share_config *share) return share; } -static struct ksmbd_share_config *__share_lookup(char *name) +static struct ksmbd_share_config *__share_lookup(const char *name) { struct ksmbd_share_config *share; unsigned int key = share_name_hash(name); @@ -119,7 +120,8 @@ static int parse_veto_list(struct ksmbd_share_config *share, return 0; } -static struct ksmbd_share_config *share_config_request(char *name) +static struct ksmbd_share_config *share_config_request(struct unicode_map *um, + const char *name) { struct ksmbd_share_config_response *resp; struct ksmbd_share_config *share = NULL; @@ -133,6 +135,19 @@ static struct ksmbd_share_config *share_config_request(char *name) if (resp->flags == KSMBD_SHARE_FLAG_INVALID) goto out; + if (*resp->share_name) { + char *cf_resp_name; + bool equal; + + cf_resp_name = ksmbd_casefold_sharename(um, resp->share_name); + if (IS_ERR(cf_resp_name)) + goto out; + equal = !strcmp(cf_resp_name, name); + kfree(cf_resp_name); + if (!equal) + goto out; + } + share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL); if (!share) goto out; @@ -190,20 +205,11 @@ out: return share; } -static void strtolower(char *share_name) -{ - while (*share_name) { - *share_name = tolower(*share_name); - share_name++; - } -} - -struct ksmbd_share_config *ksmbd_share_config_get(char *name) +struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, + const char *name) { struct ksmbd_share_config *share; - strtolower(name); - down_read(&shares_table_lock); share = __share_lookup(name); if (share) @@ -212,7 +218,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(char *name) if 
(share) return share; - return share_config_request(name); + return share_config_request(um, name); } bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h index 902f2cb1963a..3fd338293942 100644 --- a/fs/ksmbd/mgmt/share_config.h +++ b/fs/ksmbd/mgmt/share_config.h @@ -9,6 +9,7 @@ #include <linux/workqueue.h> #include <linux/hashtable.h> #include <linux/path.h> +#include <linux/unicode.h> struct ksmbd_share_config { char *name; @@ -74,7 +75,8 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) __ksmbd_share_config_put(share); } -struct ksmbd_share_config *ksmbd_share_config_get(char *name); +struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, + const char *name); bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, const char *filename); #endif /* __SHARE_CONFIG_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c index 97ab7987df6e..8ce17b3fb8da 100644 --- a/fs/ksmbd/mgmt/tree_connect.c +++ b/fs/ksmbd/mgmt/tree_connect.c @@ -17,7 +17,7 @@ struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - char *share_name) + const char *share_name) { struct ksmbd_tree_conn_status status = {-ENOENT, NULL}; struct ksmbd_tree_connect_response *resp = NULL; @@ -26,7 +26,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct sockaddr *peer_addr; int ret; - sc = ksmbd_share_config_get(share_name); + sc = ksmbd_share_config_get(conn->um, share_name); if (!sc) return status; @@ -61,7 +61,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct ksmbd_share_config *new_sc; ksmbd_share_config_del(sc); - new_sc = ksmbd_share_config_get(share_name); + new_sc = ksmbd_share_config_get(conn->um, share_name); if (!new_sc) { pr_err("Failed to update stale share config\n"); status.ret = -ESTALE; diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h index 71e50271dccf..0f97ddc1e39c 100644 --- a/fs/ksmbd/mgmt/tree_connect.h +++ b/fs/ksmbd/mgmt/tree_connect.h @@ -42,7 +42,7 @@ struct ksmbd_session; struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - char *share_name); + const char *share_name); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn); diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c index 3fa2139a0b30..92b1603b5abe 100644 --- a/fs/ksmbd/mgmt/user_session.c +++ b/fs/ksmbd/mgmt/user_session.c @@ -108,15 +108,17 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry->method = method; entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) - goto error; + goto free_entry; resp = ksmbd_rpc_open(sess, entry->id); if (!resp) - goto error; + goto free_id; kvfree(resp); return entry->id; -error: +free_id: + ksmbd_rpc_id_free(entry->id); +free_entry: list_del(&entry->list); kfree(entry); return -EINVAL; diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index df991107ad2c..9e8afaa686e3 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/xattr.h> #include <linux/fs.h> +#include <linux/unicode.h> #include "misc.h" #include "smb_common.h" @@ -159,7 +160,7 @@ out: */ char *convert_to_nt_pathname(struct ksmbd_share_config *share, - struct path *path) + const struct path *path) { char *pathname, *ab_pathname, 
*nt_pathname; int share_path_len = share->path_sz; @@ -226,26 +227,53 @@ void ksmbd_conv_path_to_windows(char *path) strreplace(path, '/', '\\'); } +char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name) +{ + char *cf_name; + int cf_len; + + cf_name = kzalloc(KSMBD_REQ_MAX_SHARE_NAME, GFP_KERNEL); + if (!cf_name) + return ERR_PTR(-ENOMEM); + + if (IS_ENABLED(CONFIG_UNICODE) && um) { + const struct qstr q_name = {.name = name, .len = strlen(name)}; + + cf_len = utf8_casefold(um, &q_name, cf_name, + KSMBD_REQ_MAX_SHARE_NAME); + if (cf_len < 0) + goto out_ascii; + + return cf_name; + } + +out_ascii: + cf_len = strscpy(cf_name, name, KSMBD_REQ_MAX_SHARE_NAME); + if (cf_len < 0) { + kfree(cf_name); + return ERR_PTR(-E2BIG); + } + + for (; *cf_name; ++cf_name) + *cf_name = isascii(*cf_name) ? tolower(*cf_name) : *cf_name; + return cf_name - cf_len; +} + /** * ksmbd_extract_sharename() - get share name from tree connect request * @treename: buffer containing tree name and share name * * Return: share name on success, otherwise error */ -char *ksmbd_extract_sharename(char *treename) +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename) { - char *name = treename; - char *dst; - char *pos = strrchr(name, '\\'); + const char *name = treename, *pos = strrchr(name, '\\'); if (pos) name = (pos + 1); /* caller has to free the memory */ - dst = kstrdup(name, GFP_KERNEL); - if (!dst) - return ERR_PTR(-ENOMEM); - return dst; + return ksmbd_casefold_sharename(um, name); } /** diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h index aae2a252945f..1facfcd21200 100644 --- a/fs/ksmbd/misc.h +++ b/fs/ksmbd/misc.h @@ -15,12 +15,13 @@ int match_pattern(const char *str, size_t len, const char *pattern); int ksmbd_validate_filename(char *filename); int parse_stream_name(char *filename, char **stream_name, int *s_type); char *convert_to_nt_pathname(struct ksmbd_share_config *share, - struct path *path); + const struct path *path); int get_nlink(struct kstat *st); void ksmbd_conv_path_to_unix(char *path); void ksmbd_strip_last_slash(char *path); void ksmbd_conv_path_to_windows(char *path); -char *ksmbd_extract_sharename(char *treename); +char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name); +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename); char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name); #define KSMBD_DIR_INFO_ALIGNMENT 8 diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 5052be9261d9..0ae8d08d85a8 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -345,6 +345,8 @@ int ndr_encode_posix_acl(struct ndr *n, { unsigned int ref_id = 0x00020000; int ret; + vfsuid_t vfsuid; + vfsgid_t vfsgid; n->offset = 0; n->length = 1024; @@ -372,10 +374,12 @@ int ndr_encode_posix_acl(struct ndr *n, if (ret) return ret; - ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode))); + vfsuid = i_uid_into_vfsuid(user_ns, inode); + ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid))); if (ret) return ret; - ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode))); + vfsgid = i_gid_into_vfsgid(user_ns, inode); + ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid))); if (ret) return ret; ret = ndr_write_int32(n, inode->i_mode); diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 9046cff4374b..d7d47b82451d 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1609,12 +1609,18 @@ void create_posix_rsp_buf(char *cc, struct 
ksmbd_file *fp) struct create_posix_rsp *buf; struct inode *inode = file_inode(fp->filp); struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); buf = (struct create_posix_rsp *)cc; memset(buf, 0, sizeof(struct create_posix_rsp)); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_posix_rsp, nlink)); - buf->ccontext.DataLength = cpu_to_le32(52); + /* + * DataLength = nlink(4) + reparse_tag(4) + mode(4) + + * domain sid(28) + unix group sid(16). + */ + buf->ccontext.DataLength = cpu_to_le32(56); buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_posix_rsp, Name)); buf->ccontext.NameLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); @@ -1638,13 +1644,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) buf->nlink = cpu_to_le32(inode->i_nlink); buf->reparse_tag = cpu_to_le32(fp->volatile_id); - buf->mode = cpu_to_le32(inode->i_mode); - id_to_sid(from_kuid_munged(&init_user_ns, - i_uid_into_mnt(user_ns, inode)), - SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]); - id_to_sid(from_kgid_munged(&init_user_ns, - i_gid_into_mnt(user_ns, inode)), - SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]); + buf->mode = cpu_to_le32(inode->i_mode & 0777); + /* + * SidBuffer(44) contain two sids(Domain sid(28), UNIX group sid(16)). + * Domain sid(28) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 4(num_subauth)) + RID(4). + * UNIX group id(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). + */ + id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)), + SIDOWNER, (struct smb_sid *)&buf->SidBuffer[0]); + id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)), + SIDUNIX_GROUP, (struct smb_sid *)&buf->SidBuffer[28]); } /* diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index ce42bff42ef9..394b6ceac431 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -235,10 +235,8 @@ send: if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { rc = conn->ops->encrypt_resp(work); - if (rc < 0) { + if (rc < 0) conn->ops->set_rsp_status(work, STATUS_DATA_ERROR); - goto send; - } } ksmbd_conn_write(work); @@ -434,11 +432,9 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr, "reset", "shutdown" }; - - ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version, - state[server_conf.state], server_conf.tcp_port, - server_conf.ipc_last_active / HZ); - return sz; + return sysfs_emit(buf, "%d %s %d %lu\n", stats_version, + state[server_conf.state], server_conf.tcp_port, + server_conf.ipc_last_active / HZ); } static ssize_t kill_server_store(struct class *class, @@ -470,19 +466,13 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr, for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) { if ((ksmbd_debug_types >> i) & 1) { - pos = scnprintf(buf + sz, - PAGE_SIZE - sz, - "[%s] ", - debug_type_strings[i]); + pos = sysfs_emit_at(buf, sz, "[%s] ", debug_type_strings[i]); } else { - pos = scnprintf(buf + sz, - PAGE_SIZE - sz, - "%s ", - debug_type_strings[i]); + pos = sysfs_emit_at(buf, sz, "%s ", debug_type_strings[i]); } sz += pos; } - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + sz += sysfs_emit_at(buf, sz, "\n"); return sz; } diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index ab23da2120b9..e401302478c3 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -247,8 +247,9 @@ void 
init_smb3_02_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION && - conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) @@ -271,6 +272,11 @@ int init_smb3_11_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 19412ac701a6..14d7f3599c63 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -903,7 +903,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn, return; } - if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)) + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) return; for (i = 0; i < cph_cnt; i++) { @@ -925,7 +925,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn, * * Return: true if connection should be encrypted, else false */ -static bool smb3_encryption_negotiated(struct ksmbd_conn *conn) +bool smb3_encryption_negotiated(struct ksmbd_conn *conn) { if (!conn->ops->generate_encryptionkey) return false; @@ -1508,7 +1508,8 @@ static int ntlm_authenticate(struct ksmbd_work *work) return -EINVAL; } sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION) + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; /* * signing is disable if encryption is enable * on this session @@ -1599,7 +1600,8 @@ static int krb5_authenticate(struct ksmbd_work *work) return -EINVAL; } sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION) + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; sess->sign = false; } @@ -1883,7 +1885,7 @@ int smb2_tree_connect(struct ksmbd_work *work) goto out_err1; } - name = ksmbd_extract_sharename(treename); + name = ksmbd_extract_sharename(conn->um, treename); if (IS_ERR(name)) { status.ret = KSMBD_TREE_CONN_STATUS_ERROR; goto out_err1; @@ -2185,7 +2187,7 @@ out: * Return: 0 on success, otherwise error */ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, - struct path *path) + const struct path *path) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); char *attr_name = NULL, *value; @@ -2272,7 +2274,7 @@ next: return rc; } -static noinline int smb2_set_stream_name_xattr(struct path *path, +static noinline int smb2_set_stream_name_xattr(const struct path *path, struct ksmbd_file *fp, char *stream_name, int s_type) { @@ -2311,7 +2313,7 @@ static noinline int smb2_set_stream_name_xattr(struct path *path, return 0; } -static int smb2_remove_smb_xattrs(struct path *path) +static int smb2_remove_smb_xattrs(const struct path *path) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); char *name, *xattr_list = 
NULL; @@ -2345,7 +2347,7 @@ out: return err; } -static int smb2_create_truncate(struct path *path) +static int smb2_create_truncate(const struct path *path) { int rc = vfs_truncate(path, 0); @@ -2364,7 +2366,7 @@ static int smb2_create_truncate(struct path *path) return rc; } -static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path, +static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *path, struct ksmbd_file *fp) { struct xattr_dos_attrib da = {0}; @@ -2387,7 +2389,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path, } static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, - struct path *path, struct ksmbd_file *fp) + const struct path *path, struct ksmbd_file *fp) { struct xattr_dos_attrib da; int rc; @@ -2447,7 +2449,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, static int smb2_create_sd_buffer(struct ksmbd_work *work, struct smb2_create_req *req, - struct path *path) + const struct path *path) { struct create_context *context; struct create_sd_buf_req *sd_buf; @@ -2477,16 +2479,19 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr, struct user_namespace *mnt_userns, struct inode *inode) { - fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode); - fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + + fattr->cf_uid = vfsuid_into_kuid(vfsuid); + fattr->cf_gid = vfsgid_into_kgid(vfsgid); fattr->cf_mode = inode->i_mode; fattr->cf_acls = NULL; fattr->cf_dacls = NULL; if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { - fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS); + fattr->cf_acls = get_inode_acl(inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT); + fattr->cf_dacls = get_inode_acl(inode, ACL_TYPE_DEFAULT); } } @@ -2761,7 +2766,6 @@ int smb2_open(struct ksmbd_work *work) } else { file_present = true; user_ns = mnt_user_ns(path.mnt); - generic_fillattr(user_ns, d_inode(path.dentry), &stat); } if (stream_name) { if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { @@ -2770,7 +2774,8 @@ int smb2_open(struct ksmbd_work *work) rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; } } else { - if (S_ISDIR(stat.mode) && s_type == DATA_STREAM) { + if (file_present && S_ISDIR(d_inode(path.dentry)->i_mode) && + s_type == DATA_STREAM) { rc = -EIO; rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; } @@ -2787,7 +2792,8 @@ int smb2_open(struct ksmbd_work *work) } if (file_present && req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE && - S_ISDIR(stat.mode) && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { + S_ISDIR(d_inode(path.dentry)->i_mode) && + !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { ksmbd_debug(SMB, "open() argument is a directory: %s, %x\n", name, req->CreateOptions); rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; @@ -2797,7 +2803,7 @@ int smb2_open(struct ksmbd_work *work) if (file_present && (req->CreateOptions & FILE_DIRECTORY_FILE_LE) && !(req->CreateDisposition == FILE_CREATE_LE) && - !S_ISDIR(stat.mode)) { + !S_ISDIR(d_inode(path.dentry)->i_mode)) { rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; rc = -EIO; goto err_out; @@ -2952,7 +2958,7 @@ int smb2_open(struct ksmbd_work *work) struct inode *inode = d_inode(path.dentry); posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns, - inode, + path.dentry, d_inode(path.dentry->d_parent)); if (posix_acl_rc) ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); @@ 
-2968,7 +2974,7 @@ int smb2_open(struct ksmbd_work *work) if (rc) { if (posix_acl_rc) ksmbd_vfs_set_init_posix_acl(user_ns, - inode); + path.dentry); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { @@ -3434,7 +3440,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + struct_sz = readdir_info_level_struct_sz(info_level) + conv_len; next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); d_info->last_entry_off_align = next_entry_offset - struct_sz; @@ -3561,17 +3567,22 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, posix_info->AllocationSize = cpu_to_le64(ksmbd_kstat->kstat->blocks << 9); posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev); posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink); - posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode); + posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777); posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino); posix_info->DosAttributes = S_ISDIR(ksmbd_kstat->kstat->mode) ? FILE_ATTRIBUTE_DIRECTORY_LE : FILE_ATTRIBUTE_ARCHIVE_LE; if (d_info->hide_dot_file && d_info->name[0] == '.') posix_info->DosAttributes |= FILE_ATTRIBUTE_HIDDEN_LE; + /* + * SidBuffer(32) contain two sids(Domain sid(16), UNIX group sid(16)). + * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). + */ id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid), - SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]); + SIDUNIX_USER, (struct smb_sid *)&posix_info->SidBuffer[0]); id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid), - SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]); + SIDUNIX_GROUP, (struct smb_sid *)&posix_info->SidBuffer[16]); memcpy(posix_info->name, conv_name, conv_len); posix_info->name_len = cpu_to_le32(conv_len); posix_info->NextEntryOffset = cpu_to_le32(next_entry_offset); @@ -3681,7 +3692,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return -EOPNOTSUPP; conv_len = (d_info->name_len + 1) * 2; - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + next_entry_offset = ALIGN(struct_sz + conv_len, KSMBD_DIR_INFO_ALIGNMENT); if (next_entry_offset > d_info->out_buf_len) { @@ -3776,7 +3787,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return 0; } -static int __query_dir(struct dir_context *ctx, const char *name, int namlen, +static bool __query_dir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct ksmbd_readdir_data *buf; @@ -3790,27 +3801,20 @@ static int __query_dir(struct dir_context *ctx, const char *name, int namlen, /* dot and dotdot entries are already reserved */ if (!strcmp(".", name) || !strcmp("..", name)) - return 0; + return true; if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name)) - return 0; + return true; if (!match_pattern(name, namlen, priv->search_pattern)) - return 0; + return true; d_info->name = name; d_info->name_len = namlen; rc = reserve_populate_dentry(d_info, priv->info_level); if (rc) - return rc; - if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) { + return false; + if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) d_info->out_buf_len = 0; - return 0; - } - return 0; -} - -static void restart_ctx(struct dir_context *ctx) -{ - ctx->pos = 0; + return true; } static int verify_info_level(int 
info_level) @@ -3894,8 +3898,7 @@ int smb2_query_dir(struct ksmbd_work *work) inode_permission(file_mnt_user_ns(dir_fp->filp), file_inode(dir_fp->filp), MAY_READ | MAY_EXEC)) { - pr_err("no right to enumerate directory (%pd)\n", - dir_fp->filp->f_path.dentry); + pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp); rc = -EACCES; goto err_out2; } @@ -3921,7 +3924,6 @@ int smb2_query_dir(struct ksmbd_work *work) if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) { ksmbd_debug(SMB, "Restart directory scan\n"); generic_file_llseek(dir_fp->filp, 0, SEEK_SET); - restart_ctx(&dir_fp->readdir_data.ctx); } memset(&d_info, 0, sizeof(struct ksmbd_dir_info)); @@ -3968,11 +3970,9 @@ int smb2_query_dir(struct ksmbd_work *work) */ if (!d_info.out_buf_len && !d_info.num_entry) goto no_buf_len; - if (rc == 0) - restart_ctx(&dir_fp->readdir_data.ctx); - if (rc == -ENOSPC) + if (rc > 0 || rc == -ENOSPC) rc = 0; - if (rc) + else if (rc) goto err_out; d_info.wptr = d_info.rptr; @@ -4029,6 +4029,8 @@ err_out2: rsp->hdr.Status = STATUS_NO_MEMORY; else if (rc == -EFAULT) rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; + else if (rc == -EIO) + rsp->hdr.Status = STATUS_FILE_CORRUPT_ERROR; if (!rsp->hdr.Status) rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; @@ -4158,7 +4160,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, int rc, name_len, value_len, xattr_list_len, idx; ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0; struct smb2_ea_info_req *ea_req = NULL; - struct path *path; + const struct path *path; struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); if (!(fp->daccess & FILE_READ_EA_LE)) { @@ -4495,7 +4497,7 @@ static void get_file_stream_info(struct ksmbd_work *work, struct smb2_file_stream_info *file_info; char *stream_name, *xattr_list = NULL, *stream_buf; struct kstat stat; - struct path *path = &fp->filp->f_path; + const struct path *path = &fp->filp->f_path; ssize_t xattr_list_len; int nbytes = 0, streamlen, stream_name_len, next, idx = 0; int buf_free_len; @@ -4720,7 +4722,11 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, { struct smb311_posix_qinfo *file_info; struct inode *inode = file_inode(fp->filp); + struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); u64 time; + int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32; file_info = (struct smb311_posix_qinfo *)rsp->Buffer; file_info->CreationTime = cpu_to_le64(fp->create_time); @@ -4735,12 +4741,22 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->EndOfFile = cpu_to_le64(inode->i_size); file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9); file_info->HardLinks = cpu_to_le32(inode->i_nlink); - file_info->Mode = cpu_to_le32(inode->i_mode); + file_info->Mode = cpu_to_le32(inode->i_mode & 0777); file_info->DeviceId = cpu_to_le32(inode->i_rdev); - rsp->OutputBufferLength = - cpu_to_le32(sizeof(struct smb311_posix_qinfo)); - inc_rfc1001_len(rsp_org, sizeof(struct smb311_posix_qinfo)); - return 0; + + /* + * Sids(32) contain two sids(Domain sid(16), UNIX group sid(16)). + * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). 
+ */ + id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)), + SIDUNIX_USER, (struct smb_sid *)&file_info->Sids[0]); + id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)), + SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]); + + rsp->OutputBufferLength = cpu_to_le32(out_buf_len); + inc_rfc1001_len(rsp_org, out_buf_len); + return out_buf_len; } static int smb2_get_info_file(struct ksmbd_work *work, @@ -4860,8 +4876,8 @@ static int smb2_get_info_file(struct ksmbd_work *work, pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); rc = -EOPNOTSUPP; } else { - rc = find_file_posix_info(rsp, fp, work->response_buf); - file_infoclass_size = sizeof(struct smb311_posix_qinfo); + file_infoclass_size = find_file_posix_info(rsp, fp, + work->response_buf); } break; default: @@ -5413,7 +5429,7 @@ static int smb2_rename(struct ksmbd_work *work, if (!pathname) return -ENOMEM; - abs_oldname = d_path(&fp->filp->f_path, pathname, PATH_MAX); + abs_oldname = file_path(fp->filp, pathname, PATH_MAX); if (IS_ERR(abs_oldname)) { rc = -EINVAL; goto out; @@ -5548,7 +5564,7 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "link name is %s\n", link_name); - target_name = d_path(&filp->f_path, pathname, PATH_MAX); + target_name = file_path(filp, pathname, PATH_MAX); if (IS_ERR(target_name)) { rc = -EINVAL; goto out; @@ -6266,8 +6282,8 @@ int smb2_read(struct ksmbd_work *work) goto out; } - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", - fp->filp->f_path.dentry, offset, length); + ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", + fp->filp, offset, length); work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!work->aux_payload_buf) { @@ -6531,8 +6547,8 @@ int smb2_write(struct ksmbd_work *work) data_buf = (char *)(((char *)&req->hdr.ProtocolId) + le16_to_cpu(req->DataOffset)); - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", - fp->filp->f_path.dentry, offset, length); + ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", + fp->filp, offset, length); err = ksmbd_vfs_write(work, fp, data_buf, length, &offset, writethrough, &nbytes); if (err < 0) @@ -6737,7 +6753,7 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags) case SMB2_LOCKFLAG_UNLOCK: ksmbd_debug(SMB, "received unlock request\n"); flock->fl_type = F_UNLCK; - cmd = 0; + cmd = F_SETLK; break; } @@ -6841,6 +6857,7 @@ int smb2_lock(struct ksmbd_work *work) if (lock_start > U64_MAX - lock_length) { pr_err("Invalid lock range requested\n"); rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + locks_free_lock(flock); goto out; } @@ -6860,6 +6877,7 @@ int smb2_lock(struct ksmbd_work *work) "the end offset(%llx) is smaller than the start offset(%llx)\n", flock->fl_end, flock->fl_start); rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + locks_free_lock(flock); goto out; } @@ -6871,6 +6889,7 @@ int smb2_lock(struct ksmbd_work *work) flock->fl_type != F_UNLCK) { pr_err("conflict two locks in one request\n"); err = -EINVAL; + locks_free_lock(flock); goto out; } } @@ -6879,6 +6898,7 @@ int smb2_lock(struct ksmbd_work *work) smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list); if (!smb_lock) { err = -EINVAL; + locks_free_lock(flock); goto out; } } @@ -7115,7 +7135,7 @@ out: rlock->fl_start = smb_lock->start; rlock->fl_end = smb_lock->end; - rc = vfs_lock_file(filp, 0, rlock, NULL); + rc = vfs_lock_file(filp, F_SETLK, rlock, NULL); if (rc) pr_err("rollback unlock fail : %d\n", rc); @@ -7643,11 +7663,16 @@ int smb2_ioctl(struct ksmbd_work 
*work) goto out; } - if (in_buf_len < sizeof(struct validate_negotiate_info_req)) - return -EINVAL; + if (in_buf_len < offsetof(struct validate_negotiate_info_req, + Dialects)) { + ret = -EINVAL; + goto out; + } - if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) - return -EINVAL; + if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) { + ret = -EINVAL; + goto out; + } ret = fsctl_validate_negotiate_info(conn, (struct validate_negotiate_info_req *)&req->Buffer[0], @@ -8573,7 +8598,7 @@ int smb3_encrypt_resp(struct ksmbd_work *work) buf_size += iov[1].iov_len; work->resp_hdr_sz = iov[1].iov_len; - rc = ksmbd_crypt_message(work->conn, iov, rq_nvec, 1); + rc = ksmbd_crypt_message(work, iov, rq_nvec, 1); if (rc) return rc; @@ -8592,7 +8617,6 @@ bool smb3_is_transform_hdr(void *buf) int smb3_decrypt_req(struct ksmbd_work *work) { - struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess; char *buf = work->request_buf; unsigned int pdu_length = get_rfc1002_len(buf); @@ -8612,7 +8636,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) return -ECONNABORTED; } - sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId)); + sess = ksmbd_session_lookup_all(work->conn, le64_to_cpu(tr_hdr->SessionId)); if (!sess) { pr_err("invalid session id(%llx) in transform header\n", le64_to_cpu(tr_hdr->SessionId)); @@ -8623,7 +8647,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4; iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr) + 4; iov[1].iov_len = buf_data_size; - rc = ksmbd_crypt_message(conn, iov, 2, 0); + rc = ksmbd_crypt_message(work, iov, 2, 0); if (rc) return rc; diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index af455278d005..aa5dbe54f5a1 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -158,7 +158,8 @@ struct create_posix_rsp { __le32 nlink; __le32 reparse_tag; __le32 mode; - u8 SidBuffer[40]; + /* SidBuffer contain two sids(Domain sid(28), UNIX group sid(16)) */ + u8 SidBuffer[44]; } __packed; struct smb2_buffer_desc_v1 { @@ -439,9 +440,10 @@ struct smb2_posix_info { __le32 HardLinks; __le32 ReparseTag; __le32 Mode; - u8 SidBuffer[40]; + /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ + u8 SidBuffer[32]; __le32 name_len; - u8 name[1]; + u8 name[]; /* * var sized owner SID * var sized group SID @@ -492,6 +494,7 @@ int smb3_decrypt_req(struct ksmbd_work *work); int smb3_encrypt_resp(struct ksmbd_work *work); bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work); int smb2_set_rsp_credits(struct ksmbd_work *work); +bool smb3_encryption_negotiated(struct ksmbd_conn *conn); /* smb2 misc functions */ int ksmbd_smb2_check_message(struct ksmbd_work *work); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 7f8ab14fb8ec..2a4fbbd55b91 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -4,6 +4,8 @@ * Copyright (C) 2018 Namjae Jeon <linkinjeon@kernel.org> */ +#include <linux/user_namespace.h> + #include "smb_common.h" #include "server.h" #include "misc.h" @@ -621,12 +623,12 @@ int ksmbd_override_fsids(struct ksmbd_work *work) if (share->force_gid != KSMBD_SHARE_INVALID_GID) gid = share->force_gid; - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; - cred->fsuid = make_kuid(current_user_ns(), uid); - cred->fsgid = make_kgid(current_user_ns(), gid); + cred->fsuid = make_kuid(&init_user_ns, uid); + cred->fsgid = make_kgid(&init_user_ns, gid); gi = groups_alloc(0); if (!gi) { diff 
--git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 318c16fa81da..e663ab9ea759 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -277,14 +277,14 @@ struct file_directory_info { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0x101 FF resp data */ struct file_names_info { __le32 NextEntryOffset; __u32 FileIndex; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0xc FF resp data */ struct file_full_directory_info { @@ -299,7 +299,7 @@ struct file_full_directory_info { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; - char FileName[1]; + char FileName[]; } __packed; /* level 0x102 FF resp */ struct file_both_directory_info { @@ -317,7 +317,7 @@ struct file_both_directory_info { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __packed; /* level 0x104 FFrsp data */ struct file_id_both_directory_info { @@ -337,7 +337,7 @@ struct file_id_both_directory_info { __u8 ShortName[24]; __le16 Reserved2; __le64 UniqueId; - char FileName[1]; + char FileName[]; } __packed; struct file_id_full_dir_info { @@ -354,7 +354,7 @@ struct file_id_full_dir_info { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __packed; /* level 0x105 FF rsp data */ struct smb_version_values { diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 3781bca2c8fc..ab5c68cc0e13 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -275,7 +275,8 @@ static int sid_to_id(struct user_namespace *user_ns, uid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id)); + uid = KUIDT_INIT(id); + uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid)); if (uid_valid(uid)) { fattr->cf_uid = uid; rc = 0; @@ -285,7 +286,8 @@ static int sid_to_id(struct user_namespace *user_ns, gid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id)); + gid = KGIDT_INIT(id); + gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid)); if (gid_valid(gid)) { fattr->cf_gid = gid; rc = 0; @@ -991,7 +993,7 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type, } int smb_inherit_dacl(struct ksmbd_conn *conn, - struct path *path, + const struct path *path, unsigned int uid, unsigned int gid) { const struct smb_sid *psid, *creator = NULL; @@ -1185,7 +1187,7 @@ bool smb_inherit_flags(int flags, bool is_dir) return false; } -int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); @@ -1287,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, } if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { - posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); + posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); if (posix_acls && !found) { unsigned int id = -1; @@ -1352,7 +1354,7 @@ err_out: } int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, - struct path *path, struct smb_ntsd *pntsd, int ntsd_len, + const struct path *path, struct smb_ntsd *pntsd, int ntsd_len, bool type_check) { int rc; @@ -1384,14 +1386,14 @@ int set_info_sec(struct ksmbd_conn *conn, 
struct ksmbd_tree_connect *tcon, ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, inode, + rc = set_posix_acl(user_ns, path->dentry, ACL_TYPE_ACCESS, fattr.cf_acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, inode, + rc = set_posix_acl(user_ns, path->dentry, ACL_TYPE_DEFAULT, fattr.cf_dacls); if (rc) ksmbd_debug(SMB, diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index fcb2c83f2992..618f2e0236b3 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -201,12 +201,12 @@ void posix_state_to_acl(struct posix_acl_state *state, struct posix_acl_entry *pace); int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid); bool smb_inherit_flags(int flags, bool is_dir); -int smb_inherit_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_inherit_dacl(struct ksmbd_conn *conn, const struct path *path, unsigned int uid, unsigned int gid); -int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid); int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, - struct path *path, struct smb_ntsd *pntsd, int ntsd_len, + const struct path *path, struct smb_ntsd *pntsd, int ntsd_len, bool type_check); void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); void ksmbd_init_domain(u32 *sub_auth); @@ -214,25 +214,25 @@ void ksmbd_init_domain(u32 *sub_auth); static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, struct posix_acl_entry *pace) { - kuid_t kuid; + vfsuid_t vfsuid; /* If this is an idmapped mount, apply the idmapping. */ - kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid); /* Translate the kuid into a userspace id ksmbd would see. */ - return from_kuid(&init_user_ns, kuid); + return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)); } static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, struct posix_acl_entry *pace) { - kgid_t kgid; + vfsgid_t vfsgid; /* If this is an idmapped mount, apply the idmapping. */ - kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid); /* Translate the kgid into a userspace id ksmbd would see. 
*/ - return from_kgid(&init_user_ns, kgid); + return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)); } #endif /* _SMBACL_H */ diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 7cb0eeb07c80..c9aca21637d5 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -197,6 +197,7 @@ static struct genl_family ksmbd_genl_family = { .module = THIS_MODULE, .ops = ksmbd_genl_ops, .n_ops = ARRAY_SIZE(ksmbd_genl_ops), + .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1, }; static void ksmbd_nl_init_fixup(void) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 35b55ee94fe5..096eda9ef873 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -32,7 +32,7 @@ /* SMB_DIRECT negotiation timeout in seconds */ #define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 -#define SMB_DIRECT_MAX_SEND_SGES 8 +#define SMB_DIRECT_MAX_SEND_SGES 6 #define SMB_DIRECT_MAX_RECV_SGES 1 /* @@ -62,13 +62,13 @@ static int smb_direct_receive_credit_max = 255; static int smb_direct_send_credit_target = 255; /* The maximum single message size can be sent to remote peer */ -static int smb_direct_max_send_size = 8192; +static int smb_direct_max_send_size = 1364; /* The maximum fragmented upper-layer payload receive size supported */ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ -static int smb_direct_max_receive_size = 8192; +static int smb_direct_max_receive_size = 1364; static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; @@ -1527,6 +1527,8 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, } case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: { + ib_drain_qp(t->qp); + t->status = SMB_DIRECT_CS_DISCONNECTED; wake_up_interruptible(&t->wait_status); wake_up_interruptible(&t->wait_reassembly_queue); diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 143bba4e4db8..63d55f543bd2 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -399,7 +399,8 @@ static int create_socket(struct interface *iface) ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); if (ret) { - pr_err("Can't create socket for ipv6, try ipv4: %d\n", ret); + if (ret != -EAFNOSUPPORT) + pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret); ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); if (ret) { diff --git a/fs/ksmbd/unicode.h b/fs/ksmbd/unicode.h index 5593024230ae..076f6034a789 100644 --- a/fs/ksmbd/unicode.h +++ b/fs/ksmbd/unicode.h @@ -24,6 +24,7 @@ #include <asm/byteorder.h> #include <linux/types.h> #include <linux/nls.h> +#include <linux/unicode.h> #define UNIUPR_NOLOWER /* Example to not expand lower case tables */ @@ -69,7 +70,7 @@ char *smb_strndup_from_utf16(const char *src, const int maxlen, const struct nls_table *codepage); int smbConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapchars); -char *ksmbd_extract_sharename(char *treename); +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename); #endif /* diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 78d01033604c..ff0e7a4fcd4d 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -321,7 +321,7 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end, unsigned char type) { struct file_lock *flock; - struct file_lock_context *ctx = file_inode(filp)->i_flctx; + struct file_lock_context *ctx = locks_inode_context(file_inode(filp)); int error = 0; if (!ctx 
|| list_empty_careful(&ctx->flc_posix)) @@ -377,8 +377,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, if (work->conn->connection_type) { if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) { - pr_err("no right to read(%pd)\n", - fp->filp->f_path.dentry); + pr_err("no right to read(%pD)\n", fp->filp); return -EACCES; } } @@ -487,8 +486,7 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, if (work->conn->connection_type) { if (!(fp->daccess & FILE_WRITE_DATA_LE)) { - pr_err("no right to write(%pd)\n", - fp->filp->f_path.dentry); + pr_err("no right to write(%pD)\n", fp->filp); err = -EACCES; goto out; } @@ -527,8 +525,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, if (sync) { err = vfs_fsync_range(filp, offset, offset + *written, 0); if (err < 0) - pr_err("fsync failed for filename = %pd, err = %d\n", - fp->filp->f_path.dentry, err); + pr_err("fsync failed for filename = %pD, err = %d\n", + fp->filp, err); } out: @@ -543,7 +541,7 @@ out: * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_getattr(struct path *path, struct kstat *stat) +int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat) { int err; @@ -1105,7 +1103,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns, return err; } -static int __dir_empty(struct dir_context *ctx, const char *name, int namlen, +static bool __dir_empty(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct ksmbd_readdir_data *buf; @@ -1113,9 +1111,7 @@ static int __dir_empty(struct dir_context *ctx, const char *name, int namlen, buf = container_of(ctx, struct ksmbd_readdir_data, ctx); buf->dirent_count++; - if (buf->dirent_count > 2) - return -ENOTEMPTY; - return 0; + return buf->dirent_count <= 2; } /** @@ -1142,22 +1138,33 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp) return err; } -static int __caseless_lookup(struct dir_context *ctx, const char *name, +static bool __caseless_lookup(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct ksmbd_readdir_data *buf; + int cmp = -EINVAL; buf = container_of(ctx, struct ksmbd_readdir_data, ctx); if (buf->used != namlen) - return 0; - if (!strncasecmp((char *)buf->private, name, namlen)) { + return true; + if (IS_ENABLED(CONFIG_UNICODE) && buf->um) { + const struct qstr q_buf = {.name = buf->private, + .len = buf->used}; + const struct qstr q_name = {.name = name, + .len = namlen}; + + cmp = utf8_strncasecmp(buf->um, &q_buf, &q_name); + } + if (cmp < 0) + cmp = strncasecmp((char *)buf->private, name, namlen); + if (!cmp) { memcpy((char *)buf->private, name, namlen); buf->dirent_count = 1; - return -EEXIST; + return false; } - return 0; + return true; } /** @@ -1168,7 +1175,8 @@ static int __caseless_lookup(struct dir_context *ctx, const char *name, * * Return: 0 on success, otherwise error */ -static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) +static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, + size_t namelen, struct unicode_map *um) { int ret; struct file *dfilp; @@ -1178,6 +1186,7 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) .private = name, .used = namelen, .dirent_count = 0, + .um = um, }; dfilp = dentry_open(dir, flags, current_cred()); @@ -1240,7 +1249,8 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, break; err = ksmbd_vfs_lookup_in_dir(&parent, filename, - filename_len); + 
filename_len, + work->conn->um); path_put(&parent); if (err) goto out; @@ -1311,7 +1321,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + err = vfs_remove_acl(user_ns, dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); @@ -1365,7 +1375,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) return NULL; - posix_acls = get_acl(inode, acl_type); + posix_acls = get_inode_acl(inode, acl_type); if (!posix_acls) return NULL; @@ -1743,11 +1753,11 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, *total_size_written = 0; if (!(src_fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) { - pr_err("no right to read(%pd)\n", src_fp->filp->f_path.dentry); + pr_err("no right to read(%pD)\n", src_fp->filp); return -EACCES; } if (!(dst_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) { - pr_err("no right to write(%pd)\n", dst_fp->filp->f_path.dentry); + pr_err("no right to write(%pD)\n", dst_fp->filp); return -EACCES; } @@ -1784,9 +1794,9 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, ret = vfs_copy_file_range(src_fp->filp, src_off, dst_fp->filp, dst_off, len, 0); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_fp->filp, src_off, - dst_fp->filp, dst_off, - len, 0); + ret = vfs_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, len, + COPY_FILE_SPLICE); if (ret < 0) return ret; @@ -1814,10 +1824,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, - struct inode *inode) + struct dentry *dentry) { struct posix_acl_state acl_state; struct posix_acl *acls; + struct inode *inode = d_inode(dentry); int rc; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) @@ -1846,14 +1857,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); else if (S_ISDIR(inode->i_mode)) { posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, - acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); @@ -1864,16 +1874,17 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, } int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, - struct inode *inode, struct inode *parent_inode) + struct dentry *dentry, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; + struct inode *inode = d_inode(dentry); int rc, i; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) return -EOPNOTSUPP; - acls = get_acl(parent_inode, ACL_TYPE_DEFAULT); + acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT); if (!acls) return -ENOENT; pace = acls->a_entries; @@ -1885,12 +1896,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, } } - rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); 
if (S_ISDIR(inode->i_mode)) { - rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 70da4c0ba7ad..0d73d735cc39 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -12,6 +12,7 @@ #include <linux/namei.h> #include <uapi/linux/xattr.h> #include <linux/posix_acl.h> +#include <linux/unicode.h> #include "smbacl.h" #include "xattr.h" @@ -60,6 +61,7 @@ struct ksmbd_readdir_data { unsigned int used; unsigned int dirent_count; unsigned int file_attr; + struct unicode_map *um; }; /* ksmbd kstat wrapper to get valid create time when reading dir entry */ @@ -85,7 +87,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id); int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name); int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, const char *newname); -int ksmbd_vfs_getattr(struct path *path, struct kstat *stat); +int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat); int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname); int ksmbd_vfs_truncate(struct ksmbd_work *work, @@ -158,8 +160,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, - struct inode *inode); + struct dentry *dentry); int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, - struct inode *inode, + struct dentry *dentry, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/libfs.c b/fs/libfs.c index 31b0ddf01c31..aada4e7c8713 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> +#include <linux/iversion.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> @@ -994,8 +995,8 @@ out: EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ -ssize_t simple_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; @@ -1016,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - ret = kstrtoull(attr->set_buf, 0, &val); + if (is_signed) + ret = kstrtoll(attr->set_buf, 0, &val); + else + ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); @@ -1026,8 +1030,21 @@ out: mutex_unlock(&attr->mutex); return ret; } + +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(simple_attr_write); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(simple_attr_write_signed); + /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on @@ -1520,3 +1537,48 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); + +/** + 
* inode_maybe_inc_iversion - increments i_version + * @inode: inode with the i_version that should be updated + * @force: increment the counter even if it's not necessary? + * + * Every time the inode is modified, the i_version field must be seen to have + * changed by any observer. + * + * If "force" is set or the QUERIED flag is set, then ensure that we increment + * the value, and clear the queried flag. + * + * In the common case where neither is set, then we can return "false" without + * updating i_version. + * + * If this function returns false, and no other metadata has changed, then we + * can avoid logging the metadata. + */ +bool inode_maybe_inc_iversion(struct inode *inode, bool force) +{ + u64 cur, new; + + /* + * The i_version field is not strictly ordered with any other inode + * information, but the legacy inode_inc_iversion code used a spinlock + * to serialize increments. + * + * Here, we add full memory barriers to ensure that any de-facto + * ordering with other info is preserved. + * + * This barrier pairs with the barrier in inode_query_iversion() + */ + smp_mb(); + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is clear then we needn't do anything */ + if (!force && !(cur & I_VERSION_QUERIED)) + return false; + + /* Since lowest bit is flag, add 2 to avoid it */ + new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return true; +} +EXPORT_SYMBOL(inode_maybe_inc_iversion); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index f802223e71ab..cdc8e12cdac4 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -164,7 +164,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_addrbuf = nsm->sm_addrbuf; host->net = ni->net; host->h_cred = get_cred(ni->cred); - strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); + strscpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); out: return host; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index bf274f23969b..b72023a6b4c1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -52,6 +52,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, *filp = file; /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_flags = FL_POSIX; lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_start = (loff_t)lock->lock_start; @@ -521,6 +522,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "NULL", @@ -530,6 +532,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_testargs, .pc_encode = nlm4svc_encode_testres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, .pc_name = "TEST", @@ -539,6 +542,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_lockargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "LOCK", @@ -548,6 +552,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_cancargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero 
= sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "CANCEL", @@ -557,6 +562,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_unlockargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "UNLOCK", @@ -566,6 +572,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_testargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "GRANTED", @@ -575,6 +582,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_testargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "TEST_MSG", @@ -584,6 +592,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_lockargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "LOCK_MSG", @@ -593,6 +602,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_cancargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "CANCEL_MSG", @@ -602,6 +612,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_unlockargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNLOCK_MSG", @@ -611,6 +622,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_testargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "GRANTED_MSG", @@ -620,6 +632,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "TEST_RES", @@ -629,6 +642,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "LOCK_RES", @@ -638,6 +652,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "CANCEL_RES", @@ -647,6 +662,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNLOCK_RES", @@ -656,6 +672,7 @@ const struct svc_procedure 
nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_res, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "GRANTED_RES", @@ -665,6 +682,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_reboot, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_reboot), + .pc_argzero = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "SM_NOTIFY", @@ -674,6 +692,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, .pc_name = "UNUSED", @@ -683,6 +702,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, .pc_name = "UNUSED", @@ -692,6 +712,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, .pc_name = "UNUSED", @@ -701,6 +722,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_shareargs, .pc_encode = nlm4svc_encode_shareres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, .pc_name = "SHARE", @@ -710,6 +732,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_shareargs, .pc_encode = nlm4svc_encode_shareres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, .pc_name = "UNSHARE", @@ -719,6 +742,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_lockargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "NM_LOCK", @@ -728,6 +752,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_notify, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "FREE_ALL", diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9c1aa75441e1..4e30f3c50970 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -659,11 +659,13 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; - if (file->f_file[O_RDONLY]) - error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, + lock->fl.fl_file = file->f_file[O_RDONLY]; + if (lock->fl.fl_file) + error = vfs_lock_file(lock->fl.fl_file, F_SETLK, &lock->fl, NULL); - if (file->f_file[O_WRONLY]) - error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, + lock->fl.fl_file = file->f_file[O_WRONLY]; + if (lock->fl.fl_file) + error |= vfs_lock_file(lock->fl.fl_file, F_SETLK, &lock->fl, NULL); return (error < 0)? 
nlm_lck_denied_nolocks : nlm_granted; @@ -697,9 +699,10 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); if (block != NULL) { - mode = lock_to_openmode(&lock->fl); - vfs_cancel_lock(block->b_file->f_file[mode], - &block->b_call->a_args.lock.fl); + struct file_lock *fl = &block->b_call->a_args.lock.fl; + + mode = lock_to_openmode(fl); + vfs_cancel_lock(block->b_file->f_file[mode], fl); status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index b09ca35b527c..32784f508c81 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -77,6 +77,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Set up the missing parts of the file_lock structure */ mode = lock_to_openmode(&lock->fl); + lock->fl.fl_flags = FL_POSIX; lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; @@ -555,6 +556,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "NULL", @@ -564,6 +566,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_testargs, .pc_encode = nlmsvc_encode_testres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, .pc_name = "TEST", @@ -573,6 +576,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_lockargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "LOCK", @@ -582,6 +586,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_cancargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "CANCEL", @@ -591,6 +596,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_unlockargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "UNLOCK", @@ -600,6 +606,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_testargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "GRANTED", @@ -609,6 +616,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_testargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "TEST_MSG", @@ -618,6 +626,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_lockargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "LOCK_MSG", @@ -627,6 +636,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { 
.pc_decode = nlmsvc_decode_cancargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "CANCEL_MSG", @@ -636,6 +646,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_unlockargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNLOCK_MSG", @@ -645,6 +656,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_testargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "GRANTED_MSG", @@ -654,6 +666,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "TEST_RES", @@ -663,6 +676,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "LOCK_RES", @@ -672,6 +686,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "CANCEL_RES", @@ -681,6 +696,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNLOCK_RES", @@ -690,6 +706,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_res, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "GRANTED_RES", @@ -699,6 +716,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_reboot, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_reboot), + .pc_argzero = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "SM_NOTIFY", @@ -708,6 +726,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNUSED", @@ -717,6 +736,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNUSED", @@ -726,6 +746,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, 
.pc_name = "UNUSED", @@ -735,6 +756,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_shareargs, .pc_encode = nlmsvc_encode_shareres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, .pc_name = "SHARE", @@ -744,6 +766,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_shareargs, .pc_encode = nlmsvc_encode_shareres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, .pc_name = "UNSHARE", @@ -753,6 +776,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_lockargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "NM_LOCK", @@ -762,6 +786,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_notify, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, .pc_name = "FREE_ALL", diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e1c4617de771..e3b6229e7ae5 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file) } } -static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) +static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl) { struct file_lock lock; @@ -184,12 +184,15 @@ static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - lock.fl_owner = owner; - if (file->f_file[O_RDONLY] && - vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) + lock.fl_owner = fl->fl_owner; + lock.fl_pid = fl->fl_pid; + lock.fl_flags = FL_POSIX; + + lock.fl_file = file->f_file[O_RDONLY]; + if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) goto out_err; - if (file->f_file[O_WRONLY] && - vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL)) + lock.fl_file = file->f_file[O_WRONLY]; + if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) goto out_err; return 0; out_err: @@ -207,7 +210,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct nlm_host *lockhost; if (!flctx || list_empty_careful(&flctx->flc_posix)) @@ -226,7 +229,7 @@ again: if (match(lockhost, host)) { spin_unlock(&flctx->flc_lock); - if (nlm_unlock_files(file, fl->fl_owner)) + if (nlm_unlock_files(file, fl)) return 1; goto again; } @@ -262,7 +265,7 @@ nlm_file_inuse(struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; diff --git a/fs/locks.c b/fs/locks.c index 607f94a0e789..8f01bee17715 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -175,7 +175,7 @@ locks_get_lock_context(struct inode *inode, int type) struct file_lock_context *ctx; /* paired with cmpxchg() below */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = 
locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -194,7 +194,7 @@ locks_get_lock_context(struct inode *inode, int type) */ if (cmpxchg(&inode->i_flctx, NULL, ctx)) { kmem_cache_free(flctx_cache, ctx); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); } out: trace_locks_get_lock_context(inode, type, ctx); @@ -247,7 +247,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, void locks_free_lock_context(struct inode *inode) { - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); if (unlikely(ctx)) { locks_check_ctx_lists(inode); @@ -891,7 +891,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) void *owner; void (*func)(void); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->fl_type = F_UNLCK; return; @@ -1483,7 +1483,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl->fl_flags = type; /* typically we will check that ctx is non-NULL before calling */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { WARN_ON_ONCE(1); goto free_lock; @@ -1588,7 +1588,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) struct file_lock_context *ctx; struct file_lock *fl; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); fl = list_first_entry_or_null(&ctx->flc_lease, @@ -1634,7 +1634,7 @@ int fcntl_getlease(struct file *filp) int type = F_UNLCK; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); @@ -1823,7 +1823,7 @@ static int generic_delete_lease(struct file *filp, void *owner) struct file_lock_context *ctx; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; @@ -2096,7 +2096,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) * throw a warning to let people know that they don't actually work. */ if (cmd & LOCK_MAND) { - pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); + pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid); return 0; } @@ -2146,6 +2146,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); @@ -2295,6 +2296,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else @@ -2561,7 +2563,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. 
*/ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2634,7 +2636,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); + ctx = locks_inode_context(locks_inode(filp)); if (!ctx) return; @@ -2663,12 +2665,36 @@ void locks_remove_file(struct file *filp) */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } EXPORT_SYMBOL_GPL(vfs_cancel_lock); +/** + * vfs_inode_has_locks - are any file locks held on @inode? + * @inode: inode to check for locks + * + * Return true if there are any FL_POSIX or FL_FLOCK locks currently + * set on @inode. + */ +bool vfs_inode_has_locks(struct inode *inode) +{ + struct file_lock_context *ctx; + bool ret; + + ctx = locks_inode_context(inode); + if (!ctx) + return false; + + spin_lock(&ctx->flc_lock); + ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock); + spin_unlock(&ctx->flc_lock); + return ret; +} +EXPORT_SYMBOL_GPL(vfs_inode_has_locks); + #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -2839,7 +2865,7 @@ void show_fd_locks(struct seq_file *f, struct file_lock_context *ctx; int id = 0; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) return; diff --git a/fs/mbcache.c b/fs/mbcache.c index 47ccfcbe0a22..2a4b8b549e93 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -90,12 +90,19 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, return -ENOMEM; INIT_LIST_HEAD(&entry->e_list); - /* Initial hash reference */ - atomic_set(&entry->e_refcnt, 1); + /* + * We create entry with two references. One reference is kept by the + * hash table, the other reference is used to protect us from + * mb_cache_entry_delete_or_get() until the entry is fully setup. This + * avoids nesting of cache->c_list_lock into hash table bit locks which + * is problematic for RT. 
+ */ + atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; - entry->e_reusable = reusable; - entry->e_referenced = 0; + entry->e_flags = 0; + if (reusable) + set_bit(MBE_REUSABLE_B, &entry->e_flags); head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { @@ -106,15 +113,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, } } hlist_bl_add_head(&entry->e_hash_list, head); - /* - * Add entry to LRU list before it can be found by - * mb_cache_entry_delete() to avoid races - */ + hlist_bl_unlock(head); spin_lock(&cache->c_list_lock); list_add_tail(&entry->e_list, &cache->c_list); cache->c_entry_count++; spin_unlock(&cache->c_list_lock); - hlist_bl_unlock(head); + mb_cache_entry_put(cache, entry); return 0; } @@ -162,7 +166,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); - if (entry->e_key == key && entry->e_reusable && + if (entry->e_key == key && + test_bit(MBE_REUSABLE_B, &entry->e_flags) && atomic_inc_not_zero(&entry->e_refcnt)) goto out; node = node->next; @@ -281,7 +286,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { - entry->e_referenced = 1; + set_bit(MBE_REFERENCED_B, &entry->e_flags); } EXPORT_SYMBOL(mb_cache_entry_touch); @@ -306,9 +311,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); /* Drop initial hash reference if there is no user */ - if (entry->e_referenced || + if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { - entry->e_referenced = 0; + clear_bit(MBE_REFERENCED_B, &entry->e_flags); list_move_tail(&entry->e_list, &cache->c_list); continue; } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 937fa5fae2b8..8afdc408ca4f 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -53,16 +53,16 @@ static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, } static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { int error; struct inode *inode = minix_new_inode(dir, mode, &error); if (inode) { minix_set_inode(inode, 0); mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); } - return error; + return finish_open_simple(file, error); } static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, diff --git a/fs/namei.c b/fs/namei.c index 53b4bc094db2..309ae6fc8c99 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -297,13 +297,13 @@ static int check_acl(struct user_namespace *mnt_userns, acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; - /* no ->get_acl() calls in RCU mode... */ + /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(mnt_userns, inode, acl, mask); } - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { @@ -336,11 +336,11 @@ static int acl_permission_check(struct user_namespace *mnt_userns, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - kuid_t i_uid; + vfsuid_t vfsuid; /* Are we the owner? 
If so, ACL's don't matter */ - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (likely(uid_eq(current_fsuid(), i_uid))) { + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; @@ -362,8 +362,8 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(kgid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } @@ -581,7 +581,7 @@ struct nameidata { struct nameidata *saved; unsigned root_seq; int dfd; - kuid_t dir_uid; + vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; @@ -986,7 +986,7 @@ static int nd_jump_root(struct nameidata *nd) * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ -int nd_jump_link(struct path *path) +int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; @@ -1095,15 +1095,15 @@ fs_initcall(init_fs_namei_sysctls); static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct user_namespace *mnt_userns; - kuid_t i_uid; + vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; mnt_userns = mnt_user_ns(nd->path.mnt); - i_uid = i_uid_into_mnt(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); /* Allowed if owner and follower match. */ - if (uid_eq(current_cred()->fsuid, i_uid)) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -1111,7 +1111,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod return 0; /* Allowed if parent directory and link owner match. */ - if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid)) + if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -1178,13 +1178,13 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, * * Returns 0 if successful, -ve on error. */ -int may_linkat(struct user_namespace *mnt_userns, struct path *link) +int may_linkat(struct user_namespace *mnt_userns, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. 
*/ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1232,13 +1232,13 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; - kuid_t dir_uid = nd->dir_uid; + vfsuid_t dir_vfsuid = nd->dir_vfsuid; if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) || - uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) || + vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || @@ -2307,7 +2307,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode); + nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2885,9 +2885,9 @@ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, { kuid_t fsuid = current_fsuid(); - if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid)) return 0; - if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); } @@ -2926,8 +2926,8 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); @@ -3211,7 +3211,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) if (error) return error; - error = security_path_truncate(path); + error = security_file_truncate(filp); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, @@ -3583,72 +3583,95 @@ static int do_open(struct nameidata *nd, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass init_user_ns.
*/ -struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, - struct dentry *dentry, umode_t mode, int open_flag) +static int vfs_tmpfile(struct user_namespace *mnt_userns, + const struct path *parentpath, + struct file *file, umode_t mode) { - struct dentry *child = NULL; - struct inode *dir = dentry->d_inode; + struct dentry *child; + struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; + int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) - goto out_err; - error = -EOPNOTSUPP; + return error; if (!dir->i_op->tmpfile) - goto out_err; - error = -ENOMEM; - child = d_alloc(dentry, &slash_name); + return -EOPNOTSUPP; + child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) - goto out_err; + return -ENOMEM; + file->f_path.mnt = parentpath->mnt; + file->f_path.dentry = child; mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); - error = dir->i_op->tmpfile(mnt_userns, dir, child, mode); + error = dir->i_op->tmpfile(mnt_userns, dir, file, mode); + dput(child); if (error) - goto out_err; - error = -ENOENT; - inode = child->d_inode; - if (unlikely(!inode)) - goto out_err; + return error; + /* Don't check for other permissions, the inode was just created */ + error = may_open(mnt_userns, &file->f_path, 0, file->f_flags); + if (error) + return error; + inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } ima_post_create_tmpfile(mnt_userns, inode); - return child; + return 0; +} -out_err: - dput(child); - return ERR_PTR(error); +/** + * vfs_tmpfile_open - open a tmpfile for kernel internal use + * @mnt_userns: user namespace of the mount the inode was found from + * @parentpath: path of the base directory + * @mode: mode of the new tmpfile + * @open_flag: flags + * @cred: credentials for open + * + * Create and open a temporary file. The file is not accounted in nr_files, + * hence this is only for kernel internal use, and must not be installed into + * file tables or such. 
+ */ +struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, + const struct path *parentpath, + umode_t mode, int open_flag, const struct cred *cred) +{ + struct file *file; + int error; + + file = alloc_empty_file_noaccount(open_flag, cred); + if (!IS_ERR(file)) { + error = vfs_tmpfile(mnt_userns, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); + } + } + return file; } -EXPORT_SYMBOL(vfs_tmpfile); +EXPORT_SYMBOL(vfs_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct user_namespace *mnt_userns; - struct dentry *child; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); + if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; mnt_userns = mnt_user_ns(path.mnt); - child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag); - error = PTR_ERR(child); - if (IS_ERR(child)) + error = vfs_tmpfile(mnt_userns, &path, file, op->mode); + if (error) goto out2; - dput(path.dentry); - path.dentry = child; - audit_inode(nd->name, child, 0); - /* Don't check for other permissions, the inode was just created */ - error = may_open(mnt_userns, &path, 0, op->open_flag); - if (!error) - error = vfs_open(&path, file); + audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: @@ -5088,7 +5111,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct page *page; - void *fsdata; + void *fsdata = NULL; int err; unsigned int flags; diff --git a/fs/namespace.c b/fs/namespace.c index df137ba19d37..ab467ee58341 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,6 +75,22 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mnt_idmap { + struct user_namespace *owner; + refcount_t count; +}; + +/* + * Carries the initial idmapping of 0:0:4294967295 which is an identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ +struct mnt_idmap nop_mnt_idmap = { + .owner = &init_user_ns, + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(nop_mnt_idmap); + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; @@ -82,6 +98,7 @@ struct mount_kattr { unsigned int lookup_flags; bool recurse; struct user_namespace *mnt_userns; + struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ @@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt) #endif } +/** + * mnt_idmap_owner - retrieve owner of the mount's idmapping + * @idmap: mount idmapping + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapping. 
+ */ +struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap) +{ + return idmap->owner; +} +EXPORT_SYMBOL_GPL(mnt_idmap_owner); + +/** + * mnt_user_ns - retrieve owner of an idmapped mount + * @mnt: the relevant vfsmount + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapped mount. + */ +struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) +{ + struct mnt_idmap *idmap = mnt_idmap(mnt); + + /* Return the actual owner of the filesystem instead of the nop. */ + if (idmap == &nop_mnt_idmap && + !initial_idmapping(mnt->mnt_sb->s_user_ns)) + return mnt->mnt_sb->s_user_ns; + return mnt_idmap_owner(idmap); +} +EXPORT_SYMBOL_GPL(mnt_user_ns); + +/** + * alloc_mnt_idmap - allocate a new idmapping for the mount + * @mnt_userns: owning userns of the idmapping + * + * Allocate a new struct mnt_idmap which carries the idmapping of the mount. + * + * Return: On success a new idmap, on error an error pointer is returned. + */ +static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) +{ + struct mnt_idmap *idmap; + + idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); + if (!idmap) + return ERR_PTR(-ENOMEM); + + idmap->owner = get_user_ns(mnt_userns); + refcount_set(&idmap->count, 1); + return idmap; +} + +/** + * mnt_idmap_get - get a reference to an idmapping + * @idmap: the idmap to bump the reference on + * + * If @idmap is not the @nop_mnt_idmap bump the reference count. + * + * Return: @idmap with reference count bumped if @nop_mnt_idmap isn't passed. + */ +static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap) + refcount_inc(&idmap->count); + + return idmap; +} + +/** + * mnt_idmap_put - put a reference to an idmapping + * @idmap: the idmap to put the reference on + * + * If this is a non-initial idmapping, put the reference count when a mount is + * released and free it if we're the last user.
+ */ +static inline void mnt_idmap_put(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { + put_user_ns(idmap->owner); + kfree(idmap); + } +} + static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); - mnt->mnt.mnt_userns = &init_user_ns; + mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(&mnt->mnt); - if (!initial_idmapping(mnt_userns)) - put_user_ns(mnt_userns); + mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p) struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); @@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - fs_userns = mnt->mnt.mnt_sb->s_user_ns; - if (!initial_idmapping(fs_userns)) - mnt->mnt.mnt_userns = get_user_ns(fs_userns); - lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); - mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); - if (!initial_idmapping(mnt->mnt.mnt_userns)) - mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); + mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); + mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; @@ -3515,8 +3620,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, q = next_mnt(q, new); if (!q) break; + // an mntns binding we'd skipped? while (p->mnt.mnt_root != q->mnt.mnt_root) - p = next_mnt(p, old); + p = next_mnt(skip_mnt_tree(p), old); } namespace_unlock(); @@ -3981,14 +4087,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (kattr->mnt_userns == fs_userns) + if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns) return -EINVAL; /* @@ -4028,7 +4134,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && - !kattr->mnt_userns; + !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) @@ -4082,27 +4188,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { - struct user_namespace *mnt_userns, *old_mnt_userns; - - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return; /* - * We're the only ones able to change the mount's idmapping. 
So - * mnt->mnt.mnt_userns is stable and we can retrieve it directly. - */ - old_mnt_userns = mnt->mnt.mnt_userns; - - mnt_userns = get_user_ns(kattr->mnt_userns); - /* Pairs with smp_load_acquire() in mnt_user_ns(). */ - smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); - - /* - * If this is an idmapped filesystem drop the reference we've taken - * in vfs_create_mount() before. + * Pairs with smp_load_acquire() in mnt_idmap(). + * + * Since we only allow a mount to change the idmapping once and + * verified this in can_idmap_mount() we know that the mount has + * @nop_mnt_idmap attached to it. So there's no need to drop any + * references. */ - if (!initial_idmapping(old_mnt_userns)) - put_user_ns(old_mnt_userns); + smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4136,6 +4233,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) if (path->dentry != mnt->mnt.mnt_root) return -EINVAL; + if (kattr->mnt_userns) { + struct mnt_idmap *mnt_idmap; + + mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); + if (IS_ERR(mnt_idmap)) + return PTR_ERR(mnt_idmap); + kattr->mnt_idmap = mnt_idmap; + } + if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing @@ -4323,6 +4429,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; + + if (kattr->mnt_idmap) + mnt_idmap_put(kattr->mnt_idmap); } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 0ce535852151..7679a68e8193 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -17,9 +17,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct folio *folio; - unsigned int iopos, account = 0; pgoff_t start_page = rreq->start / PAGE_SIZE; pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + size_t account = 0; bool subreq_failed = false; XA_STATE(xas, &rreq->mapping->i_pages, start_page); @@ -39,18 +39,23 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) */ subreq = list_first_entry(&rreq->subrequests, struct netfs_io_subrequest, rreq_link); - iopos = 0; subreq_failed = (subreq->error < 0); trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + folio_size(folio); + loff_t pg_end; bool pg_failed = false; + if (xas_retry(&xas, folio)) + continue; + + pg_end = folio_pos(folio) + folio_size(folio) - 1; + for (;;) { + loff_t sreq_end; + if (!subreq) { pg_failed = true; break; @@ -58,11 +63,11 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) folio_start_fscache(folio); pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) break; account += subreq->transferred; - iopos += subreq->len; if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { subreq = list_next_entry(subreq, rreq_link); subreq_failed = (subreq->error < 0); @@ -70,7 +75,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) subreq = NULL; subreq_failed = false; } - if (pgend == iopos) + + if (pg_end == sreq_end) break; } diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 
428925899282..7f753380e047 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -23,7 +23,7 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) { struct iov_iter iter; - iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); iov_iter_zero(iov_iter_count(&iter), &iter); @@ -49,7 +49,7 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq, struct iov_iter iter; netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); @@ -121,6 +121,9 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + if (xas_retry(&xas, folio)) + continue; + /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. */ @@ -205,7 +208,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) continue; } - iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, subreq->start, subreq->len); atomic_inc(&rreq->nr_copy_ops); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 14a72224b657..1ead5bd740c2 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default n + default y help - This is intended for developers only. The READ_PLUS operation has - been shown to have issues under specific conditions and should not - be used in production. + Choose Y here to enable the use of READ_PLUS over NFS v4.2. READ_PLUS + attempts to improve read performance by compressing out sparse holes + in the file contents. 
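The READ_PLUS help text above is the user-visible end of a larger change: a READ_PLUS reply carries a sequence of DATA and HOLE segments rather than a flat byte stream, and the client zero-fills the holes it is told about. A rough, self-contained illustration of that reassembly step (hypothetical types; the real decoder lives in nfs42xdr.c further down):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical flattened segment, loosely mirroring data_content4 */
struct read_plus_seg {
	int	 is_hole;	/* NFS4_CONTENT_HOLE vs NFS4_CONTENT_DATA */
	uint64_t offset;
	uint64_t length;
	const void *data;	/* NULL for holes */
};

/* Expand segments into a caller-supplied buffer covering [base, base+len) */
static void expand_segments(char *buf, uint64_t base, size_t len,
			    const struct read_plus_seg *segs, size_t nsegs)
{
	for (size_t i = 0; i < nsegs; i++) {
		const struct read_plus_seg *s = &segs[i];

		if (s->offset < base || s->offset + s->length > base + len)
			continue;	/* ignore out-of-range segments */
		if (s->is_hole)
			memset(buf + (s->offset - base), 0, s->length);
		else
			memcpy(buf + (s->offset - base), s->data, s->length);
	}
}
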
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 8dcb08e1a885..d0cccddb7d08 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1065,6 +1065,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { .pc_func = nfs4_callback_compound, .pc_encode = nfs4_encode_void, .pc_argsize = 256, + .pc_argzero = 256, .pc_ressize = 256, .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, .pc_name = "COMPOUND", diff --git a/fs/nfs/client.c b/fs/nfs/client.c index da8da5cdbbc1..f50e025ae406 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -280,7 +280,7 @@ EXPORT_SYMBOL_GPL(nfs_put_client); static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; - const struct sockaddr *sap = data->addr; + const struct sockaddr *sap = (struct sockaddr *)data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); int error; @@ -666,7 +666,7 @@ static int nfs_init_server(struct nfs_server *server, struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = ctx->nfs_server.hostname, - .addr = (const struct sockaddr *)&ctx->nfs_server.address, + .addr = &ctx->nfs_server._address, .addrlen = ctx->nfs_server.addrlen, .nfs_mod = ctx->nfs_mod, .proto = ctx->nfs_server.protocol, diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5c97cad741a7..cf7365581031 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -146,7 +146,7 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state { struct inode *inode = state->inode; struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; int status = 0; @@ -228,8 +228,7 @@ again: * */ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, - fmode_t type, - const nfs4_stateid *stateid, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit) { struct nfs_delegation *delegation; @@ -239,25 +238,24 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation != NULL) { spin_lock(&delegation->lock); - if (nfs4_is_valid_delegation(delegation, 0)) { - nfs4_stateid_copy(&delegation->stateid, stateid); - delegation->type = type; - delegation->pagemod_limit = pagemod_limit; - oldcred = delegation->cred; - delegation->cred = get_cred(cred); - clear_bit(NFS_DELEGATION_NEED_RECLAIM, - &delegation->flags); - spin_unlock(&delegation->lock); - rcu_read_unlock(); - put_cred(oldcred); - trace_nfs4_reclaim_delegation(inode, type); - return; - } - /* We appear to have raced with a delegation return. 
*/ + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; + oldcred = delegation->cred; + delegation->cred = get_cred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + if (test_and_clear_bit(NFS_DELEGATION_REVOKED, + &delegation->flags)) + atomic_long_inc(&nfs_active_delegations); spin_unlock(&delegation->lock); + rcu_read_unlock(); + put_cred(oldcred); + trace_nfs4_reclaim_delegation(inode, type); + } else { + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, type, stateid, + pagemod_limit); } - rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5d6c2ddc7ea6..ea1ceffa1d3a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1074,6 +1074,8 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) return res; } +#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) + /* * Once we've found the start of the dirent within a page: fill 'er up... */ @@ -1083,6 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, struct file *file = desc->file; struct nfs_cache_array *array; unsigned int i; + bool first_emit = !desc->dir_cookie; array = kmap_local_page(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -1106,6 +1109,10 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) { + desc->eob = true; + break; + } } if (array->page_is_eof) desc->eof = !desc->eob; @@ -1187,8 +1194,6 @@ out: return status; } -#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) - static bool nfs_readdir_handle_cache_misses(struct inode *inode, struct nfs_readdir_descriptor *desc, unsigned int cache_misses, @@ -2022,7 +2027,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx, err = finish_open(file, dentry, do_open); if (err) goto out; - if (S_ISREG(file->f_path.dentry->d_inode->i_mode)) + if (S_ISREG(file_inode(file)->i_mode)) nfs_file_set_open_context(file, ctx); else err = -EOPENSTALE; @@ -2489,9 +2494,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); goto out; } - if (dentry->d_fsdata) - /* old devname */ - kfree(dentry->d_fsdata); + /* old devname */ + kfree(dentry->d_fsdata); dentry->d_fsdata = NFS_FSDATA_BLOCKED; spin_unlock(&dentry->d_lock); @@ -2949,9 +2953,28 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co return NULL; } +static u64 nfs_access_login_time(const struct task_struct *task, + const struct cred *cred) +{ + const struct task_struct *parent; + u64 ret; + + rcu_read_lock(); + for (;;) { + parent = rcu_dereference(task->real_parent); + if (parent == task || cred_fscmp(parent->cred, cred) != 0) + break; + task = parent; + } + ret = task->start_time; + rcu_read_unlock(); + return ret; +} + static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); + u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; bool retry = true; int err; @@ -2979,6 +3002,9 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred * spin_lock(&inode->i_lock); retry = false; } + err = -ENOENT; + if ((s64)(login_time - cache->timestamp) > 
0) + goto out; *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; @@ -3058,6 +3084,7 @@ static void nfs_access_add_rbtree(struct inode *inode, else goto found; } + set->timestamp = ktime_get_ns(); rb_link_node(&set->rb_node, parent, p); rb_insert_color(&set->rb_node, root_node); list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index e87d500ad95a..6603b5cee029 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -16,8 +16,9 @@ #include "dns_resolve.h" ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, - struct sockaddr *sa, size_t salen) + struct sockaddr_storage *ss, size_t salen) { + struct sockaddr *sa = (struct sockaddr *)ss; ssize_t ret; char *ip_addr = NULL; int ip_len; @@ -341,7 +342,7 @@ out: } ssize_t nfs_dns_resolve_name(struct net *net, char *name, - size_t namelen, struct sockaddr *sa, size_t salen) + size_t namelen, struct sockaddr_storage *ss, size_t salen) { struct nfs_dns_ent key = { .hostname = name, @@ -354,7 +355,7 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item); if (ret == 0) { if (salen >= item->addrlen) { - memcpy(sa, &item->addr, item->addrlen); + memcpy(ss, &item->addr, item->addrlen); ret = item->addrlen; } else ret = -EOVERFLOW; diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h index 576ff4b54c82..fe3b172c4de1 100644 --- a/fs/nfs/dns_resolve.h +++ b/fs/nfs/dns_resolve.h @@ -32,6 +32,6 @@ extern void nfs_dns_resolver_cache_destroy(struct net *net); #endif extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, - size_t namelen, struct sockaddr *sa, size_t salen); + size_t namelen, struct sockaddr_storage *sa, size_t salen); #endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e032fe201a36..d8ec889a4b3f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,7 +567,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) } wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); lock_page(page); mapping = page_file_mapping(page); @@ -655,9 +656,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } if (mntflags & NFS_MOUNT_WRITE_WAIT) { - result = filemap_fdatawait_range(file->f_mapping, - iocb->ki_pos - written, - iocb->ki_pos - 1); + filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } result = generic_write_sync(iocb, written); if (result < 0) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7d285561e59f..7deb3cd76abe 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -30,14 +30,20 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) #define FF_LAYOUTRETURN_MAXERR 20 +enum nfs4_ff_op_type { + NFS4_FF_OP_LAYOUTSTATS, + NFS4_FF_OP_LAYOUTRETURN, +}; + static unsigned short io_maxretrans; static const struct pnfs_commit_ops ff_layout_commit_ops; static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); -static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, +static int +ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit); + int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct 
nfs42_layoutstat_devinfo *devinfo, struct nfs4_ff_layout_mirror *mirror); @@ -487,10 +493,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, gid = make_kgid(&init_user_ns, id); if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); else { unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); memalloc_nofs_restore(nofs_flags); } rc = -ENOMEM; @@ -1373,6 +1379,11 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, return -EIO; } + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_read_record_layoutstats_start(task, hdr); return 0; } @@ -1553,6 +1564,11 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1645,15 +1661,23 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, - struct nfs_commit_data *cdata) +static int ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) { + if (!pnfs_is_valid_lseg(cdata->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_commit_record_layoutstats_start(task, cdata); + return 0; } static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { - ff_layout_commit_prepare_common(task, data); + if (ff_layout_commit_prepare_common(task, data)) + return; + rpc_call_start(task); } @@ -1949,6 +1973,65 @@ ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, ff_layout_initiate_commit); } +static bool ff_layout_match_rw(const struct rpc_task *task, + const struct nfs_pgio_header *hdr, + const struct pnfs_layout_segment *lseg) +{ + return hdr->lseg == lseg; +} + +static bool ff_layout_match_commit(const struct rpc_task *task, + const struct nfs_commit_data *cdata, + const struct pnfs_layout_segment *lseg) +{ + return cdata->lseg == lseg; +} + +static bool ff_layout_match_io(const struct rpc_task *task, const void *data) +{ + const struct rpc_call_ops *ops = task->tk_ops; + + if (ops == &ff_layout_read_call_ops_v3 || + ops == &ff_layout_read_call_ops_v4 || + ops == &ff_layout_write_call_ops_v3 || + ops == &ff_layout_write_call_ops_v4) + return ff_layout_match_rw(task, task->tk_calldata, data); + if (ops == &ff_layout_commit_call_ops_v3 || + ops == &ff_layout_commit_call_ops_v4) + return ff_layout_match_commit(task, task->tk_calldata, data); + return false; +} + +static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds *mirror_ds; + struct nfs4_pnfs_ds *ds; + struct nfs_client *ds_clp; + struct rpc_clnt *clnt; + u32 idx; + + for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { + mirror = flseg->mirror_array[idx]; + mirror_ds = mirror->mirror_ds; + if (!mirror_ds) + continue; + ds = mirror->mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } +} + static struct pnfs_ds_commit_info * ff_layout_get_ds_info(struct inode 
*inode) { @@ -2161,8 +2244,9 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) FF_LAYOUTRETURN_MAXERR); spin_lock(&args->inode->i_lock); - ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo)); + ff_args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &ff_args->devinfo[0], + ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN); spin_unlock(&args->inode->i_lock); args->ld_private->ops = &layoutreturn_ops; @@ -2396,7 +2480,7 @@ static const struct nfs4_xdr_opaque_ops layoutstat_ops = { static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit) + int dev_limit, enum nfs4_ff_op_type type) { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; @@ -2408,7 +2492,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, break; if (IS_ERR_OR_NULL(mirror->mirror_ds)) continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) continue; /* mirror refcount put in cleanup_layoutstats */ if (!refcount_inc_not_zero(&mirror->ref)) @@ -2448,7 +2534,9 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) spin_lock(&args->inode->i_lock); ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], dev_count); + &args->devinfo[0], + dev_count, + NFS4_FF_OP_LAYOUTSTATS); spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); @@ -2501,6 +2589,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .prepare_layoutreturn = ff_layout_prepare_layoutreturn, .sync = pnfs_nfs_generic_sync, .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cancel_io = ff_layout_cancel_io, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 4da701fd1424..9bcd53d5c7d4 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -273,9 +273,9 @@ static const struct constant_table nfs_secflavor_tokens[] = { * Address family must be initialized, and address must not be * the ANY address for that family. 
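The comment this blob ends on belongs to the function being retyped just below: a validity check that rejects wildcard (ANY) server addresses. Sketched in userspace terms — the IPv4 arm matches the kernel code that follows, while the IPv6 arm is filled in by analogy and should be treated as an assumption:

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

/* Reject the wildcard address for each supported family */
static int verify_server_address(const struct sockaddr_storage *ss)
{
	switch (ss->ss_family) {
	case AF_INET: {
		const struct sockaddr_in *sin = (const struct sockaddr_in *)ss;
		return sin->sin_addr.s_addr != htonl(INADDR_ANY);
	}
	case AF_INET6: {
		const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)ss;
		return memcmp(&sin6->sin6_addr, &in6addr_any,
			      sizeof(in6addr_any)) != 0;
	}
	}
	return 0;	/* uninitialized family: not a usable server address */
}
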
*/ -static int nfs_verify_server_address(struct sockaddr *addr) +static int nfs_verify_server_address(struct sockaddr_storage *addr) { - switch (addr->sa_family) { + switch (addr->ss_family) { case AF_INET: { struct sockaddr_in *sa = (struct sockaddr_in *)addr; return sa->sin_addr.s_addr != htonl(INADDR_ANY); @@ -684,6 +684,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, return ret; break; case Opt_vers: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); ret = nfs_parse_version_string(fc, param->string); if (ret < 0) @@ -696,6 +698,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_proto: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); protofamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { @@ -732,6 +736,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_mountproto: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); mountfamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { @@ -969,7 +975,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_fh *mntfh = ctx->mntfh; - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; int ret; @@ -1044,7 +1050,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, memcpy(sap, &data->addr, sizeof(data->addr)); ctx->nfs_server.addrlen = sizeof(data->addr); ctx->nfs_server.port = ntohs(data->addr.sin_port); - if (sap->sa_family != AF_INET || + if (sap->ss_family != AF_INET || !nfs_verify_server_address(sap)) goto out_no_address; @@ -1200,7 +1206,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc, struct nfs4_mount_data *data) { struct nfs_fs_context *ctx = nfs_fc2context(fc); - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int ret; char *c; @@ -1314,7 +1320,7 @@ static int nfs_fs_context_validate(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_subversion *nfs_mod; - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int max_namelen = PAGE_SIZE; int max_pathlen = NFS_MAXPATHLEN; int port = 0; @@ -1540,7 +1546,7 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->version = nfss->nfs_client->rpc_ops->version; ctx->minorversion = nfss->nfs_client->cl_minorversion; - memcpy(&ctx->nfs_server.address, &nfss->nfs_client->cl_addr, + memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr, ctx->nfs_server.addrlen); if (fc->net_ns != net) { diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e861d7bae305..e731c00a9fcb 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -252,7 +252,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -282,7 +282,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, bvec[0].bv_page = page; 
bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bea7c005119c..e98ee7599eeb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -static int nfs_wait_killable(int mode) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } - -int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) -{ - return nfs_wait_killable(mode); -} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); /** @@ -318,7 +313,7 @@ struct nfs_find_desc { static int nfs_find_actor(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; @@ -336,7 +331,7 @@ nfs_find_actor(struct inode *inode, void *opaque) static int nfs_init_locked(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); @@ -1173,7 +1168,8 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), + flags_to_mode(filp->f_flags), filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); @@ -1332,7 +1328,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) */ for (;;) { ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; spin_lock(&inode->i_lock); @@ -2271,7 +2268,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) static void init_once(void *foo) { - struct nfs_inode *nfsi = (struct nfs_inode *) foo; + struct nfs_inode *nfsi = foo; inode_init_once(&nfsi->vfs_inode); INIT_LIST_HEAD(&nfsi->open_files); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 898dd95bc7a7..ae7d4a8c728c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -69,7 +69,7 @@ static inline fmode_t flags_to_mode(int flags) struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ - const struct sockaddr *addr; /* Address of the server */ + const struct sockaddr_storage *addr; /* Address of the server */ const char *nodename; /* Hostname of the client */ const char *ip_addr; /* IP address of the client */ size_t addrlen; @@ -180,7 +180,7 @@ static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc) /* mount_clnt.c */ struct nfs_mount_request { - struct sockaddr *sap; + struct sockaddr_storage *sap; size_t salen; char *hostname; char *dirpath; @@ -223,7 +223,7 @@ extern void nfs4_server_set_init_caps(struct nfs_server *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen, + struct sockaddr_storage *sap, size_t 
salen, struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, @@ -235,7 +235,7 @@ extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, @@ -243,7 +243,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans); #ifdef CONFIG_PROC_FS @@ -435,7 +435,6 @@ extern void nfs_zap_acl_cache(struct inode *inode); extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); -extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); /* super.c */ extern const struct super_operations nfs_sops; @@ -503,7 +502,6 @@ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); -extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, @@ -741,12 +739,10 @@ unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) iosize = NFS_DEF_FILE_IO_SIZE; else if (iosize >= NFS_MAX_FILE_IO_SIZE) iosize = NFS_MAX_FILE_IO_SIZE; - else - iosize = iosize & PAGE_MASK; - if (proto == XPRT_TRANSPORT_UDP) + if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE) return nfs_block_bits(iosize, NULL); - return iosize; + return iosize & PAGE_MASK; } /* @@ -896,13 +892,13 @@ static inline bool nfs_error_is_fatal_on_server(int err) * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. 
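To make the port rule in this comment concrete, here is a stripped-down model of the helper it documents; the real nfs_set_port() additionally pokes the chosen value into the sockaddr via rpc_set_port(), which this sketch omits:

#define NFS_UNSPEC_PORT	(-1)

/* NFS_UNSPEC_PORT means "use the protocol default"; an explicit 0 is
 * preserved so the RPC layer falls back to rpcbind autobind. */
static unsigned short nfs_choose_port(int requested,
				      unsigned short default_port)
{
	if (requested == NFS_UNSPEC_PORT)
		requested = default_port;
	return (unsigned short)requested;
}
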
*/ -static inline void nfs_set_port(struct sockaddr *sap, int *port, +static inline void nfs_set_port(struct sockaddr_storage *sap, int *port, const unsigned short default_port) { if (*port == NFS_UNSPEC_PORT) *port = default_port; - rpc_set_port(sap, *port); + rpc_set_port((struct sockaddr *)sap, *port); } struct nfs_direct_req { diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index c5e3b6b3366a..68e76b626371 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -158,7 +158,7 @@ int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans) struct rpc_create_args args = { .net = info->net, .protocol = info->protocol, - .address = info->sap, + .address = (struct sockaddr *)info->sap, .addrsize = info->salen, .timeout = &mnt_timeout, .servername = info->hostname, @@ -245,7 +245,7 @@ void nfs_umount(const struct nfs_mount_request *info) struct rpc_create_args args = { .net = info->net, .protocol = IPPROTO_UDP, - .address = info->sap, + .address = (struct sockaddr *)info->sap, .addrsize = info->salen, .timeout = &nfs_umnt_timeout, .servername = info->hostname, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 3295af4110f1..b0ef7e7ddb30 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -147,7 +147,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fs_context *ctx; struct fs_context *fc; struct vfsmount *mnt = ERR_PTR(-ENOMEM); - struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); + struct nfs_server *server = NFS_SB(path->dentry->d_sb); struct nfs_client *client = server->nfs_client; int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); int ret; @@ -175,7 +175,7 @@ struct vfsmount *nfs_d_automount(struct path *path) } /* for submounts we want the same server; referrals will reassign */ - memcpy(&ctx->nfs_server.address, &client->cl_addr, client->cl_addrlen); + memcpy(&ctx->nfs_server._address, &client->cl_addr, client->cl_addrlen); ctx->nfs_server.addrlen = client->cl_addrlen; ctx->nfs_server.port = server->port; @@ -354,7 +354,7 @@ static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp) num = (num + (HZ - 1)) / HZ; } else num = -1; - return scnprintf(buffer, PAGE_SIZE, "%li\n", num); + return sysfs_emit(buffer, "%li\n", num); } static const struct kernel_param_ops param_ops_nfs_timeout = { diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 03a4e679fd99..df9ca56db347 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,7 @@ */ #ifdef CONFIG_NFS_V3_ACL extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); -extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 93de0b58647a..74d11e3c4205 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -255,23 +255,24 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; + struct inode *inode = d_inode(dentry); int status; if (S_ISDIR(inode->i_mode)) { switch(type) { case ACL_TYPE_ACCESS: - alloc = get_acl(inode, ACL_TYPE_DEFAULT); + alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT); 
if (IS_ERR(alloc)) goto fail; dfacl = alloc; break; case ACL_TYPE_DEFAULT: - alloc = get_acl(inode, ACL_TYPE_ACCESS); + alloc = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(alloc)) goto fail; dfacl = acl; @@ -312,7 +313,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, struct posix_acl *acl; char *p = data + *result; - acl = get_acl(inode, type); + acl = get_inode_acl(inode, type); if (IS_ERR_OR_NULL(acl)) return 0; diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index b49359afac88..669cda757a5c 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -78,7 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, * the MDS. */ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; @@ -98,7 +98,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, char buf[INET6_ADDRSTRLEN + 1]; /* fake a hostname because lockd wants it */ - if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0) return ERR_PTR(-EINVAL); cl_init.hostname = buf; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1597eef40d54..4bf208a0a8e9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX) break; - freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; @@ -997,7 +998,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; @@ -1008,7 +1009,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6dab9e408372..ecb428512fe1 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -341,7 +341,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } } - status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + status = nfs_filemap_write_and_wait_range(src->f_mapping, pos_src, pos_src + (loff_t)count - 1); if (status) return status; @@ -1093,6 +1093,9 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, &args.seq_args, &res.seq_res, 0); trace_nfs4_clone(src_inode, dst_inode, &args, status); if (status == 0) { + /* a zero-length count means clone to EOF in src */ + if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) + count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; nfs42_copy_dest_done(dst_inode, dst_offset, count); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } @@ -1175,6 +1178,7 @@ static int _nfs42_proc_removexattr(struct inode *inode, const char *name) ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + trace_nfs4_removexattr(inode, name, ret); if (!ret) 
nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); @@ -1214,6 +1218,7 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + trace_nfs4_setxattr(inode, name, ret); for (; np > 0; np--) put_page(pages[np - 1]); @@ -1246,6 +1251,7 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + trace_nfs4_getxattr(inode, name, ret); if (ret < 0) return ret; @@ -1317,6 +1323,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + trace_nfs4_listxattr(inode, ret); if (ret >= 0) { ret = res.copied; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index a9bf09fdf2c3..76ae11834206 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -981,7 +981,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) static void nfs4_xattr_cache_init_once(void *p) { - struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p; + struct nfs4_xattr_cache *cache = p; spin_lock_init(&cache->listxattr_lock); atomic_long_set(&cache->nent, 0); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b56f05113d36..d80ee88ca996 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -47,13 +47,14 @@ #define decode_deallocate_maxsz (op_decode_hdr_maxsz) #define encode_read_plus_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \ +#define NFS42_READ_PLUS_DATA_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ 2 /* data_info4.di_offset */ + \ - 2 /* data_info4.di_length */) + 1 /* data_info4.di_length */) #define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ 1 /* rpr_eof */ + \ 1 /* rpr_contents count */ + \ - 2 * NFS42_READ_PLUS_SEGMENT_SIZE) + NFS42_READ_PLUS_DATA_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ @@ -569,6 +570,14 @@ static int decode_listxattrs(struct xdr_stream *xdr, */ if (status == -ETOOSMALL) status = -ERANGE; + /* + * Special case: for LISTXATTRS, NFS4ERR_NOXATTR + * should be translated to success with zero-length reply. 
+ */ + if (status == -ENODATA) { + res->eof = true; + status = 0; + } goto out; } @@ -1134,7 +1143,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (!segs) return -ENOMEM; - xdr_set_scratch_buffer(xdr, &scratch_buf, 32); + xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf)); status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 79df6e83881b..5edd1704f735 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -149,6 +149,7 @@ struct nfs4_lock_state { struct nfs4_state * ls_state; /* Pointer to open state */ #define NFS_LOCK_INITIALIZED 0 #define NFS_LOCK_LOST 1 +#define NFS_LOCK_UNLOCKING 2 unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; @@ -281,7 +282,7 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); -size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss, size_t salen, struct net *net, int port); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); @@ -459,7 +460,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs_client *); -extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 3c5678aec006..d3051b051a56 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -254,7 +254,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) goto error; ip_addr = (const char *)buf; } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strscpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); err = nfs_idmap_new(clp); if (err < 0) { @@ -346,6 +346,7 @@ int nfs40_init_client(struct nfs_client *clp) ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE, "NFSv4.0 transport Slot table"); if (ret) { + nfs4_shutdown_slot_table(tbl); kfree(tbl); return ret; } @@ -889,7 +890,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, */ static int nfs4_set_client(struct nfs_server *server, const char *hostname, - const struct sockaddr *addr, + const struct sockaddr_storage *addr, const size_t addrlen, const char *ip_addr, int proto, const struct rpc_timeout *timeparms, @@ -924,7 +925,7 @@ static int nfs4_set_client(struct nfs_server *server, __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); - server->port = rpc_get_port(addr); + server->port = rpc_get_port((struct sockaddr *)addr); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); @@ -960,7 +961,7 @@ static int nfs4_set_client(struct nfs_server *server, * the MDS. 
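One of the tree-wide cleanups visible here is strlcpy() -> strscpy() (cl_ipaddr above, nfs_root_parms later in this series). The semantic difference, as a rough userspace approximation rather than the kernel implementation: strscpy() always NUL-terminates, never walks the whole source just to compute a length, and signals truncation with -E2BIG instead of returning the would-be length:

#include <errno.h>
#include <stddef.h>

static long my_strscpy(char *dst, const char *src, size_t size)
{
	size_t i;

	if (size == 0)
		return -E2BIG;
	for (i = 0; i < size - 1 && src[i] != '\0'; i++)
		dst[i] = src[i];
	dst[i] = '\0';
	return src[i] == '\0' ? (long)i : -E2BIG;	/* truncated? */
}
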
*/ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version) { @@ -980,7 +981,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, }; char buf[INET6_ADDRSTRLEN + 1]; - if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0) return ERR_PTR(-EINVAL); cl_init.hostname = buf; @@ -1148,7 +1149,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) /* Get a client record */ error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, ctx->client_address, ctx->nfs_server.protocol, @@ -1238,7 +1239,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT); error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, parent_client->cl_ipaddr, XPRT_TRANSPORT_RDMA, @@ -1254,7 +1255,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) rpc_set_port(&ctx->nfs_server.address, NFS_PORT); error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, parent_client->cl_ipaddr, XPRT_TRANSPORT_TCP, @@ -1303,14 +1304,14 @@ error: * Returns zero on success, or a negative errno value. */ int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen, struct net *net) + struct sockaddr_storage *sap, size_t salen, struct net *net) { struct nfs_client *clp = server->nfs_client; struct rpc_clnt *clnt = server->client; struct xprt_create xargs = { .ident = clp->cl_proto, .net = net, - .dstaddr = sap, + .dstaddr = (struct sockaddr *)sap, .addrlen = salen, .servername = hostname, }; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 9eb181287879..2563ed8580f3 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -32,7 +32,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; - fmode_t f_mode; struct iattr attr; int err; @@ -51,17 +50,14 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (err) return err; - f_mode = filp->f_mode; - if ((openflags & O_ACCMODE) == 3) - f_mode |= flags_to_mode(openflags); - /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), + flags_to_mode(openflags), filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -366,8 +362,8 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out_free_name; } - ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode, - filep); + ctx = alloc_nfs_open_context(filep->f_path.dentry, + flags_to_mode(filep->f_flags), filep); if (IS_ERR(ctx)) { res = ERR_CAST(ctx); goto out_filep; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index ec6afd3c4bca..25a7c771cfd8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -203,7 +203,7 @@ int nfs_idmap_init(void) printk(KERN_NOTICE "NFS: Registering the %s key type\n", key_type_id_resolver.name); - cred = prepare_kernel_cred(NULL); + 
cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; @@ -583,7 +583,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; - struct idmap *idmap = (struct idmap *)aux; + struct idmap *idmap = aux; struct key *key = rka->target_key; int ret = -ENOKEY; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f2dbf904c598..9a98595bb160 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -164,16 +164,17 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss, size_t salen, struct net *net, int port) { + struct sockaddr *sa = (struct sockaddr *)ss; ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); if (ret == 0) { ret = rpc_uaddr2sockaddr(net, string, len, sa, salen); if (ret == 0) { - ret = nfs_dns_resolve_name(net, string, len, sa, salen); + ret = nfs_dns_resolve_name(net, string, len, ss, salen); if (ret < 0) ret = 0; } @@ -331,7 +332,7 @@ static int try_location(struct fs_context *fc, ctx->nfs_server.addrlen = nfs_parse_server_name(buf->data, buf->len, - &ctx->nfs_server.address, + &ctx->nfs_server._address, sizeof(ctx->nfs_server._address), fc->net_ns, 0); if (ctx->nfs_server.addrlen == 0) @@ -483,14 +484,13 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, char *page, char *page2, const struct nfs4_fs_location *location) { - const size_t addr_bufsize = sizeof(struct sockaddr_storage); struct net *net = rpc_net_ns(server->client); - struct sockaddr *sap; + struct sockaddr_storage *sap; unsigned int s; size_t salen; int error; - sap = kmalloc(addr_bufsize, GFP_KERNEL); + sap = kmalloc(sizeof(*sap), GFP_KERNEL); if (sap == NULL) return -ENOMEM; @@ -506,10 +506,10 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, continue; salen = nfs_parse_server_name(buf->data, buf->len, - sap, addr_bufsize, net, 0); + sap, sizeof(*sap), net, 0); if (salen == 0) continue; - rpc_set_port(sap, NFS_PORT); + rpc_set_port((struct sockaddr *)sap, NFS_PORT); error = -ENOMEM; hostname = kmemdup_nul(buf->data, buf->len, GFP_KERNEL); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3ed14a2a84a4..40d749f29ed3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -122,6 +122,11 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) return NULL; + label->lfs = 0; + label->pi = 0; + label->len = 0; + label->label = NULL; + err = security_dentry_init_security(dentry, sattr->ia_mode, &dentry->d_name, NULL, (void **)&label->label, &label->len); @@ -416,8 +421,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); - freezable_schedule_timeout_killable_unsafe( - nfs4_update_delay(timeout)); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) return 0; return -EINTR; @@ -427,7 +432,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); - freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout)); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) return 0; return __fatal_signal_pending(current) ? 
-EINTR :-ERESTARTSYS; @@ -2125,18 +2131,18 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context } static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, - fmode_t fmode) + fmode_t fmode) { struct nfs4_state *newstate; + struct nfs_server *server = NFS_SB(opendata->dentry->d_sb); + int openflags = opendata->o_arg.open_flags; int ret; if (!nfs4_mode_match_open_stateid(opendata->state, fmode)) return 0; - opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; - opendata->o_arg.share_access = nfs4_map_atomic_open_share( - NFS_SB(opendata->dentry->d_sb), - fmode, 0); + opendata->o_arg.share_access = + nfs4_map_atomic_open_share(server, fmode, openflags); memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); @@ -2624,8 +2630,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) */ static int nfs4_opendata_access(const struct cred *cred, struct nfs4_opendata *opendata, - struct nfs4_state *state, fmode_t fmode, - int openflags) + struct nfs4_state *state, fmode_t fmode) { struct nfs_access_entry cache; u32 mask, flags; @@ -2636,11 +2641,7 @@ static int nfs4_opendata_access(const struct cred *cred, return 0; mask = 0; - /* - * Use openflags to check for exec, because fmode won't - * always have FMODE_EXEC set when file open for exec. - */ - if (openflags & __FMODE_EXEC) { + if (fmode & FMODE_EXEC) { /* ONLY check for exec rights */ if (S_ISDIR(state->inode->i_mode)) mask = NFS4_ACCESS_LOOKUP; @@ -2718,10 +2719,15 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s struct nfs4_opendata *opendata; int ret; - opendata = nfs4_open_recoverdata_alloc(ctx, state, - NFS4_OPEN_CLAIM_FH); + opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); + /* + * We're not recovering a delegation, so ask for no delegation. + * Otherwise the recovery thread could deadlock with an outstanding + * delegation return. 
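For context on the nfs4_opendata_access() change a few hunks up: the exec check now keys off FMODE_EXEC in the computed open mode rather than __FMODE_EXEC in the raw open flags. The mask selection, reduced to a sketch — the constants are illustrative values, and the non-directory arm is inferred from the surrounding context:

#define FMODE_EXEC	0x20	/* illustrative, not the kernel's value */
#define ACCESS_LOOKUP	0x02
#define ACCESS_EXECUTE	0x01

/* Directories check LOOKUP rights; regular files check EXECUTE */
static unsigned int exec_access_mask(unsigned int fmode, int is_dir)
{
	if (!(fmode & FMODE_EXEC))
		return 0;
	return is_dir ? ACCESS_LOOKUP : ACCESS_EXECUTE;
}
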
+ */ + opendata->o_arg.open_flags = O_DIRECT; ret = nfs4_open_recover(opendata, state); if (ret == -ESTALE) d_drop(ctx->dentry); @@ -3023,7 +3029,7 @@ static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata, } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, - int flags, struct nfs_open_context *ctx) + struct nfs_open_context *ctx) { struct nfs4_state_owner *sp = opendata->owner; struct nfs_server *server = sp->so_server; @@ -3084,8 +3090,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, /* Parse layoutget results before we check for access */ pnfs_parse_lgopen(state->inode, opendata->lgp, ctx); - ret = nfs4_opendata_access(sp->so_cred, opendata, state, - acc_mode, flags); + ret = nfs4_opendata_access(sp->so_cred, opendata, state, acc_mode); if (ret != 0) goto out; @@ -3159,7 +3164,7 @@ static int _nfs4_do_open(struct inode *dir, if (d_really_is_positive(dentry)) opendata->state = nfs4_get_open_state(d_inode(dentry), sp); - status = _nfs4_open_and_get_state(opendata, flags, ctx); + status = _nfs4_open_and_get_state(opendata, ctx); if (status != 0) goto err_opendata_put; state = ctx->state; @@ -3795,7 +3800,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr, int *opened) { struct nfs4_state *state; - struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; + struct nfs4_label l, *label; label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); @@ -3950,7 +3955,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, for (i = 0; i < location->nservers; i++) { struct nfs4_string *srv_loc = &location->servers[i]; - struct sockaddr addr; + struct sockaddr_storage addr; size_t addrlen; struct xprt_create xprt_args = { .ident = 0, @@ -3973,7 +3978,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, clp->cl_net, server->port); if (!addrlen) return; - xprt_args.dstaddr = &addr; + xprt_args.dstaddr = (struct sockaddr *)&addr; xprt_args.addrlen = addrlen; servername = kmalloc(srv_loc->len + 1, GFP_KERNEL); if (!servername) @@ -4012,7 +4017,7 @@ static int _nfs4_discover_trunking(struct nfs_server *server, page = alloc_page(GFP_KERNEL); if (!page) - return -ENOMEM; + goto out_put_cred; locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (!locations) goto out_free; @@ -4034,6 +4039,8 @@ out_free_2: kfree(locations); out_free: __free_page(page); +out_put_cred: + put_cred(cred); return status; } @@ -4681,7 +4688,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_label l, *ilabel = NULL; + struct nfs4_label l, *ilabel; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; @@ -5032,7 +5039,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5073,7 +5080,7 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5192,7 +5199,7 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct 
nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -6607,7 +6614,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) struct nfs4_delegreturndata *d_data; struct pnfs_layout_hdr *lo; - d_data = (struct nfs4_delegreturndata *)data; + d_data = data; if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) { nfs4_sequence_done(task, &d_data->res.seq_res); @@ -7016,12 +7023,13 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_unlock(&sp->so_delegreturn_mutex); goto out; } + lsp = request->fl_u.nfs4_fl.owner; + set_bit(NFS_LOCK_UNLOCKING, &lsp->ls_flags); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? */ - lsp = request->fl_u.nfs4_fl.owner; if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) goto out; alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid; @@ -7137,6 +7145,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; struct nfs4_lock_state *lsp = data->lsp; + struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry)); if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -7144,8 +7153,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; switch (task->tk_status) { case 0: - renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), - data->timestamp); + renew_lease(server, data->timestamp); if (data->arg.new_lock && !data->cancelled) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) @@ -7166,6 +7174,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) if (!nfs4_stateid_match(&data->arg.open_stateid, &lsp->ls_state->open_stateid)) goto out_restart; + else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN) + goto out_restart; } else if (!nfs4_stateid_match(&data->arg.lock_stateid, &lsp->ls_stateid)) goto out_restart; @@ -7406,7 +7416,8 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; - freezable_schedule_timeout_interruptible(timeout); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(timeout); timeout *= 2; timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); status = -ERESTARTSYS; @@ -7474,10 +7485,8 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) break; status = -ERESTARTSYS; - freezer_do_not_count(); - wait_woken(&waiter.wait, TASK_INTERRUPTIBLE, + wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE, NFS4_LOCK_MAXTIMEOUT); - freezer_count(); } while (!signalled()); remove_wait_queue(q, &waiter.wait); @@ -8900,7 +8909,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred) void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + struct nfs4_add_xprt_data *adata = data; struct rpc_task *task; int status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9bab3e9c702a..2a0ca5c7f082 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -497,8 +497,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; - sp->so_seqid.owner_id = 
ida_simple_get(&server->openowner_id, 0, 0, - gfp_flags); + sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags); if (sp->so_seqid.owner_id < 0) { kfree(sp); return NULL; @@ -534,7 +533,7 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); put_cred(sp->so_cred); - ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id); + ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id); kfree(sp); } @@ -877,8 +876,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; - lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, - 0, 0, GFP_KERNEL_ACCOUNT); + lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT); if (lsp->ls_seqid.owner_id < 0) goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); @@ -890,7 +888,7 @@ out_free: void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id); + ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id); nfs4_destroy_seqid_counter(&lsp->ls_seqid); kfree(lsp); } @@ -1232,6 +1230,8 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) if (IS_ERR(task)) { printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); + if (!nfs_client_init_is_complete(clp)) + nfs_mark_client_ready(clp, PTR_ERR(task)); nfs4_clear_state_manager_bit(clp); clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); @@ -1314,7 +1314,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (res) goto out; if (clp->cl_cons_state < 0) @@ -1502,7 +1503,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct file_lock *fl; struct nfs4_lock_state *lsp; int status = 0; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; if (flctx == NULL) @@ -1620,7 +1621,8 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { trace_nfs4_state_lock_reclaim(state, lock); - if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags) && + !test_bit(NFS_LOCK_UNLOCKING, &lock->ls_flags)) *lost_locks += 1; } spin_unlock(&state->state_lock); @@ -1787,6 +1789,7 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) { + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); /* Mark all delegations for reclaim */ nfs_delegation_mark_reclaim(clp); nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); @@ -2671,6 +2674,7 @@ static void nfs4_state_manager(struct nfs_client *clp) if (status < 0) goto out_error; nfs4_state_end_reclaim_reboot(clp); + continue; } /* Detect expired delegations... 
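On the ida_simple_get()/ida_simple_remove() conversions above: ida_simple_get(ida, 0, 0, gfp) is spelled ida_alloc(ida, gfp) in the newer API, and both hand out the smallest free non-negative ID with no upper bound. A toy fixed-size allocator showing the same contract (nothing like the kernel's radix-tree-backed IDA internally):

#include <errno.h>

#define MAX_IDS 64

static unsigned long long id_bitmap;	/* bit n set => ID n in use */

/* Return the lowest free ID, or a negative errno when none is left */
static int id_alloc(void)
{
	for (int i = 0; i < MAX_IDS; i++) {
		if (!(id_bitmap & (1ULL << i))) {
			id_bitmap |= 1ULL << i;
			return i;
		}
	}
	return -ENOMEM;
}

static void id_free(int id)
{
	if (id >= 0 && id < MAX_IDS)
		id_bitmap &= ~(1ULL << id);
}
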
*/ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 6ee6ad3674a2..214bc56f92d2 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -9,10 +9,10 @@ #define _TRACE_NFS4_H #include <linux/tracepoint.h> -#include <trace/events/sunrpc_base.h> +#include <trace/misc/sunrpc.h> -#include <trace/events/fs.h> -#include <trace/events/nfs.h> +#include <trace/misc/fs.h> +#include <trace/misc/nfs.h> #define show_nfs_fattr_flags(valid) \ __print_flags((unsigned long)valid, "|", \ @@ -1815,7 +1815,7 @@ TRACE_EVENT(pnfs_update_layout, __entry->count = count; __entry->iomode = iomode; __entry->reason = reason; - if (lo != NULL) { + if (lo != NULL && pnfs_layout_is_valid(lo)) { __entry->layoutstateid_seq = be32_to_cpu(lo->plh_stateid.seqid); __entry->layoutstateid_hash = @@ -1869,7 +1869,7 @@ DECLARE_EVENT_CLASS(pnfs_layout_event, __entry->pos = pos; __entry->count = count; __entry->iomode = iomode; - if (lo != NULL) { + if (lo != NULL && pnfs_layout_is_valid(lo)) { __entry->layoutstateid_seq = be32_to_cpu(lo->plh_stateid.seqid); __entry->layoutstateid_hash = @@ -2097,6 +2097,7 @@ TRACE_EVENT(ff_layout_commit_error, ) ); +#ifdef CONFIG_NFS_V4_2 TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA); TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); @@ -2105,7 +2106,6 @@ TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); { NFS4_CONTENT_DATA, "DATA" }, \ { NFS4_CONTENT_HOLE, "HOLE" }) -#ifdef CONFIG_NFS_V4_2 TRACE_EVENT(nfs4_llseek, TP_PROTO( const struct inode *inode, @@ -2496,6 +2496,54 @@ TRACE_EVENT(nfs4_offload_cancel, __entry->stateid_seq, __entry->stateid_hash ) ); + +DECLARE_EVENT_CLASS(nfs4_xattr_event, + TP_PROTO( + const struct inode *inode, + const char *name, + int error + ), + + TP_ARGS(inode, name, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(name, name) + ), + + TP_fast_assign( + __entry->error = error < 0 ? 
-error : 0; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __assign_str(name, name); + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "name=%s", + -__entry->error, show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __get_str(name) + ) +); +#define DEFINE_NFS4_XATTR_EVENT(name) \ + DEFINE_EVENT(nfs4_xattr_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const char *name, \ + int error \ + ), \ + TP_ARGS(inode, name, error)) +DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr); + +DEFINE_NFS4_INODE_EVENT(nfs4_listxattr); #endif /* CONFIG_NFS_V4_2 */ #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index acfe5f4bda48..deec76cf5afe 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4234,19 +4234,17 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) return -EIO; + bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; if (len < NFS4_MAXLABELLEN) { - if (label) { - if (label->len) { - if (label->len < len) - return -ERANGE; - memcpy(label->label, p, len); - } + if (label && label->len) { + if (label->len < len) + return -ERANGE; + memcpy(label->label, p, len); label->len = len; label->pi = pi; label->lfs = lfs; status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; } - bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); @@ -4755,12 +4753,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, if (status < 0) goto xdr_error; - if (fattr->label) { - status = decode_attr_security_label(xdr, bitmap, fattr->label); - if (status < 0) - goto xdr_error; - fattr->valid |= status; - } + status = decode_attr_security_label(xdr, bitmap, fattr->label); + if (status < 0) + goto xdr_error; + fattr->valid |= status; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index fa148308822c..620329b7e6ae 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -139,7 +139,7 @@ static int __init nfs_root_setup(char *line) ROOT_DEV = Root_NFS; if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { - strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + strscpy(nfs_root_parms, line, sizeof(nfs_root_parms)); } else { size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; if (n >= sizeof(nfs_root_parms)) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 8c6cc58679ff..642f6921852f 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -11,9 +11,9 @@ #include <linux/tracepoint.h> #include <linux/iversion.h> -#include <trace/events/fs.h> -#include <trace/events/nfs.h> -#include <trace/events/sunrpc_base.h> +#include <trace/misc/fs.h> +#include <trace/misc/nfs.h> +#include <trace/misc/sunrpc.h> #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 317cedfa52bf..16be6dae524f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1055,7 +1055,7 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev))) return 0; - flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx; + flctx = 
locks_inode_context(d_inode(nfs_req_openctx(req)->dentry)); if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2613b7e36eb9..a5db5158c634 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -710,6 +710,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -722,8 +723,10 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); - if (!mark_lseg_invalid(lseg, tmp_list)) - remaining++; + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; + pnfs_lseg_cancel_io(server, lseg); } dprintk("%s:Return %i\n", __func__, remaining); return remaining; @@ -1908,7 +1911,7 @@ static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) pnfs_layoutcommit_inode(lo->plh_inode, false); return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) @@ -2485,6 +2488,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -2507,6 +2511,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + pnfs_lseg_cancel_io(server, lseg); } if (remaining) { @@ -3192,7 +3197,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (status) goto out; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f331f067691b..e3e6a41f19de 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -169,6 +169,8 @@ struct pnfs_layoutdriver_type { void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + + void (*cancel_io)(struct pnfs_layout_segment *lseg); }; struct pnfs_commit_ops { @@ -685,6 +687,13 @@ pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page req_offset(req), req_last); } +static inline void pnfs_lseg_cancel_io(struct nfs_server *server, + struct pnfs_layout_segment *lseg) +{ + if (server->pnfs_curr_ld->cancel_io) + server->pnfs_curr_ld->cancel_io(lseg); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 657c242a18ff..5d035dd2d7bf 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -374,12 +374,12 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, return NULL; } -/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest +/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request * for @page * @cinfo - commit info for current inode * @page - page to search for matching head request * - * Returns a the head request if one is found, otherwise returns NULL. + * Return: the head request if one is found, otherwise %NULL. 
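The new cancel_io hook above is deliberately optional: pnfs_lseg_cancel_io() tests the pointer before calling through it, so layout drivers that never set it are unaffected. A minimal sketch of a driver opting in (example_layout_cancel_io and example_layoutdriver are illustrative names, not part of this patch):

static void example_layout_cancel_io(struct pnfs_layout_segment *lseg)
{
	/* Abort or drain any I/O still holding a reference to @lseg. */
}

static struct pnfs_layoutdriver_type example_layoutdriver = {
	.name      = "example",
	.cancel_io = example_layout_cancel_io,
	/* ... the driver's other ops elided ... */
};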
*/ struct nfs_page * pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) @@ -821,7 +821,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) static struct nfs_client *(*get_v3_ds_connect)( struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, @@ -882,7 +882,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, continue; } clp = get_v3_ds_connect(mds_srv, - (struct sockaddr *)&da->da_addr, + &da->da_addr, da->da_addrlen, da->da_transport, timeo, retrans); if (IS_ERR(clp)) @@ -951,7 +951,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, put_cred(xprtdata.cred); } else { clp = nfs4_set_ds_client(mds_srv, - (struct sockaddr *)&da->da_addr, + &da->da_addr, da->da_addrlen, da->da_transport, timeo, retrans, minor_version); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ee66ffdb985e..05ae23657527 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -822,8 +822,7 @@ static int nfs_request_mount(struct fs_context *fc, { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_mount_request request = { - .sap = (struct sockaddr *) - &ctx->mount_server.address, + .sap = &ctx->mount_server._address, .dirpath = ctx->nfs_server.export_path, .protocol = ctx->mount_server.protocol, .fh = root_fh, @@ -854,7 +853,7 @@ static int nfs_request_mount(struct fs_context *fc, * Construct the mount server's address. */ if (ctx->mount_server.address.sa_family == AF_UNSPEC) { - memcpy(request.sap, &ctx->nfs_server.address, + memcpy(request.sap, &ctx->nfs_server._address, ctx->nfs_server.addrlen); ctx->mount_server.addrlen = ctx->nfs_server.addrlen; } diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index a6f740366963..0cbcd2dfa732 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -26,7 +26,7 @@ static void nfs_netns_object_release(struct kobject *kobj) } static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type( - struct kobject *kobj) + const struct kobject *kobj) { return &net_ns_type_operations; } @@ -82,7 +82,7 @@ static ssize_t nfs_netns_identifier_show(struct kobject *kobj, ssize_t ret; rcu_read_lock(); - ret = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(c->identifier)); + ret = sysfs_emit(buf, "%s\n", rcu_dereference(c->identifier)); rcu_read_unlock(); return ret; } @@ -130,7 +130,7 @@ static void nfs_netns_client_release(struct kobject *kobj) kfree(c); } -static const void *nfs_netns_client_namespace(struct kobject *kobj) +static const void *nfs_netns_client_namespace(const struct kobject *kobj) { return container_of(kobj, struct nfs_netns_client, kobject)->net; } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 9697cd5d2561..150a953a8be9 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -139,6 +139,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf */ spin_lock(&alias->d_lock); if (d_really_is_positive(alias) && + !nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f41d24b54fd1..80c240e50952 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1185,7 +1185,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; - struct file_lock_context *flctx = file_inode(file)->i_flctx; + struct file_lock_context 
*flctx = locks_inode_context(file_inode(file)); struct nfs_page *req; int do_flush, status; /* @@ -1321,7 +1321,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode, unsigned int pagelen) { int ret; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct file_lock *fl; if (file->f_flags & O_DSYNC) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f6a2fd3015e7..7c441f2bd444 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -8,6 +8,7 @@ config NFSD select SUNRPC select EXPORTFS select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFS_ACL_SUPPORT if NFSD_V3_ACL depends on MULTIUSER help Choose Y here if you want to allow other computers to access @@ -26,19 +27,29 @@ config NFSD Below you can choose which versions of the NFS protocol are available to clients mounting the NFS server on this system. - Support for NFS version 2 (RFC 1094) is always available when + Support for NFS version 3 (RFC 1813) is always available when CONFIG_NFSD is selected. If unsure, say N. -config NFSD_V2_ACL - bool +config NFSD_V2 + bool "NFS server support for NFS version 2 (DEPRECATED)" depends on NFSD + default n + help + NFSv2 (RFC 1094) was the first publicly-released version of NFS. + Unless you are hosting ancient (1990's era) NFS clients, you don't + need this. + + If unsure, say N. + +config NFSD_V2_ACL + bool "NFS server support for the NFSv2 ACL protocol extension" + depends on NFSD_V2 config NFSD_V3_ACL bool "NFS server support for the NFSv3 ACL protocol extension" depends on NFSD - select NFSD_V2_ACL help Solaris NFS servers support an auxiliary NFSv3 ACL protocol that never became an official part of the NFS version 3 protocol. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 805c06d5f1b4..6fffc8f03f74 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -10,9 +10,10 @@ obj-$(CONFIG_NFSD) += nfsd.o # this one should be compiled first, as the tracing macros can easily blow up nfsd-y += trace.o -nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o \ +nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o \ stats.o filecache.o nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index b6d01d51a746..04697f8dc37d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -12,6 +12,7 @@ #include "blocklayoutxdr.h" #include "pnfs.h" #include "filecache.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 442543304930..8e9c1a0f8d38 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -9,6 +9,7 @@ #include "nfsd.h" #include "blocklayoutxdr.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 65c331f75e9c..f21259ead64b 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,6 +84,6 @@ int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); -int nfsd_reply_cache_stats_open(struct inode *, struct file *); +int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); #endif /* NFSCACHE_H */ diff --git 
a/fs/nfsd/export.h b/fs/nfsd/export.h index ee0e3aba4a6e..d03f7f6a8642 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -115,7 +115,6 @@ struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); int exp_rootfh(struct net *, struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -__be32 nfserrno(int errno); static inline void exp_put(struct svc_export *exp) { diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index eeed4ae5b4ad..45b2c9e3f636 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1,7 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * Open file cache. + * The NFSD open file cache. * * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com> + * + * An nfsd_file object is a per-file collection of open state that binds + * together: + * - a struct file * + * - a user credential + * - a network namespace + * - a read-ahead context + * - monitoring for writeback errors + * + * nfsd_file objects are reference-counted. Consumers acquire a new + * object via the nfsd_file_acquire API. They manage their interest in + * the acquired object, and hence the object's reference count, via + * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file + * object: + * + * * non-garbage-collected: When a consumer wants to precisely control + * the lifetime of a file's open state, it acquires a non-garbage- + * collected nfsd_file. The final nfsd_file_put releases the open + * state immediately. + * + * * garbage-collected: When a consumer does not control the lifetime + * of open state, it acquires a garbage-collected nfsd_file. The + * final nfsd_file_put allows the open state to linger for a period + * during which it may be re-used. */ #include <linux/hash.h> @@ -33,7 +58,6 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); -static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed); static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); struct nfsd_fcache_disposal { @@ -63,6 +87,7 @@ struct nfsd_file_lookup_key { struct net *net; const struct cred *cred; unsigned char need; + bool gc; enum nfsd_file_lookup_type type; }; @@ -162,6 +187,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, return 1; if (!nfsd_match_cred(nf->nf_cred, key->cred)) return 1; + if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) + return 1; if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) return 1; break; @@ -184,12 +211,9 @@ static const struct rhashtable_params nfsd_file_rhash_params = { static void nfsd_file_schedule_laundrette(void) { - if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) || - test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) - return; - - queue_delayed_work(system_wq, &nfsd_filecache_laundrette, - NFSD_LAUNDRETTE_DELAY); + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) + queue_delayed_work(system_wq, &nfsd_filecache_laundrette, + NFSD_LAUNDRETTE_DELAY); } static void @@ -297,56 +321,28 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) nf->nf_flags = 0; __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + if (key->gc) + __set_bit(NFSD_FILE_GC, &nf->nf_flags); nf->nf_inode = key->inode; - /* nf_ref is pre-incremented for hash table */ - refcount_set(&nf->nf_ref, 2); + refcount_set(&nf->nf_ref, 1); 
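The two lifetimes described in the new header comment correspond to the two acquisition APIs added later in this patch. A minimal consumer sketch (example_op is an illustrative name; the shape mirrors the nfsd3_proc_commit conversion below, with error handling trimmed):

__be32 example_op(struct svc_rqst *rqstp, struct svc_fh *fhp)
{
	struct nfsd_file *nf;
	__be32 status;

	/* Garbage-collected variant: open state may linger for re-use. */
	status = nfsd_file_acquire_gc(rqstp, fhp,
				      NFSD_MAY_WRITE | NFSD_MAY_NOT_BREAK_LEASE,
				      &nf);
	if (status != nfs_ok)
		return status;
	/* ... operate on nf->nf_file ... */
	nfsd_file_put(nf);	/* the final put parks the entry on the LRU */
	return nfs_ok;
}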
nf->nf_may = key->need; nf->nf_mark = NULL; } return nf; } -static bool -nfsd_file_free(struct nfsd_file *nf) -{ - s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime)); - bool flush = false; - - this_cpu_inc(nfsd_file_releases); - this_cpu_add(nfsd_file_total_age, age); - - trace_nfsd_file_put_final(nf); - if (nf->nf_mark) - nfsd_file_mark_put(nf->nf_mark); - if (nf->nf_file) { - get_file(nf->nf_file); - filp_close(nf->nf_file, NULL); - fput(nf->nf_file); - flush = true; - } - - /* - * If this item is still linked via nf_lru, that's a bug. - * WARN and leak it to preserve system stability. - */ - if (WARN_ON_ONCE(!list_empty(&nf->nf_lru))) - return flush; - - call_rcu(&nf->nf_rcu, nfsd_file_slab_free); - return flush; -} - -static bool -nfsd_file_check_writeback(struct nfsd_file *nf) +static void +nfsd_file_fsync(struct nfsd_file *nf) { struct file *file = nf->nf_file; - struct address_space *mapping; + int ret; if (!file || !(file->f_mode & FMODE_WRITE)) - return false; - mapping = file->f_mapping; - return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || - mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); + return; + ret = vfs_fsync(file, 1); + trace_nfsd_file_fsync(nf, ret); + if (ret) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } static int @@ -360,31 +356,6 @@ nfsd_file_check_write_error(struct nfsd_file *nf) } static void -nfsd_file_flush(struct nfsd_file *nf) -{ - struct file *file = nf->nf_file; - - if (!file || !(file->f_mode & FMODE_WRITE)) - return; - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages); - if (vfs_fsync(file, 1) != 0) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); -} - -static void nfsd_file_lru_add(struct nfsd_file *nf) -{ - set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) - trace_nfsd_file_lru_add(nf); -} - -static void nfsd_file_lru_remove(struct nfsd_file *nf) -{ - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) - trace_nfsd_file_lru_del(nf); -} - -static void nfsd_file_hash_remove(struct nfsd_file *nf) { trace_nfsd_file_unhash(nf); @@ -405,68 +376,77 @@ nfsd_file_unhash(struct nfsd_file *nf) return false; } -/* - * Return true if the file was unhashed. - */ -static bool -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose) +static void +nfsd_file_free(struct nfsd_file *nf) { - trace_nfsd_file_unhash_and_dispose(nf); - if (!nfsd_file_unhash(nf)) - return false; - /* keep final reference for nfsd_file_lru_dispose */ - if (refcount_dec_not_one(&nf->nf_ref)) - return true; + s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime)); - nfsd_file_lru_remove(nf); - list_add(&nf->nf_lru, dispose); - return true; -} + trace_nfsd_file_free(nf); -static void -nfsd_file_put_noref(struct nfsd_file *nf) -{ - trace_nfsd_file_put(nf); + this_cpu_inc(nfsd_file_releases); + this_cpu_add(nfsd_file_total_age, age); - if (refcount_dec_and_test(&nf->nf_ref)) { - WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags)); - nfsd_file_lru_remove(nf); - nfsd_file_free(nf); + nfsd_file_unhash(nf); + + /* + * We call fsync here in order to catch writeback errors. It's not + * strictly required by the protocol, but an nfsd_file could get + * evicted from the cache before a COMMIT comes in. If another + * task were to open that file in the interim and scrape the error, + * then the client may never see it. 
By calling fsync here, we ensure + that writeback happens before the entry is freed, and that any + errors reported result in the write verifier changing. + */ + nfsd_file_fsync(nf); + + if (nf->nf_mark) + nfsd_file_mark_put(nf->nf_mark); + if (nf->nf_file) { + get_file(nf->nf_file); + filp_close(nf->nf_file, NULL); + fput(nf->nf_file); } + + /* + * If this item is still linked via nf_lru, that's a bug. + * WARN and leak it to preserve system stability. + */ + if (WARN_ON_ONCE(!list_empty(&nf->nf_lru))) + return; + + call_rcu(&nf->nf_rcu, nfsd_file_slab_free); } -void -nfsd_file_put(struct nfsd_file *nf) +static bool +nfsd_file_check_writeback(struct nfsd_file *nf) { - might_sleep(); + struct file *file = nf->nf_file; + struct address_space *mapping; - nfsd_file_lru_add(nf); - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { - nfsd_file_flush(nf); - nfsd_file_put_noref(nf); - } else if (nf->nf_file) { - nfsd_file_put_noref(nf); - nfsd_file_schedule_laundrette(); - } else - nfsd_file_put_noref(nf); + if (!file || !(file->f_mode & FMODE_WRITE)) + return false; + mapping = file->f_mapping; + return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); } -/** - * nfsd_file_close - Close an nfsd_file - * @nf: nfsd_file to close - * - * If this is the final reference for @nf, free it immediately. - * This reflects an on-the-wire CLOSE or DELEGRETURN into the - * VFS and exported filesystem. - */ -void nfsd_file_close(struct nfsd_file *nf) +static bool nfsd_file_lru_add(struct nfsd_file *nf) { - nfsd_file_put(nf); - if (refcount_dec_if_one(&nf->nf_ref)) { - nfsd_file_unhash(nf); - nfsd_file_lru_remove(nf); - nfsd_file_free(nf); + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) { + trace_nfsd_file_lru_add(nf); + return true; + } + return false; +} + +static bool nfsd_file_lru_remove(struct nfsd_file *nf) +{ + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) { + trace_nfsd_file_lru_del(nf); + return true; } + return false; } struct nfsd_file * @@ -477,36 +457,60 @@ nfsd_file_get(struct nfsd_file *nf) return NULL; } -static void -nfsd_file_dispose_list(struct list_head *dispose) +/** + * nfsd_file_put - put the reference to a nfsd_file + * @nf: nfsd_file of which to put the reference + * + * Put a reference to a nfsd_file. In the non-GC case, we just put the + * reference immediately. In the GC case, if the reference would be + * the last one, then put it on the LRU instead to be cleaned up later. + */ +void +nfsd_file_put(struct nfsd_file *nf) { - struct nfsd_file *nf; + might_sleep(); + trace_nfsd_file_put(nf); - while(!list_empty(dispose)) { - nf = list_first_entry(dispose, struct nfsd_file, nf_lru); - list_del_init(&nf->nf_lru); - nfsd_file_flush(nf); - nfsd_file_put_noref(nf); + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) && + test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + /* + * If this is the last reference (nf_ref == 1), then try to + * transfer it to the LRU. + */ + if (refcount_dec_not_one(&nf->nf_ref)) + return; + + /* Try to add it to the LRU. If that fails, decrement. */ + if (nfsd_file_lru_add(nf)) { + /* If it's still hashed, we're done */ + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_schedule_laundrette(); + return; + } + + /* + * We're racing with unhashing, so try to remove it from + * the LRU. If removal fails, then someone else already + * has our reference. 
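To make the write-verifier reasoning in nfsd_file_free() concrete, the failure it guards against plays out like this (a worked sketch; the verifier values are illustrative):

/*
 *   client WRITE (UNSTABLE)  -> data cached, reply carries verifier A
 *   nfsd_file evicted        -> vfs_fsync() reports an error, so
 *                               nfsd_reset_write_verifier() picks B
 *   client COMMIT            -> reply carries verifier B != A, and the
 *                               client resends its uncommitted WRITEs
 *
 * Without the fsync at free time, the writeback error would be dropped
 * and the client would wrongly assume its data reached stable storage.
 */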
+ */ + if (!nfsd_file_lru_remove(nf)) + return; + } } + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); } static void -nfsd_file_dispose_list_sync(struct list_head *dispose) +nfsd_file_dispose_list(struct list_head *dispose) { - bool flush = false; struct nfsd_file *nf; - while(!list_empty(dispose)) { + while (!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del_init(&nf->nf_lru); - nfsd_file_flush(nf); - if (!refcount_dec_and_test(&nf->nf_ref)) - continue; - if (nfsd_file_free(nf)) - flush = true; + nfsd_file_free(nf); } - if (flush) - flush_delayed_fput(); } static void @@ -562,8 +566,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) * @lock: LRU list lock (unused) * @arg: dispose list * - * Note this can deadlock with nfsd_file_cache_purge. - * * Return values: * %LRU_REMOVED: @item was removed from the LRU * %LRU_ROTATE: @item is to be moved to the LRU tail @@ -578,21 +580,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, struct list_head *head = arg; struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); - /* - * Do a lockless refcount check. The hashtable holds one reference, so - * we look to see if anything else has a reference, or if any have - * been put since the shrinker last ran. Those don't get unhashed and - * released. - * - * Note that in the put path, we set the flag and then decrement the - * counter. Here we check the counter and then test and clear the flag. - * That order is deliberate to ensure that we can do this locklessly. - */ - if (refcount_read(&nf->nf_ref) > 1) { - list_lru_isolate(lru, &nf->nf_lru); - trace_nfsd_file_gc_in_use(nf); - return LRU_REMOVED; - } + /* We should only be dealing with GC entries here */ + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags)); /* * Don't throw out files that are still undergoing I/O or @@ -603,40 +592,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, return LRU_SKIP; } + /* If it was recently added to the list, skip it */ if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) { trace_nfsd_file_gc_referenced(nf); return LRU_ROTATE; } - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - trace_nfsd_file_gc_hashed(nf); - return LRU_SKIP; + /* + * Put the reference held on behalf of the LRU. If it wasn't the last + * one, then just remove it from the LRU and ignore it. + */ + if (!refcount_dec_and_test(&nf->nf_ref)) { + trace_nfsd_file_gc_in_use(nf); + list_lru_isolate(lru, &nf->nf_lru); + return LRU_REMOVED; } + /* Refcount went to zero. Unhash it and queue it to the dispose list */ + nfsd_file_unhash(nf); list_lru_isolate_move(lru, &nf->nf_lru, head); this_cpu_inc(nfsd_file_evictions); trace_nfsd_file_gc_disposed(nf); return LRU_REMOVED; } -/* - * Unhash items on @dispose immediately, then queue them on the - * disposal workqueue to finish releasing them in the background. - * - * cel: Note that between the time list_lru_shrink_walk runs and - * now, these items are in the hash table but marked unhashed. - * Why release these outside of lru_cb ? There's no lock ordering - * problem since lru_cb currently takes no lock. 
- */ -static void nfsd_file_gc_dispose_list(struct list_head *dispose) -{ - struct nfsd_file *nf; - - list_for_each_entry(nf, dispose, nf_lru) - nfsd_file_hash_remove(nf); - nfsd_file_dispose_list_delayed(dispose); -} - static void nfsd_file_gc(void) { @@ -646,14 +625,15 @@ nfsd_file_gc(void) ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &dispose, list_lru_count(&nfsd_file_lru)); trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru)); - nfsd_file_gc_dispose_list(&dispose); + nfsd_file_dispose_list_delayed(&dispose); } static void nfsd_file_gc_worker(struct work_struct *work) { nfsd_file_gc(); - nfsd_file_schedule_laundrette(); + if (list_lru_count(&nfsd_file_lru)) + nfsd_file_schedule_laundrette(); } static unsigned long @@ -671,7 +651,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &dispose); trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru)); - nfsd_file_gc_dispose_list(&dispose); + nfsd_file_dispose_list_delayed(&dispose); return ret; } @@ -681,75 +661,112 @@ static struct shrinker nfsd_file_shrinker = { .seeks = 1, }; -/* - * Find all cache items across all net namespaces that match @inode and - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire(). +/** + * nfsd_file_queue_for_close: try to close out any open nfsd_files for an inode + * @inode: inode on which to close out nfsd_files + * @dispose: list on which to gather nfsd_files to close out + * + * An nfsd_file represents a struct file being held open on behalf of nfsd. An + * open file however can block other activity (such as leases), or cause + * undesirable behavior (e.g. spurious silly-renames when reexporting NFS). + * + * This function is intended to find open nfsd_files when this sort of + * conflicting access occurs and then attempt to close those files out. + * + * Populates the dispose list with entries that have already had their + * refcounts go to zero. The actual free of an nfsd_file can be expensive, + * so we leave it up to the caller whether it wants to wait or not. */ -static unsigned int -__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose) +static void +nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) { struct nfsd_file_lookup_key key = { .type = NFSD_FILE_KEY_INODE, .inode = inode, }; - unsigned int count = 0; struct nfsd_file *nf; rcu_read_lock(); do { + int decrement = 1; + nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, nfsd_file_rhash_params); if (!nf) break; - nfsd_file_unhash_and_dispose(nf, dispose); - count++; + + /* If we raced with someone else unhashing, ignore it */ + if (!nfsd_file_unhash(nf)) + continue; + + /* If we can't get a reference, ignore it */ + if (!nfsd_file_get(nf)) + continue; + + /* Extra decrement if we remove from the LRU */ + if (nfsd_file_lru_remove(nf)) + ++decrement; + + /* If refcount goes to 0, then put on the dispose list */ + if (refcount_sub_and_test(decrement, &nf->nf_ref)) { + list_add(&nf->nf_lru, dispose); + trace_nfsd_file_closing(nf); + } } while (1); rcu_read_unlock(); - return count; } /** - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file * @inode: inode of the file to attempt to remove * - * Unhash and put, then flush and fput all cache items associated with @inode. + * Close out any open nfsd_files that can be reaped for @inode. 
The + * actual freeing is deferred to the dispose_list_delayed infrastructure. + * + * This is used by the fsnotify callbacks and setlease notifier. */ -void -nfsd_file_close_inode_sync(struct inode *inode) +static void +nfsd_file_close_inode(struct inode *inode) { LIST_HEAD(dispose); - unsigned int count; - count = __nfsd_file_close_inode(inode, &dispose); - trace_nfsd_file_close_inode_sync(inode, count); - nfsd_file_dispose_list_sync(&dispose); + nfsd_file_queue_for_close(inode, &dispose); + nfsd_file_dispose_list_delayed(&dispose); } /** - * nfsd_file_close_inode - attempt a delayed close of a nfsd_file + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file * @inode: inode of the file to attempt to remove * - * Unhash and put all cache item associated with @inode. + * Close out any open nfsd_files that can be reaped for @inode. The + * nfsd_files are closed out synchronously. + * + * This is called from nfsd_rename and nfsd_unlink to avoid silly-renames + * when reexporting NFS. */ -static void -nfsd_file_close_inode(struct inode *inode) +void +nfsd_file_close_inode_sync(struct inode *inode) { + struct nfsd_file *nf; LIST_HEAD(dispose); - unsigned int count; - count = __nfsd_file_close_inode(inode, &dispose); - trace_nfsd_file_close_inode(inode, count); - nfsd_file_dispose_list_delayed(&dispose); + trace_nfsd_file_close(inode); + + nfsd_file_queue_for_close(inode, &dispose); + while (!list_empty(&dispose)) { + nf = list_first_entry(&dispose, struct nfsd_file, nf_lru); + list_del_init(&nf->nf_lru); + nfsd_file_free(nf); + } + flush_delayed_fput(); } /** * nfsd_file_delayed_close - close unused nfsd_files * @work: dummy * - * Walk the LRU list and close any entries that have not been used since + * Walk the LRU list and destroy any entries that have not been used since * the last scan. - * - * Note this can deadlock with nfsd_file_cache_purge. */ static void nfsd_file_delayed_close(struct work_struct *work) @@ -770,7 +787,7 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, /* Only close files for F_SETLEASE leases */ if (fl->fl_flags & FL_LEASE) - nfsd_file_close_inode_sync(file_inode(fl->fl_file)); + nfsd_file_close_inode(file_inode(fl->fl_file)); return 0; } @@ -891,8 +908,12 @@ out_err: goto out; } -/* - * Note this can deadlock with nfsd_file_lru_cb. +/** + * __nfsd_file_cache_purge: clean out the cache for shutdown + * @net: net-namespace to shut down the cache (may be NULL) + * + * Walk the nfsd_file cache and close out any that match @net. If @net is NULL, + * then close out everything. Called when an nfsd instance is being shut down. */ static void __nfsd_file_cache_purge(struct net *net) @@ -900,7 +921,6 @@ __nfsd_file_cache_purge(struct net *net) struct rhashtable_iter iter; struct nfsd_file *nf; LIST_HEAD(dispose); - bool del; rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); do { @@ -908,16 +928,11 @@ __nfsd_file_cache_purge(struct net *net) nf = rhashtable_walk_next(&iter); while (!IS_ERR_OR_NULL(nf)) { - if (net && nf->nf_net != net) - continue; - del = nfsd_file_unhash_and_dispose(nf, &dispose); - - /* - * Deadlock detected! Something marked this entry as - * unhased, but hasn't removed it from the hash list. 
- */ - WARN_ON_ONCE(!del); - + if (!net || nf->nf_net == net) { + nfsd_file_unhash(nf); + nfsd_file_lru_remove(nf); + list_add(&nf->nf_lru, &dispose); + } nf = rhashtable_walk_next(&iter); } @@ -1023,7 +1038,6 @@ nfsd_file_cache_shutdown(void) per_cpu(nfsd_file_acquisitions, i) = 0; per_cpu(nfsd_file_releases, i) = 0; per_cpu(nfsd_file_total_age, i) = 0; - per_cpu(nfsd_file_pages_flushed, i) = 0; per_cpu(nfsd_file_evictions, i) = 0; } } @@ -1057,16 +1071,19 @@ nfsd_file_is_cached(struct inode *inode) static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf, bool open) + unsigned int may_flags, struct nfsd_file **pnf, + bool open, bool want_gc) { struct nfsd_file_lookup_key key = { .type = NFSD_FILE_KEY_FULL, .need = may_flags & NFSD_FILE_MAY_MASK, .net = SVC_NET(rqstp), + .gc = want_gc, }; - struct nfsd_file *nf, *new; - bool retry = true; + bool open_retry = true; + struct nfsd_file *nf; __be32 status; + int ret; status = fh_verify(rqstp, fhp, S_IFREG, may_flags|NFSD_MAY_OWNER_OVERRIDE); @@ -1076,35 +1093,38 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, key.cred = get_current_cred(); retry: - /* Avoid allocation if the item is already in cache */ - nf = rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params); + rcu_read_lock(); + nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, + nfsd_file_rhash_params); if (nf) nf = nfsd_file_get(nf); - if (nf) + rcu_read_unlock(); + + if (nf) { + if (nfsd_file_lru_remove(nf)) + WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref)); goto wait_for_construction; + } - new = nfsd_file_alloc(&key, may_flags); - if (!new) { + nf = nfsd_file_alloc(&key, may_flags); + if (!nf) { status = nfserr_jukebox; goto out_status; } - nf = rhashtable_lookup_get_insert_key(&nfsd_file_rhash_tbl, - &key, &new->nf_rhash, - nfsd_file_rhash_params); - if (!nf) { - nf = new; - goto open_file; - } - if (IS_ERR(nf)) - goto insert_err; - nf = nfsd_file_get(nf); - if (nf == NULL) { - nf = new; + ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, + &key, &nf->nf_rhash, + nfsd_file_rhash_params); + if (likely(ret == 0)) goto open_file; - } - nfsd_file_slab_free(&new->nf_rcu); + + nfsd_file_slab_free(&nf->nf_rcu); + nf = NULL; + if (ret == -EEXIST) + goto retry; + trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); + status = nfserr_jukebox; + goto out_status; wait_for_construction: wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); @@ -1112,16 +1132,16 @@ wait_for_construction: /* Did construction of this file fail? */ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); - if (!retry) { + if (!open_retry) { status = nfserr_jukebox; goto out; } - retry = false; - nfsd_file_put_noref(nf); + open_retry = false; + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); goto retry; } - nfsd_file_lru_remove(nf); this_cpu_inc(nfsd_file_cache_hits); status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags)); @@ -1131,7 +1151,8 @@ out: this_cpu_inc(nfsd_file_acquisitions); *pnf = nf; } else { - nfsd_file_put(nf); + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); nf = NULL; } @@ -1157,20 +1178,36 @@ open_file: * If construction failed, or we raced with a call to unlink() * then unhash. 
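Construction of a new cache entry is serialized by the NFSD_FILE_PENDING bit; the release side appears just below, and the wait side sits at wait_for_construction above. Isolating the two sides of the handshake (all calls appear in the surrounding code):

/* Inserter: publish the result, release PENDING, wake any waiters. */
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();	/* order the flag update before the wakeup */
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);

/* Waiter: sleep until construction finishes, then test NFSD_FILE_HASHED
 * to learn whether it succeeded. */
wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);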
*/ - if (status != nfs_ok || key.inode->i_nlink == 0) - if (nfsd_file_unhash(nf)) - nfsd_file_put_noref(nf); + if (status == nfs_ok && key.inode->i_nlink == 0) + status = nfserr_jukebox; + if (status != nfs_ok) + nfsd_file_unhash(nf); clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); smp_mb__after_atomic(); wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); goto out; +} -insert_err: - nfsd_file_slab_free(&new->nf_rcu); - trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, PTR_ERR(nf)); - nf = NULL; - status = nfserr_jukebox; - goto out_status; +/** + * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * The nfsd_file object returned by this API is reference-counted + * and garbage-collected. The object is retained for a few + * seconds after the final nfsd_file_put() in case the caller + * wants to re-use it. + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. + */ +__be32 +nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, true); } /** @@ -1180,6 +1217,10 @@ insert_err: * @may_flags: NFSD_MAY_ settings for the file * @pnf: OUT: new or found "struct nfsd_file" object * + * The nfsd_file object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). + * * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in * network byte order is returned. */ @@ -1187,7 +1228,7 @@ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, false); } /** @@ -1197,6 +1238,10 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, * @may_flags: NFSD_MAY_ settings for the file * @pnf: OUT: new or found "struct nfsd_file" object * + * The nfsd_file object returned by this API is reference-counted + * but not garbage-collected. The object is released immediately + * one RCU grace period after the final nfsd_file_put(). + * * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in * network byte order is returned. */ @@ -1204,7 +1249,7 @@ __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, false); } /* @@ -1212,9 +1257,9 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, * scraping this file for info should test the labels to ensure they're * getting the correct field. 
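The statistics reported below are kept in per-CPU counters and only summed when the stats file is read; the aggregation idiom, in isolation:

unsigned long hits = 0;
int i;

for_each_possible_cpu(i)
	hits += per_cpu(nfsd_file_cache_hits, i);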
*/ -static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) +int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { - unsigned long releases = 0, pages_flushed = 0, evictions = 0; + unsigned long releases = 0, evictions = 0; unsigned long hits = 0, acquisitions = 0; unsigned int i, count = 0, buckets = 0; unsigned long lru = 0, total_age = 0; @@ -1242,7 +1287,6 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) releases += per_cpu(nfsd_file_releases, i); total_age += per_cpu(nfsd_file_total_age, i); evictions += per_cpu(nfsd_file_evictions, i); - pages_flushed += per_cpu(nfsd_file_pages_flushed, i); } seq_printf(m, "total entries: %u\n", count); @@ -1256,11 +1300,5 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "mean age (ms): %ld\n", total_age / releases); else seq_printf(m, "mean age (ms): -\n"); - seq_printf(m, "pages flushed: %lu\n", pages_flushed); return 0; } - -int nfsd_file_cache_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, nfsd_file_cache_stats_show, NULL); -} diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 8e8c0c47d67d..b7efb2c3ddb1 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -38,6 +38,7 @@ struct nfsd_file { #define NFSD_FILE_HASHED (0) #define NFSD_FILE_PENDING (1) #define NFSD_FILE_REFERENCED (2) +#define NFSD_FILE_GC (3) unsigned long nf_flags; struct inode *nf_inode; /* don't deref */ refcount_t nf_ref; @@ -52,13 +53,14 @@ void nfsd_file_cache_shutdown(void); int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); -void nfsd_file_close(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); -int nfsd_file_cache_stats_open(struct inode *, struct file *); +int nfsd_file_cache_stats_show(struct seq_file *m, void *v); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index 070f90ed09b6..3ca5304440ff 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -15,6 +15,7 @@ #include "flexfilelayoutxdr.h" #include "pnfs.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ffe17743cc74..8c854ba3285b 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -192,6 +192,10 @@ struct nfsd_net { atomic_t nfs4_client_count; int nfs4_max_clients; + + atomic_t nfsd_courtesy_clients; + struct shrinker nfsd_client_shrinker; + struct delayed_work nfsd_shrinker_work; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 9edd3c1a30fb..1457f59f447a 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -55,7 +55,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) goto out; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. 
*/ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -69,7 +69,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { resp->status = nfserrno(PTR_ERR(acl)); goto fail; @@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); if (error) goto out_drop_lock; @@ -246,7 +246,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; struct inode *inode; - int w; if (!svcxdr_encode_stat(xdr, resp->status)) return false; @@ -260,15 +259,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return false; - rqstp->rq_res.page_len = w = nfsacl_size( - (resp->mask & NFS_ACL) ? resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); - while (w > 0) { - if (!*(rqstp->rq_next_page++)) - return true; - w -= PAGE_SIZE; - } - if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, resp->mask & NFS_ACL, 0)) return false; @@ -331,6 +321,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, @@ -342,6 +333,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_encode = nfsaclsvc_encode_getaclres, .pc_release = nfsaclsvc_release_getacl, .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_argzero = sizeof(struct nfsd3_getaclargs), .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), @@ -353,6 +345,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_argzero = sizeof(struct nfsd3_setaclargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, @@ -364,6 +357,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, @@ -375,6 +369,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_encode = nfsaclsvc_encode_accessres, .pc_release = nfsaclsvc_release_access, .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_argzero = sizeof(struct nfsd3_accessargs), .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9446c6743664..647108138e8a 100644 --- a/fs/nfsd/nfs3acl.c +++ 
b/fs/nfsd/nfs3acl.c @@ -47,7 +47,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -61,7 +61,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { resp->status = nfserrno(PTR_ERR(acl)); goto fail; @@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); out_drop_lock: @@ -171,11 +171,7 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; - struct kvec *head = rqstp->rq_res.head; struct inode *inode; - unsigned int base; - int n; - int w; if (!svcxdr_encode_nfsstat3(xdr, resp->status)) return false; @@ -187,26 +183,12 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return false; - base = (char *)xdr->p - (char *)head->iov_base; - - rqstp->rq_res.page_len = w = nfsacl_size( - (resp->mask & NFS_ACL) ? resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? 
resp->acl_default : NULL); - while (w > 0) { - if (!*(rqstp->rq_next_page++)) - return false; - w -= PAGE_SIZE; - } - - n = nfsacl_encode(&rqstp->rq_res, base, inode, - resp->acl_access, - resp->mask & NFS_ACL, 0); - if (n > 0) - n = nfsacl_encode(&rqstp->rq_res, base + n, inode, - resp->acl_default, - resp->mask & NFS_DFACL, - NFS_ACL_DEFAULT); - if (n <= 0) + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, + resp->mask & NFS_ACL, 0)) + return false; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT)) return false; break; default: @@ -252,6 +234,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, @@ -263,6 +246,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_encode = nfs3svc_encode_getaclres, .pc_release = nfs3svc_release_getacl, .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_argzero = sizeof(struct nfsd3_getaclargs), .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), @@ -274,6 +258,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_encode = nfs3svc_encode_setaclres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_argzero = sizeof(struct nfsd3_setaclargs), .pc_ressize = sizeof(struct nfsd3_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index a41cca619338..d01b29aba662 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -13,6 +13,7 @@ #include "cache.h" #include "xdr3.h" #include "vfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -150,7 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - u32 max_blocksize = svc_max_payload(rqstp); unsigned int len; int v; @@ -159,7 +159,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) (unsigned long) argp->count, (unsigned long long) argp->offset); - argp->count = min_t(u32, argp->count, max_blocksize); + argp->count = min_t(u32, argp->count, svc_max_payload(rqstp)); + argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); if (argp->offset > (u64)OFFSET_MAX) argp->offset = (u64)OFFSET_MAX; if (argp->offset + argp->count > (u64)OFFSET_MAX) @@ -563,25 +564,18 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, { struct xdr_buf *buf = &resp->dirlist; struct xdr_stream *xdr = &resp->xdr; - - count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp)); + unsigned int sendbuf = min_t(unsigned int, rqstp->rq_res.buflen, + svc_max_payload(rqstp)); memset(buf, 0, sizeof(*buf)); /* Reserve room for the NULL ptr & eof flag (-2 words) */ - buf->buflen = count - XDR_UNIT * 2; + buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), sendbuf); + buf->buflen -= XDR_UNIT * 2; buf->pages = rqstp->rq_next_page; rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; - /* This is xdr_init_encode(), but it assumes that - * the head kvec has already been consumed. 
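The dirlist buffer sizing above bounds the READDIR reply three ways: by the client's count, by the transport's maximum payload, and by the space left in the reply buffer. A worked example with assumed values:

/* Assume: count = 1048576 (client), svc_max_payload() = 262144,
 * rqstp->rq_res.buflen = 266240.
 *
 *   sendbuf = min(266240, 262144)        = 262144
 *   buflen  = clamp(1048576, 8, 262144)  = 262144
 *   buflen -= XDR_UNIT * 2               = 262136
 *
 * The 8 reserved bytes hold the NULL entry pointer and the eof flag. */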
*/ - xdr_set_scratch_buffer(xdr, NULL, 0); - xdr->buf = buf; - xdr->page_ptr = buf->pages; - xdr->iov = NULL; - xdr->p = page_address(*buf->pages); - xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); - xdr->rqst = NULL; + xdr_init_encode_pages(xdr, buf, buf->pages, NULL); } /* @@ -770,6 +764,7 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) { struct nfsd3_commitargs *argp = rqstp->rq_argp; struct nfsd3_commitres *resp = rqstp->rq_resp; + struct nfsd_file *nf; dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", SVCFH_fmt(&argp->fh), @@ -777,8 +772,14 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) (unsigned long long) argp->offset); fh_copy(&resp->fh, &argp->fh); - resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset, + resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (resp->status) + goto out; + resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset, argp->count, resp->verf); + nfsd_file_put(nf); +out: return rpc_success; } @@ -808,6 +809,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, @@ -819,6 +821,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_getattrres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_attrstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, @@ -830,6 +833,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_sattrargs), + .pc_argzero = sizeof(struct nfsd3_sattrargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, @@ -841,6 +845,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_lookupres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+pAT+pAT, @@ -852,6 +857,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_accessres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_argzero = sizeof(struct nfsd3_accessargs), .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1, @@ -863,6 +869,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readlinkres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, @@ -874,6 +881,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readargs), + .pc_argzero = sizeof(struct nfsd3_readargs), .pc_ressize = sizeof(struct nfsd3_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, @@ -885,6 +893,7 @@ static const struct svc_procedure 
nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_writeres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_writeargs), + .pc_argzero = sizeof(struct nfsd3_writeargs), .pc_ressize = sizeof(struct nfsd3_writeres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+4, @@ -896,6 +905,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_createargs), + .pc_argzero = sizeof(struct nfsd3_createargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, @@ -907,6 +917,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_mkdirargs), + .pc_argzero = sizeof(struct nfsd3_mkdirargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, @@ -918,6 +929,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_symlinkargs), + .pc_argzero = sizeof(struct nfsd3_symlinkargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, @@ -929,6 +941,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_mknodargs), + .pc_argzero = sizeof(struct nfsd3_mknodargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, @@ -940,6 +953,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, @@ -951,6 +965,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, @@ -962,6 +977,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_renameres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_renameargs), + .pc_argzero = sizeof(struct nfsd3_renameargs), .pc_ressize = sizeof(struct nfsd3_renameres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+WC, @@ -973,6 +989,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_linkres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_linkargs), + .pc_argzero = sizeof(struct nfsd3_linkargs), .pc_ressize = sizeof(struct nfsd3_linkres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+pAT+WC, @@ -984,6 +1001,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readdirres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readdirargs), + .pc_argzero = sizeof(struct nfsd3_readdirargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, .pc_name = "READDIR", @@ -994,6 +1012,7 @@ static const struct 
svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readdirres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readdirplusargs), + .pc_argzero = sizeof(struct nfsd3_readdirplusargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, .pc_name = "READDIRPLUS", @@ -1003,6 +1022,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_fsstatres, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_fsstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+2*6+1, @@ -1013,6 +1033,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_fsinfores, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_fsinfores), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+12, @@ -1023,6 +1044,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_pathconfres, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_pathconfres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+6, @@ -1034,6 +1056,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_commitres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_commitargs), + .pc_argzero = sizeof(struct nfsd3_commitargs), .pc_ressize = sizeof(struct nfsd3_commitres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+WC+2, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 0293b8d65f10..3308dd671ef0 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -571,10 +571,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) args->count = max_blocksize; args->len = max_blocksize; } - if (!xdr_stream_subsegment(xdr, &args->payload, args->count)) - return false; - return true; + return xdr_stream_subsegment(xdr, &args->payload, args->count); } bool @@ -616,8 +614,6 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_symlinkargs *args = rqstp->rq_argp; struct kvec *head = rqstp->rq_arg.head; - struct kvec *tail = rqstp->rq_arg.tail; - size_t remaining; if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen)) return false; @@ -626,16 +622,10 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) return false; - /* request sanity */ - remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; - remaining -= xdr_stream_pos(xdr); - if (remaining < xdr_align_size(args->tlen)) - return false; - - args->first.iov_base = xdr->p; + /* symlink_data */ args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); - - return true; + args->first.iov_base = xdr_inline_decode(xdr, args->tlen); + return args->first.iov_base != NULL; } bool diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index bb8e2f6d7d03..518203821790 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -135,7 +135,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, unsigned int flags = 0; int size = 0; - pacl = get_acl(inode, ACL_TYPE_ACCESS); + pacl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (!pacl) pacl = 
posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -147,7 +147,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; - dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(dpacl)) { error = PTR_ERR(dpacl); goto rel_pacl; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4ce328209f61..2a815f5a52c4 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -76,6 +76,17 @@ static __be32 *xdr_encode_empty_array(__be32 *p) * 1 Protocol" */ +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ + WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0); +} + +static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, + size_t len) +{ + WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); +} + /* * nfs_cb_opnum4 * @@ -329,6 +340,24 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, } /* + * CB_RECALLANY4args + * + * struct CB_RECALLANY4args { + * uint32_t craa_objects_to_keep; + * bitmap4 craa_type_mask; + * }; + */ +static void +encode_cb_recallany4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra) +{ + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY); + encode_uint32(xdr, ra->ra_keep); + encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval)); + hdr->nops++; +} + +/* * CB_SEQUENCE4args * * struct CB_SEQUENCE4args { @@ -482,6 +511,26 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, encode_cb_nops(&hdr); } +/* + * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static void +nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfsd4_cb_recall_any *ra; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb); + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recallany4args(xdr, &hdr, ra); + encode_cb_nops(&hdr); +} /* * NFSv4.0 and NFSv4.1 XDR decode functions @@ -520,6 +569,28 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } +/* + * 20.6. 
Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static int +nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status); + return status; +} + #ifdef CONFIG_NFSD_PNFS /* * CB_LAYOUTRECALL4args @@ -783,6 +854,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { #endif PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), + PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; @@ -870,7 +942,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r } else { struct cred *kcred; - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); if (!kcred) return NULL; @@ -916,7 +988,6 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c } else { if (!conn->cb_xprt) return -EINVAL; - clp->cl_cb_conn.cb_xprt = conn->cb_xprt; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; @@ -936,6 +1007,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c rpc_shutdown_client(client); return -ENOMEM; } + + if (clp->cl_minorversion != 0) + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; clp->cl_cb_client = client; clp->cl_cb_cred = cred; rcu_read_lock(); @@ -1371,11 +1445,21 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_holds_slot = false; } -void nfsd4_run_cb(struct nfsd4_callback *cb) +/** + * nfsd4_run_cb - queue up a callback job to run + * @cb: callback to queue + * + * Kick off a callback to do its thing. Returns false if it was already + * on a queue, true otherwise. + */ +bool nfsd4_run_cb(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; + bool queued; nfsd41_cb_inflight_begin(clp); - if (!nfsd4_queue_cb(cb)) + queued = nfsd4_queue_cb(cb); + if (!queued) nfsd41_cb_inflight_end(clp); + return queued; } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index f92161ce1f97..5e9809aff37e 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -41,6 +41,7 @@ #include "idmap.h" #include "nfsd.h" #include "netns.h" +#include "vfs.h" /* * Turn off idmapping when using AUTH_SYS. 
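The new CB_RECALL_ANY encode/decode pair above follows the pattern of the other callback operations in this file: the encode helpers wrap xdr_stream_encode_u32() and xdr_stream_encode_uint32_array() in WARN_ON_ONCE() because the callback send buffer is preallocated, so a short encode signals a programming error rather than a runtime condition. Below is the wire layout implied by encode_cb_recallany4args(), plus a hypothetical caller; the RCA4_* bit number is an assumption taken from the NFSv4.1 spec, not from this diff, and cl_ra is the per-client slot allocated later in this diff:

/*
 * CB_RECALL_ANY4args as encoded above:
 *   opcode                4 bytes    OP_CB_RECALL_ANY
 *   craa_objects_to_keep  4 bytes    ra->ra_keep
 *   craa_type_mask        4 + 4*n    bitmap4: word count n, then
 *                                    n mask words from ra->ra_bmval[]
 */
static void sketch_recall_read_delegs(struct nfs4_client *clp, u32 keep)
{
        struct nfsd4_cb_recall_any *ra = clp->cl_ra;

        ra->ra_keep = keep;
        ra->ra_bmval[0] = BIT(0);    /* RCA4_TYPE_MASK_RDATA_DLG, assumed */
        nfsd4_run_cb(&ra->ra_cb);    /* queue the CB_COMPOUND */
}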
@@ -82,8 +83,8 @@ ent_init(struct cache_head *cnew, struct cache_head *citm) new->id = itm->id; new->type = itm->type; - strlcpy(new->name, itm->name, sizeof(new->name)); - strlcpy(new->authname, itm->authname, sizeof(new->authname)); + strscpy(new->name, itm->name, sizeof(new->name)); + strscpy(new->authname, itm->authname, sizeof(new->authname)); } static void @@ -548,7 +549,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; - strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item); if (ret == -ENOENT) return nfserr_badowner; @@ -584,7 +585,7 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr, int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) return encode_ascii_id(xdr, id); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 2c05692a9abf..3564d1c6f610 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -658,7 +658,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) ktime_t now, cutoff; const struct nfsd4_layout_ops *ops; - + trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task); switch (task->tk_status) { case 0: case -NFS4ERR_DELAY: diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a72ab97f77ef..bd880d55f565 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -141,7 +141,6 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src) static __be32 do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode) { - __be32 status; if (open->op_truncate && !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) @@ -156,9 +155,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs if (open->op_share_deny & NFS4_SHARE_DENY_READ) accmode |= NFSD_MAY_WRITE; - status = fh_verify(rqstp, current_fh, S_IFREG, accmode); - - return status; + return fh_verify(rqstp, current_fh, S_IFREG, accmode); } static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) @@ -454,7 +451,6 @@ static __be32 do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { struct svc_fh *current_fh = &cstate->current_fh; - __be32 status; int accmode = 0; /* We don't know the target directory, and therefore can not @@ -479,9 +475,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, str if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH) accmode = NFSD_MAY_OWNER_OVERRIDE; - status = do_open_permission(rqstp, current_fh, open, accmode); - - return status; + return do_open_permission(rqstp, current_fh, open, accmode); } static void @@ -668,11 +662,9 @@ static __be32 nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - __be32 status; - fh_put(&cstate->current_fh); - status = exp_pseudoroot(rqstp, &cstate->current_fh); - return status; + + return exp_pseudoroot(rqstp, &cstate->current_fh); } static __be32 @@ -739,10 +731,19 @@ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_commit *commit = 
&u->commit; + struct nfsd_file *nf; + __be32 status; - return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + status = nfsd_file_acquire(rqstp, &cstate->current_fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (status != nfs_ok) + return status; + + status = nfsd_commit(rqstp, &cstate->current_fh, nf, commit->co_offset, commit->co_count, (__be32 *)commit->co_verf.data); + nfsd_file_put(nf); + return status; } static __be32 @@ -942,12 +943,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, &read->rd_nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); - goto out; - } - status = nfs_ok; -out: + read->rd_rqstp = rqstp; read->rd_fhp = &cstate->current_fh; return status; @@ -1116,10 +1112,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, WR_STATE, NULL, NULL); - if (status) { - dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + if (status) return status; - } } err = fh_want_write(&cstate->current_fh); if (err) @@ -1141,6 +1135,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 0, (time64_t)0); if (!status) status = nfserrno(attrs.na_labelerr); + if (!status) + status = nfserrno(attrs.na_aclerr); out: nfsd_attrs_free(&attrs); fh_drop_write(&cstate->current_fh); @@ -1167,10 +1163,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, stateid, WR_STATE, &nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + if (status) return status; - } write->wr_how_written = write->wr_stable_how; @@ -1201,17 +1195,13 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, src_stateid, RD_STATE, src, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + if (status) goto out; - } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, dst_stateid, WR_STATE, dst, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + if (status) goto out_put_src; - } /* fix up for NFS-specific error code */ if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || @@ -1343,7 +1333,7 @@ try_again: return 0; } if (work) { - strlcpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1); + strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1); refcount_set(&work->nsui_refcnt, 2); work->nsui_busy = true; list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); @@ -1471,13 +1461,6 @@ out_err: return status; } -static void -nfsd4_interssc_disconnect(struct vfsmount *ss_mnt) -{ - nfs_do_sb_deactive(ss_mnt->mnt_sb); - mntput(ss_mnt); -} - /* * Verify COPY destination stateid. 
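nfsd4_commit() above and the NFSv3 COMMIT change earlier in this diff apply the same refactor: nfsd_commit() now takes a pinned struct nfsd_file from its caller instead of opening the file itself. Reduced to its essentials, the calling pattern both procs now share looks like this (a sketch with error labels elided; the v3 path uses the nfsd_file_acquire_gc() variant):

        __be32 status;
        struct nfsd_file *nf;

        status = nfsd_file_acquire(rqstp, fhp,
                                   NFSD_MAY_WRITE | NFSD_MAY_NOT_BREAK_LEASE,
                                   &nf);
        if (status != nfs_ok)
                return status;
        status = nfsd_commit(rqstp, fhp, nf, offset, count, verf);
        nfsd_file_put(nf);
        return status;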
* @@ -1580,11 +1563,6 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp, { } -static void -nfsd4_interssc_disconnect(struct vfsmount *ss_mnt) -{ -} - static struct file *nfs42_ssc_open(struct vfsmount *ss_mnt, struct nfs_fh *src_fh, nfs4_stateid *stateid) @@ -1621,6 +1599,10 @@ static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, struct rpc_task *task) { + struct nfsd4_cb_offload *cbo = + container_of(cb, struct nfsd4_cb_offload, co_cb); + + trace_nfsd_cb_offload_done(&cbo->co_res.cb_stateid, task); return 1; } @@ -1648,6 +1630,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; int status; + loff_t end; /* See RFC 7862 p.67: */ if (bytes_total == 0) @@ -1667,8 +1650,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, /* for a non-zero asynchronous copy do a commit of data */ if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) { since = READ_ONCE(dst->f_wb_err); - status = vfs_fsync_range(dst, copy->cp_dst_pos, - copy->cp_res.wr_bytes_written, 0); + end = copy->cp_dst_pos + copy->cp_res.wr_bytes_written - 1; + status = vfs_fsync_range(dst, copy->cp_dst_pos, end, 0); if (!status) status = filemap_check_wb_err(dst->f_mapping, since); if (!status) @@ -1768,8 +1751,14 @@ static int nfsd4_do_async_copy(void *data) filp = nfs42_ssc_open(copy->ss_mnt, ©->c_fh, ©->stateid); if (IS_ERR(filp)) { - nfserr = nfserr_offload_denied; - nfsd4_interssc_disconnect(copy->ss_mnt); + switch (PTR_ERR(filp)) { + case -EBADF: + nfserr = nfserr_wrong_type; + break; + default: + nfserr = nfserr_offload_denied; + } + /* ss_mnt will be unmounted by the laundromat */ goto do_callback; } nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file, @@ -1826,7 +1815,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfs4_init_copy_state(nn, copy)) goto out_err; refcount_set(&async_copy->refcount, 1); - memcpy(©->cp_res.cb_stateid, ©->cp_stateid.stid, + memcpy(©->cp_res.cb_stateid, ©->cp_stateid.cs_stid, sizeof(copy->cp_res.cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, @@ -1850,8 +1839,10 @@ out_err: if (async_copy) cleanup_async_copy(async_copy); status = nfserrno(-ENOMEM); - if (nfsd4_ssc_is_inter(copy)) - nfsd4_interssc_disconnect(copy->ss_mnt); + /* + * source's vfsmount of inter-copy will be unmounted + * by the laundromat + */ goto out; } @@ -1862,7 +1853,7 @@ find_async_copy(struct nfs4_client *clp, stateid_t *stateid) spin_lock(&clp->async_lock); list_for_each_entry(copy, &clp->async_copies, copies) { - if (memcmp(©->cp_stateid.stid, stateid, NFS4_STATEID_SIZE)) + if (memcmp(©->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE)) continue; refcount_inc(©->refcount); spin_unlock(&clp->async_lock); @@ -1916,7 +1907,7 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cps = nfs4_alloc_init_cpntf_state(nn, stid); if (!cps) goto out; - memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.stid, sizeof(stateid_t)); + memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.cs_stid, sizeof(stateid_t)); memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t)); memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t)); @@ -1946,10 +1937,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, 
WR_STATE, &nf, NULL); - if (status != nfs_ok) { - dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + if (status != nfs_ok) return status; - } status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, @@ -2005,10 +1994,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, RD_STATE, &nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + if (status) return status; - } switch (seek->seek_whence) { case NFS4_CONTENT_DATA: @@ -2633,9 +2620,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) status = nfserr_minor_vers_mismatch; if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0) goto out; - status = nfserr_resource; - if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND) - goto out; status = nfs41_check_op_ordering(args); if (status) { @@ -2648,10 +2632,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) rqstp->rq_lease_breaker = (void **)&cstate->clp; - trace_nfsd_compound(rqstp, args->opcnt); + trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; + if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) { + /* If there are still more operations to process, + * stop here and report NFS4ERR_RESOURCE. */ + if (cstate->minorversion == 0 && + args->client_opcnt > resp->opcnt) { + op->status = nfserr_resource; + goto encode_op; + } + } + /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -2727,8 +2721,8 @@ encode_op: status = op->status; } - trace_nfsd_compound_status(args->opcnt, resp->opcnt, status, - nfsd4_op_name(op->opnum)); + trace_nfsd_compound_status(args->client_opcnt, resp->opcnt, + status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); nfsd4_increment_op_stats(op->opnum); @@ -2762,28 +2756,49 @@ out: #define op_encode_channel_attrs_maxsz (6 + 1 + 1) -static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +/* + * The _rsize() helpers are invoked by the NFSv4 COMPOUND decoder, which + * is called before sunrpc sets rq_res.buflen. Thus we have to compute + * the maximum payload size here, based on transport limits and the size + * of the remaining space in the rq_pages array. 
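The reworked COMPOUND loop above changes how over-long compounds fail: instead of rejecting the whole request up front when the client's op count exceeds NFSD_MAX_OPS_PER_COMPOUND, the server executes the ops that fit and, for NFSv4.0 only, answers the op at the cap with NFS4ERR_RESOURCE when more remain (args->client_opcnt is the count the client claimed; resp->opcnt is how far execution got). A worked example, assuming a cap of 16 for illustration:

/*
 * Assume NFSD_MAX_OPS_PER_COMPOUND == 16 and a v4.0 client sends a
 * 20-op compound. Ops 1..15 execute normally; when resp->opcnt
 * reaches 16 with client_opcnt (20) > 16, op 16 gets
 * op->status = nfserr_resource and control jumps to encode_op, so
 * the reply carries 15 real results plus one RESOURCE status
 * instead of an outright rejected compound. Sessions (v4.1+)
 * negotiate ca_maxoperations instead, so they bypass this branch.
 */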
+ */ +static u32 nfsd4_max_payload(const struct svc_rqst *rqstp) +{ + u32 buflen; + + buflen = (rqstp->rq_page_end - rqstp->rq_next_page) * PAGE_SIZE; + buflen -= rqstp->rq_auth_slack; + buflen -= rqstp->rq_res.head[0].iov_len; + return min_t(u32, buflen, svc_max_payload(rqstp)); +} + +static u32 nfsd4_only_status_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size) * sizeof(__be32); } -static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_status_stateid_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); } -static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_access_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { /* ac_supported, ac_resp_access */ return (op_encode_hdr_size + 2)* sizeof(__be32); } -static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_commit_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_create_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); @@ -2794,17 +2809,17 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op * the op prematurely if the estimate is too large. We may turn off splice * reads unnecessarily. */ -static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_getattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 *bmap = op->u.getattr.ga_bmval; + const u32 *bmap = op->u.getattr.ga_bmval; u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2]; u32 ret = 0; if (bmap0 & FATTR4_WORD0_ACL) - return svc_max_payload(rqstp); + return nfsd4_max_payload(rqstp); if (bmap0 & FATTR4_WORD0_FS_LOCATIONS) - return svc_max_payload(rqstp); + return nfsd4_max_payload(rqstp); if (bmap1 & FATTR4_WORD1_OWNER) { ret += IDMAP_NAMESZ + 4; @@ -2832,24 +2847,28 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, return ret; } -static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_getfh_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE; } -static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_link_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_lock_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_lock_denied_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_open_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_stateid_maxsz + op_encode_change_info_maxsz + 1 @@ -2857,20 +2876,18 @@ static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) + op_encode_delegation_maxsz) * sizeof(__be32); } -static inline u32 
nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.read.rd_length, maxcount); + u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = svc_max_payload(rqstp); - u32 rlen = min(op->u.read.rd_length, maxcount); + u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp)); /* * If we detect that the file changed during hole encoding, then we * recover by encoding the remaining reply as data. This means we need @@ -2881,70 +2898,77 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_readdir_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.readdir.rd_maxcount, maxcount); + u32 rlen = min(op->u.readdir.rd_maxcount, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + op_encode_verifier_maxsz + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_readlink_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE; } -static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_remove_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_rename_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_sequence_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); } -static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_test_stateid_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids) * sizeof(__be32); } -static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_setattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_secinfo_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR * (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32); } -static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_setclientid_rsize(const struct svc_rqst *rqstp, + const 
struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * sizeof(__be32); } -static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_write_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_exchange_id_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ 1 + 1 + /* eir_flags, spr_how */\ @@ -2958,14 +2982,16 @@ static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_o 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); } -static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_bind_conn_to_session_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); } -static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_create_session_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ @@ -2974,7 +3000,8 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_copy_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* wr_callback */ + @@ -2986,16 +3013,16 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1 /* cr_synchronous */) * sizeof(__be32); } -static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_offload_status_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 /* osr_count */ + 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32); } -static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 3 /* cnr_lease_time */ + @@ -3010,12 +3037,10 @@ static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp, } #ifdef CONFIG_NFSD_PNFS -static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount); + u32 rlen = min(op->u.getdeviceinfo.gd_maxcount, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 1 /* gd_layout_type*/ + @@ -3028,7 +3053,8 @@ static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4 * so we need to define an arbitrary upper bound here. 
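All of these _rsize() estimators count 4-byte XDR words and convert to bytes with the trailing sizeof(__be32); the variable-length ones now clamp against nfsd4_max_payload(), defined above, so the estimate reflects the pages actually remaining on this request rather than the transport-wide maximum. A worked instance for READ under the definitions visible here (op_encode_hdr_size is assumed to be 2 words):

/*
 * nfsd4_read_rsize() with rd_length = 9000 and a 1 MB payload cap:
 *   rlen = min(9000, 1048576)         = 9000 bytes
 *   XDR_QUADLEN(9000) = (9000 + 3)/4  = 2251 words
 *   (2 + 2 + 2251) * sizeof(__be32)   = 9020 bytes reserved
 */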
*/ #define MAX_LAYOUT_SIZE 128 -static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutget_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* logr_return_on_close */ + @@ -3037,14 +3063,16 @@ static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op MAX_LAYOUT_SIZE) * sizeof(__be32); } -static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutcommit_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* locr_newsize */ + 2 /* ns_size */) * sizeof(__be32); } -static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutreturn_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* lrs_stateid */ + @@ -3053,41 +3081,36 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_ #endif /* CONFIG_NFSD_PNFS */ -static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_seek_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 3) * sizeof(__be32); } -static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_getxattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount, rlen; - - maxcount = svc_max_payload(rqstp); - rlen = min_t(u32, XATTR_SIZE_MAX, maxcount); + u32 rlen = min_t(u32, XATTR_SIZE_MAX, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_setxattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_listxattrs_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount, rlen; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount); + u32 rlen = min(op->u.listxattrs.lsxa_maxcount, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_removexattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); @@ -3576,6 +3599,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 1, @@ -3586,6 +3610,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_decode = nfs4svc_decode_compoundargs, .pc_encode = nfs4svc_encode_compoundres, .pc_argsize = sizeof(struct nfsd4_compoundargs), + .pc_argzero = offsetof(struct nfsd4_compoundargs, iops), .pc_ressize = sizeof(struct nfsd4_compoundres), .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index c634483d85d2..78b8cd9651d5 100644 --- a/fs/nfsd/nfs4recover.c +++ 
b/fs/nfsd/nfs4recover.c @@ -266,7 +266,7 @@ struct nfs4_dir_ctx { struct list_head names; }; -static int +static bool nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -275,14 +275,14 @@ nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, struct name_list *entry; if (namlen != HEXDIR_LEN - 1) - return 0; + return true; entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); if (entry == NULL) - return -ENOMEM; + return false; memcpy(entry->name, name, HEXDIR_LEN - 1); entry->name[HEXDIR_LEN - 1] = '\0'; list_add(&entry->list, &ctx->names); - return 0; + return true; } static int @@ -807,16 +807,18 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, if (get_user(namelen, &ci->cc_name.cn_len)) return -EFAULT; name.data = memdup_user(&ci->cc_name.cn_id, namelen); - if (IS_ERR_OR_NULL(name.data)) - return -EFAULT; + if (IS_ERR(name.data)) + return PTR_ERR(name.data); name.len = namelen; get_user(princhashlen, &ci->cc_princhash.cp_len); if (princhashlen > 0) { princhash.data = memdup_user( &ci->cc_princhash.cp_data, princhashlen); - if (IS_ERR_OR_NULL(princhash.data)) - return -EFAULT; + if (IS_ERR(princhash.data)) { + kfree(name.data); + return PTR_ERR(princhash.data); + } princhash.len = princhashlen; } else princhash.len = 0; @@ -827,8 +829,8 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, if (get_user(namelen, &cnm->cn_len)) return -EFAULT; name.data = memdup_user(&cnm->cn_id, namelen); - if (IS_ERR_OR_NULL(name.data)) - return -EFAULT; + if (IS_ERR(name.data)) + return PTR_ERR(name.data); name.len = namelen; } if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c5d199d7e6b4..7b2ee535ade8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -44,7 +44,9 @@ #include <linux/jhash.h> #include <linux/string_helpers.h> #include <linux/fsnotify.h> +#include <linux/rhashtable.h> #include <linux/nfs_ssc.h> + #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -84,6 +86,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) static void nfs4_free_ol_stateid(struct nfs4_stid *stid); void nfsd4_end_grace(struct nfsd_net *nn); static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); +static void nfsd4_file_hash_remove(struct nfs4_file *fi); /* Locking: */ @@ -160,6 +163,13 @@ static bool is_client_expired(struct nfs4_client *clp) return clp->cl_time == 0; } +static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn, + struct nfs4_client *clp) +{ + if (clp->cl_state != NFSD4_ACTIVE) + atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0); +} + static __be32 get_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -169,6 +179,7 @@ static __be32 get_client_locked(struct nfs4_client *clp) if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_rpc_users); + nfsd4_dec_courtesy_client_count(nn, clp); clp->cl_state = NFSD4_ACTIVE; return nfs_ok; } @@ -190,6 +201,7 @@ renew_client_locked(struct nfs4_client *clp) list_move_tail(&clp->cl_lru, &nn->client_lru); clp->cl_time = ktime_get_boottime_seconds(); + nfsd4_dec_courtesy_client_count(nn, clp); clp->cl_state = NFSD4_ACTIVE; } @@ -357,6 +369,8 @@ nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb) static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { + 
trace_nfsd_cb_notify_lock_done(&zero_stateid, task); + /* * Since this is just an optimization, we don't try very hard if it * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and @@ -577,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) void put_nfs4_file(struct nfs4_file *fi) { - might_lock(&state_lock); - - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { - hlist_del_rcu(&fi->fi_hash); - spin_unlock(&state_lock); + if (refcount_dec_and_test(&fi->fi_ref)) { + nfsd4_file_hash_remove(fi); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); @@ -664,15 +675,26 @@ find_any_file(struct nfs4_file *f) return ret; } -static struct nfsd_file *find_deleg_file(struct nfs4_file *f) +static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) +{ + lockdep_assert_held(&f->fi_lock); + + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return NULL; +} + +static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f) { - struct nfsd_file *ret = NULL; + lockdep_assert_held(&f->fi_lock); - spin_lock(&f->fi_lock); if (f->fi_deleg_file) - ret = nfsd_file_get(f->fi_deleg_file); - spin_unlock(&f->fi_lock); - return ret; + return f->fi_deleg_file; + return NULL; } static atomic_long_t num_delegations; @@ -695,19 +717,20 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) return ret & OWNER_HASH_MASK; } -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; -static unsigned int file_hashval(struct svc_fh *fh) -{ - struct inode *inode = d_inode(fh->fh_dentry); +static const struct rhashtable_params nfs4_file_rhash_params = { + .key_len = sizeof_field(struct nfs4_file, fi_inode), + .key_offset = offsetof(struct nfs4_file, fi_inode), + .head_offset = offsetof(struct nfs4_file, fi_rlist), - /* XXX: why not (here & in file cache) use inode? */ - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); -} - -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; + /* + * Start with a single page hash table to reduce resizing churn + * on light workloads. + */ + .min_size = 256, + .automatic_shrinking = true, +}; /* * Check if courtesy clients have conflicting access and resolve it if possible @@ -820,9 +843,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - nfsd_file_close(f1); + nfsd_file_put(f1); if (f2) - nfsd_file_close(f2); + nfsd_file_put(f2); } } @@ -963,19 +986,19 @@ out_free: * Create a unique stateid_t to represent each COPY. 
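The rhltable introduced above replaces the fixed 256-bucket file_hashtbl, and the key choice matters: key_len/key_offset cover the fi_inode pointer itself rather than a hash of i_ino, so every nfs4_file that shares an inode — including aliased filehandles for the same file — hashes to one rhlist, and fh_match() disambiguates, as the lookup and insert paths later in this diff show. The lookup idiom, condensed into a sketch mirroring that later code:

        struct rhlist_head *list, *tmp;
        struct nfs4_file *fi, *found = NULL;

        rcu_read_lock();
        list = rhltable_lookup(&nfs4_file_rhltable, &inode,
                               nfs4_file_rhash_params);
        rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
                /* same inode; the filehandle decides which entry is ours */
                if (fh_match(&fi->fi_fhandle, &fhp->fh_handle) &&
                    refcount_inc_not_zero(&fi->fi_ref)) {
                        found = fi;
                        break;
                }
        }
        rcu_read_unlock();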
*/ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, - unsigned char sc_type) + unsigned char cs_type) { int new_id; - stid->stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; - stid->stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; - stid->sc_type = sc_type; + stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; + stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; + stid->cs_type = cs_type; idr_preload(GFP_KERNEL); spin_lock(&nn->s2s_cp_lock); new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT); - stid->stid.si_opaque.so_id = new_id; - stid->stid.si_generation = 1; + stid->cs_stid.si_opaque.so_id = new_id; + stid->cs_stid.si_generation = 1; spin_unlock(&nn->s2s_cp_lock); idr_preload_end(); if (new_id < 0) @@ -997,7 +1020,7 @@ struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, if (!cps) return NULL; cps->cpntf_time = ktime_get_boottime_seconds(); - refcount_set(&cps->cp_stateid.sc_count, 1); + refcount_set(&cps->cp_stateid.cs_count, 1); if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID)) goto out_free; spin_lock(&nn->s2s_cp_lock); @@ -1013,11 +1036,11 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy) { struct nfsd_net *nn; - WARN_ON_ONCE(copy->cp_stateid.sc_type != NFS4_COPY_STID); + WARN_ON_ONCE(copy->cp_stateid.cs_type != NFS4_COPY_STID); nn = net_generic(copy->cp_clp->net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); idr_remove(&nn->s2s_cp_stateids, - copy->cp_stateid.stid.si_opaque.so_id); + copy->cp_stateid.cs_stid.si_opaque.so_id); spin_unlock(&nn->s2s_cp_lock); } @@ -1049,6 +1072,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) static void nfs4_free_deleg(struct nfs4_stid *stid) { + struct nfs4_delegation *dp = delegstateid(stid); + + WARN_ON_ONCE(!list_empty(&stid->sc_cp_list)); + WARN_ON_ONCE(!list_empty(&dp->dl_perfile)); + WARN_ON_ONCE(!list_empty(&dp->dl_perclnt)); + WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru)); kmem_cache_free(deleg_slab, stid); atomic_long_dec(&num_delegations); } @@ -1338,6 +1367,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + trace_nfsd_stid_revoke(&dp->dl_stid); + if (clp->cl_minorversion) { dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); @@ -1462,6 +1493,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); + WARN_ON(!list_empty(&stid->sc_cp_list)); kmem_cache_free(stateid_slab, stid); } @@ -1801,13 +1833,12 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, int numslots = fattrs->maxreqs; int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; - int mem, i; + int i; - BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) - + sizeof(struct nfsd4_session) > PAGE_SIZE); - mem = numslots * sizeof(struct nfsd4_slot *); + BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION) + > PAGE_SIZE); - new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL); if (!new) return NULL; /* allocate each struct nfsd4_slot and data cache in one piece */ @@ -2113,6 +2144,7 @@ static void __free_client(struct kref *k) kfree(clp->cl_nii_domain.data); kfree(clp->cl_nii_name.data); idr_destroy(&clp->cl_stateids); + kfree(clp->cl_ra); kmem_cache_free(client_slab, clp); } @@ -2233,6 +2265,7 @@ __destroy_client(struct nfs4_client *clp) 
if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); atomic_add_unless(&nn->nfs4_client_count, -1, 0); + nfsd4_dec_courtesy_client_count(nn, clp); free_client(clp); wake_up_all(&expiry_wq); } @@ -2478,7 +2511,7 @@ static const char *cb_state2str(int state) static int client_info_show(struct seq_file *m, void *v) { - struct inode *inode = m->private; + struct inode *inode = file_inode(m->file); struct nfs4_client *clp; u64 clid; @@ -2518,17 +2551,7 @@ static int client_info_show(struct seq_file *m, void *v) return 0; } -static int client_info_open(struct inode *inode, struct file *file) -{ - return single_open(file, client_info_show, inode); -} - -static const struct file_operations client_info_fops = { - .open = client_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(client_info); static void *states_start(struct seq_file *s, loff_t *pos) __acquires(&clp->cl_lock) @@ -2604,9 +2627,11 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - file = find_any_file(nf); + + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2628,8 +2653,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } @@ -2643,9 +2668,10 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - file = find_any_file(nf); + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2665,8 +2691,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } @@ -2678,9 +2704,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; - file = find_deleg_file(nf); + spin_lock(&nf->fi_lock); + file = find_deleg_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2696,8 +2723,8 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_fname(s, file); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } @@ -2845,6 +2872,37 @@ static const struct tree_descr client_files[] = { [3] = {""}, }; +static int +nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + trace_nfsd_cb_recall_any_done(cb, task); + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); + put_client_renew_locked(clp); + spin_unlock(&nn->client_lock); +} + +static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { + .done = nfsd4_cb_recall_any_done, + .release = nfsd4_cb_recall_any_release, +}; + static struct nfs4_client *create_client(struct xdr_netobj 
name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -2882,6 +2940,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name, free_client(clp); return NULL; } + clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL); + if (!clp->cl_ra) { + free_client(clp); + return NULL; + } + clp->cl_ra_time = 0; + nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops, + NFSPROC4_CLNT_CB_RECALL_ANY); return clp; } @@ -4251,11 +4317,9 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, - struct nfs4_file *fp) -{ - lockdep_assert_held(&state_lock); +static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp) +{ refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); @@ -4273,7 +4337,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, INIT_LIST_HEAD(&fp->fi_lo_states); atomic_set(&fp->fi_lo_recalls, 0); #endif - hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } void @@ -4337,7 +4400,29 @@ out: return -ENOMEM; } -void nfsd4_init_leases_net(struct nfsd_net *nn) +static unsigned long +nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int count; + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_client_shrinker); + + count = atomic_read(&nn->nfsd_courtesy_clients); + if (!count) + count = atomic_long_read(&num_delegations); + if (count) + mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); + return (unsigned long)count; +} + +static unsigned long +nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + return SHRINK_STOP; +} + +int +nfsd4_init_leases_net(struct nfsd_net *nn) { struct sysinfo si; u64 max_clients; @@ -4346,8 +4431,8 @@ void nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->track_reclaim_completes = false; - nn->clverifier_counter = prandom_u32(); - nn->clientid_base = prandom_u32(); + nn->clverifier_counter = get_random_u32(); + nn->clientid_base = get_random_u32(); nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; @@ -4356,6 +4441,18 @@ void nfsd4_init_leases_net(struct nfsd_net *nn) max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024); max_clients *= NFS4_CLIENTS_PER_GB; nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); + + atomic_set(&nn->nfsd_courtesy_clients, 0); + nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; + nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; + return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"); +} + +void +nfsd4_leases_net_shutdown(struct nfsd_net *nn) +{ + unregister_shrinker(&nn->nfsd_client_shrinker); } static void init_nfs4_replay(struct nfs4_replay *rp) @@ -4626,71 +4723,80 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) nfs4_put_stid(&last->st_stid); } -/* search file_hashtbl[] for file */ -static struct nfs4_file * -find_file_locked(struct svc_fh *fh, unsigned int hashval) +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - struct nfs4_file *fp; + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; + struct nfs4_file *fi; - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { - if 
(fh_match(&fp->fi_fhandle, &fh->fh_handle)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - return fp; + rcu_read_lock(); + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) { + rcu_read_unlock(); + return fi; + } } } + rcu_read_unlock(); return NULL; } -static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, - unsigned int hashval) +/* + * On hash insertion, identify entries with the same inode but + * distinct filehandles. They will all be on the list returned + * by rhltable_lookup(). + * + * inode->i_lock prevents racing insertions from adding an entry + * for the same inode/fhp pair twice. + */ +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) { - struct nfs4_file *fp; + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *ret = NULL; bool alias_found = false; + struct nfs4_file *fi; + int err; - spin_lock(&state_lock); - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { - if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - ret = fp; - } else if (d_inode(fh->fh_dentry) == fp->fi_inode) - fp->fi_aliased = alias_found = true; - } - if (likely(ret == NULL)) { - nfsd4_init_file(fh, hashval, new); - new->fi_aliased = alias_found; - ret = new; + rcu_read_lock(); + spin_lock(&inode->i_lock); + + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) + ret = fi; + } else + fi->fi_aliased = alias_found = true; } - spin_unlock(&state_lock); - return ret; -} + if (ret) + goto out_unlock; -static struct nfs4_file * find_file(struct svc_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); + nfsd4_file_init(fhp, new); + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, + nfs4_file_rhash_params); + if (err) + goto out_unlock; - rcu_read_lock(); - fp = find_file_locked(fh, hashval); + new->fi_aliased = alias_found; + ret = new; + +out_unlock: + spin_unlock(&inode->i_lock); rcu_read_unlock(); - return fp; + return ret; } -static struct nfs4_file * -find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) +static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); - - rcu_read_lock(); - fp = find_file_locked(fh, hashval); - rcu_read_unlock(); - if (fp) - return fp; - - return insert_file(new, fh, hashval); + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, + nfs4_file_rhash_params); } /* @@ -4703,9 +4809,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) struct nfs4_file *fp; __be32 ret = nfs_ok; - fp = find_file(current_fh); + fp = nfsd4_file_hash_lookup(current_fh); if (!fp) return ret; + /* Check for conflicting share reservations */ spin_lock(&fp->fi_lock); if (fp->fi_share_deny & deny_type) @@ -4715,6 +4822,35 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) return ret; } +static bool nfsd4_deleg_present(const struct inode *inode) +{ + struct file_lock_context *ctx = locks_inode_context(inode); + + return ctx && !list_empty_careful(&ctx->flc_lease); +} + +/** + * 
nfsd_wait_for_delegreturn - wait for delegations to be returned + * @rqstp: the RPC transaction being executed + * @inode: in-core inode of the file being waited for + * + * The timeout prevents deadlock if all nfsd threads happen to be + * tied up waiting for returning delegations. + * + * Return values: + * %true: delegation was returned + * %false: timed out waiting for delegreturn + */ +bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode) +{ + long __maybe_unused timeo; + + timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode), + NFSD_DELEGRETURN_TIMEOUT); + trace_nfsd_delegret_wakeup(rqstp, inode, timeo); + return timeo > 0; +} + static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) { struct nfs4_delegation *dp = cb_to_delegation(cb); @@ -4743,6 +4879,8 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, { struct nfs4_delegation *dp = cb_to_delegation(cb); + trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); + if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID || dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) return 1; @@ -4788,18 +4926,17 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the - * i_lock) we know the server hasn't removed the lease yet, and + * flc_lock) we know the server hasn't removed the lease yet, and * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); - nfsd4_run_cb(&dp->dl_recall); + WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall)); } -/* Called from break_lease() with i_lock held. */ +/* Called from break_lease() with flc_lock held. */ static bool nfsd_break_deleg_cb(struct file_lock *fl) { - bool ret = false; struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; struct nfs4_client *clp = dp->dl_stid.sc_client; @@ -4825,7 +4962,7 @@ nfsd_break_deleg_cb(struct file_lock *fl) fp->fi_had_conflict = true; nfsd_break_one_deleg(dp); spin_unlock(&fp->fi_lock); - return ret; + return false; } /** @@ -5311,6 +5448,7 @@ nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp, if (err) return -EAGAIN; + exp_put(exp); dput(child); if (child != file_dentry(fp->fi_deleg_file->nf_file)) return -EAGAIN; @@ -5548,7 +5686,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * and check for delegations in the process of being recalled. 
* If not found, create the nfs4_file struct */ - fp = find_or_add_file(open->op_file, current_fh); + fp = nfsd4_file_hash_insert(open->op_file, current_fh); + if (unlikely(!fp)) + return nfserr_jukebox; if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; @@ -5825,7 +5965,7 @@ nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { nf = stp->st_stid.sc_file; - ctx = nf->fi_inode->i_flctx; + ctx = locks_inode_context(nf->fi_inode); if (!ctx) continue; if (locks_owner_has_blockers(ctx, lo)) @@ -5878,8 +6018,11 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, goto exp_client; if (!state_expired(lt, clp->cl_time)) break; - if (!atomic_read(&clp->cl_rpc_users)) + if (!atomic_read(&clp->cl_rpc_users)) { + if (clp->cl_state == NFSD4_ACTIVE) + atomic_inc(&nn->nfsd_courtesy_clients); clp->cl_state = NFSD4_COURTESY; + } if (!client_has_state(clp)) goto exp_client; if (!nfs4_anylock_blockers(clp)) @@ -5894,10 +6037,49 @@ exp_client: spin_unlock(&nn->client_lock); } +static void +nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn, + struct list_head *reaplist) +{ + unsigned int maxreap = 0, reapcnt = 0; + struct list_head *pos, *next; + struct nfs4_client *clp; + + maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; + INIT_LIST_HEAD(reaplist); + + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state == NFSD4_ACTIVE) + break; + if (reapcnt >= maxreap) + break; + if (!mark_client_expired_locked(clp)) { + list_add(&clp->cl_lru, reaplist); + reapcnt++; + } + } + spin_unlock(&nn->client_lock); +} + +static void +nfs4_process_client_reaplist(struct list_head *reaplist) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + + list_for_each_safe(pos, next, reaplist) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + trace_nfsd_clid_purged(&clp->cl_clientid); + list_del_init(&clp->cl_lru); + expire_client(clp); + } +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { - struct nfs4_client *clp; struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct nfs4_ol_stateid *stp; @@ -5920,18 +6102,14 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&nn->s2s_cp_lock); idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); - if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID && + if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID && state_expired(&lt, cps->cpntf_time)) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); nfs4_get_client_reaplist(nn, &reaplist, &lt); - list_for_each_safe(pos, next, &reaplist) { - clp = list_entry(pos, struct nfs4_client, cl_lru); - trace_nfsd_clid_purged(&clp->cl_clientid); - list_del_init(&clp->cl_lru); - expire_client(clp); - } + nfs4_process_client_reaplist(&reaplist); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); @@ -6014,6 +6192,65 @@ laundromat_main(struct work_struct *laundry) queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } +static void +courtesy_client_reaper(struct nfsd_net *nn) +{ + struct list_head reaplist; + + nfs4_get_courtesy_client_reaplist(nn, &reaplist); + nfs4_process_client_reaplist(&reaplist); +} + +static void +deleg_reaper(struct nfsd_net *nn) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + struct list_head cblist; +
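/*
 * Descriptive note on the loop that follows: deleg_reaper() selects
 * only clients that are NFSD4_ACTIVE, hold at least one delegation,
 * have no delegation currently in recall, have no CB_RECALL_ANY
 * callback already in flight, and were last sent one at least five
 * seconds ago. Each selected client is pinned by bumping cl_rpc_users,
 * which nfsd4_cb_recall_any_release() later drops.
 */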
INIT_LIST_HEAD(&cblist); + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state != NFSD4_ACTIVE || + list_empty(&clp->cl_delegations) || + atomic_read(&clp->cl_delegs_in_recall) || + test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || + (ktime_get_boottime_seconds() - + clp->cl_ra_time < 5)) { + continue; + } + list_add(&clp->cl_ra_cblist, &cblist); + + /* release in nfsd4_cb_recall_any_release */ + atomic_inc(&clp->cl_rpc_users); + set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); + clp->cl_ra_time = ktime_get_boottime_seconds(); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&cblist)) { + clp = list_first_entry(&cblist, struct nfs4_client, + cl_ra_cblist); + list_del_init(&clp->cl_ra_cblist); + clp->cl_ra->ra_keep = 0; + clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + trace_nfsd_cb_recall_any(clp->cl_ra); + nfsd4_run_cb(&clp->cl_ra->ra_cb); + } +} + +static void +nfsd4_state_shrinker_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + nfsd_shrinker_work); + + courtesy_client_reaper(nn); + deleg_reaper(nn); +} + static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) @@ -6149,6 +6386,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; + struct nfs4_stid *stid; bool return_revoked = false; /* @@ -6171,15 +6409,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, } if (status) return status; - *s = find_stateid_by_type(cstate->clp, stateid, typemask); - if (!*s) + stid = find_stateid_by_type(cstate->clp, stateid, typemask); + if (!stid) return nfserr_bad_stateid; - if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { - nfs4_put_stid(*s); + if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + nfs4_put_stid(stid); if (cstate->minorversion) return nfserr_deleg_revoked; return nfserr_bad_stateid; } + *s = stid; return nfs_ok; } @@ -6244,12 +6483,12 @@ out: static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) { - WARN_ON_ONCE(cps->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID); - if (!refcount_dec_and_test(&cps->cp_stateid.sc_count)) + WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID); + if (!refcount_dec_and_test(&cps->cp_stateid.cs_count)) return; list_del(&cps->cp_list); idr_remove(&nn->s2s_cp_stateids, - cps->cp_stateid.stid.si_opaque.so_id); + cps->cp_stateid.cs_stid.si_opaque.so_id); kfree(cps); } /* @@ -6271,12 +6510,12 @@ __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, if (cps_t) { state = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); - if (state->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID) { + if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) { state = NULL; goto unlock; } if (!clp) - refcount_inc(&state->cp_stateid.sc_count); + refcount_inc(&state->cp_stateid.cs_count); else _free_cpntf_state_locked(nn, state); } @@ -6684,6 +6923,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) struct nfs4_client *clp = s->st_stid.sc_client; bool unhashed; LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); @@ -6692,6 +6932,8 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) if (unhashed) 
put_ol_stateid_locked(s, &reaplist); spin_unlock(&clp->cl_lock); + list_for_each_entry(stp, &reaplist, st_locks) + nfs4_free_cpntf_statelist(clp->net, &stp->st_stid); free_ol_stateid_reaplist(&reaplist); } else { spin_unlock(&clp->cl_lock); @@ -6775,6 +7017,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto put_stateid; + trace_nfsd_deleg_return(stateid); + wake_up_var(d_inode(cstate->current_fh.fh_dentry)); destroy_delegation(dp); put_stateid: nfs4_put_stid(&dp->dl_stid); @@ -7585,7 +7829,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } inode = locks_inode(nf->nf_file); - flctx = inode->i_flctx; + flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); @@ -7830,6 +8074,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); + INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); return 0; @@ -7905,10 +8150,16 @@ nfs4_state_start(void) { int ret; - ret = nfsd4_create_callback_queue(); + ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params); if (ret) return ret; + ret = nfsd4_create_callback_queue(); + if (ret) { + rhltable_destroy(&nfs4_file_rhltable); + return ret; + } + set_max_delegations(); return 0; } @@ -7939,6 +8190,7 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); + rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1e9690a061ec..2b4ae858c89b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -42,6 +42,8 @@ #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> #include <linux/xattr.h> +#include <linux/vmalloc.h> + #include <uapi/linux/xattr.h> #include "idmap.h" @@ -768,16 +770,18 @@ nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, - struct nfsd4_access *access) + union nfsd4_op_u *u) { + struct nfsd4_access *access = &u->access; if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0) return nfserr_bad_xdr; return nfs_ok; } static __be32 -nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +nfsd4_decode_close(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_close *close = &u->close; if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0) return nfserr_bad_xdr; return nfsd4_decode_stateid4(argp, &close->cl_stateid); @@ -785,20 +789,24 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) static __be32 -nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_commit *commit = &u->commit; if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) return nfserr_bad_xdr; + memset(&commit->co_verf, 0, sizeof(commit->co_verf)); return nfs_ok; } static __be32 -nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; __be32 *p, status; + memset(create, 0, sizeof(*create)); if 
(xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0) return nfserr_bad_xdr; switch (create->cr_type) { @@ -840,21 +848,26 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create } static inline __be32 -nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_delegreturn *dr = &u->delegreturn; return nfsd4_decode_stateid4(argp, &dr->dr_stateid); } static inline __be32 -nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_getattr *getattr = &u->getattr; + memset(getattr, 0, sizeof(*getattr)); return nfsd4_decode_bitmap4(argp, getattr->ga_bmval, ARRAY_SIZE(getattr->ga_bmval)); } static __be32 -nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +nfsd4_decode_link(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_link *link = &u->link; + memset(link, 0, sizeof(*link)); return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); } @@ -901,8 +914,10 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) } static __be32 -nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lock *lock = &u->lock; + memset(lock, 0, sizeof(*lock)); if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0) return nfserr_bad_xdr; if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) @@ -917,8 +932,10 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) } static __be32 -nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lockt *lockt = &u->lockt; + memset(lockt, 0, sizeof(*lockt)); if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0) return nfserr_bad_xdr; if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) @@ -932,8 +949,9 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) } static __be32 -nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_locku *locku = &u->locku; __be32 status; if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0) @@ -954,8 +972,9 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) } static __be32 -nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lookup *lookup = &u->lookup; return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len); } @@ -1135,16 +1154,14 @@ nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +nfsd4_decode_open(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_open *open = &u->open; __be32 status; u32 dummy; - memset(open->op_bmval, 0, sizeof(open->op_bmval)); - open->op_iattr.ia_valid = 0; - open->op_openowner = NULL; + memset(open, 0, sizeof(*open)); - open->op_xdr_error = 0; if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0) return nfserr_bad_xdr; /* deleg_want is ignored */ 
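The hunks above and below repeat one mechanical conversion: each NFSv4 decoder now takes a common union nfsd4_op_u * argument instead of an op-specific struct pointer, and fully initializes its result (the added memset calls) instead of leaving fields to the caller. With a single shared prototype, the nfsd4_dec_ops dispatch table later in this file can store the functions directly, with no function-pointer casts, so every indirect call goes through a pointer of the correct type. A minimal user-space sketch of the pattern follows; the names (op_u, decode_access, decode_close) are hypothetical stand-ins, not the kernel's actual types:

#include <stdio.h>
#include <string.h>

struct op_access { unsigned int req; };
struct op_close  { unsigned int seqid; };

/* One union covers every op's argument struct, as nfsd4_op_u does. */
union op_u {
	struct op_access access;
	struct op_close  close;
};

/* Single shared prototype for all decoders. */
typedef int (*op_decoder)(const char *wire, union op_u *u);

static int decode_access(const char *wire, union op_u *u)
{
	struct op_access *a = &u->access;

	memset(a, 0, sizeof(*a));	/* decoder fully initializes its result */
	return sscanf(wire, "%u", &a->req) == 1 ? 0 : -1;
}

static int decode_close(const char *wire, union op_u *u)
{
	struct op_close *c = &u->close;

	memset(c, 0, sizeof(*c));
	return sscanf(wire, "%u", &c->seqid) == 1 ? 0 : -1;
}

/* The table needs no casts: every entry already has op_decoder's type. */
static const op_decoder decoders[] = {
	decode_access,
	decode_close,
};

int main(void)
{
	union op_u u;

	if (decoders[1]("42", &u) == 0)
		printf("close.seqid = %u\n", u.close.seqid);
	return 0;
}

Calling a function through a pointer of a mismatched type is undefined behavior in C and trips control-flow-integrity checking, which is presumably why this series removes the casts rather than keeping the per-op prototypes.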
@@ -1166,8 +1183,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) } static __be32 -nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_open_confirm *open_conf = &u->open_confirm; __be32 status; if (argp->minorversion >= 1) @@ -1179,14 +1198,19 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0) return nfserr_bad_xdr; + memset(&open_conf->oc_resp_stateid, 0, + sizeof(open_conf->oc_resp_stateid)); return nfs_ok; } static __be32 -nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_open_downgrade *open_down = &u->open_downgrade; __be32 status; + memset(open_down, 0, sizeof(*open_down)); status = nfsd4_decode_stateid4(argp, &open_down->od_stateid); if (status) return status; @@ -1201,8 +1225,9 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d } static __be32 -nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_putfh *putfh = &u->putfh; __be32 *p; if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0) @@ -1216,11 +1241,12 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) if (!putfh->pf_fhval) return nfserr_jukebox; + putfh->no_verify = false; return nfs_ok; } static __be32 -nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { if (argp->minorversion == 0) return nfs_ok; @@ -1228,10 +1254,12 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) } static __be32 -nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; __be32 status; + memset(read, 0, sizeof(*read)); status = nfsd4_decode_stateid4(argp, &read->rd_stateid); if (status) return status; @@ -1244,10 +1272,12 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) } static __be32 -nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_readdir *readdir = &u->readdir; __be32 status; + memset(readdir, 0, sizeof(*readdir)); if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0) return nfserr_bad_xdr; status = nfsd4_decode_verifier4(argp, &readdir->rd_verf); @@ -1265,16 +1295,20 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read } static __be32 -nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_remove *remove = &u->remove; + memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo)); return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen); } static __be32 -nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_rename *rename = &u->rename; __be32 status; + memset(rename, 0, 
sizeof(*rename)); status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen); if (status) return status; @@ -1282,23 +1316,28 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename } static __be32 -nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + clientid_t *clientid = &u->renew; return nfsd4_decode_clientid4(argp, clientid); } static __be32 nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo *secinfo = &u->secinfo; + secinfo->si_exp = NULL; return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen); } static __be32 -nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; __be32 status; + memset(setattr, 0, sizeof(*setattr)); status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid); if (status) return status; @@ -1309,10 +1348,13 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta } static __be32 -nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setclientid *setclientid = &u->setclientid; __be32 *p, status; + memset(setclientid, 0, sizeof(*setclientid)); + if (argp->minorversion >= 1) return nfserr_notsupp; @@ -1350,8 +1392,10 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient } static __be32 -nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_setclientid_confirm *scd_c = &u->setclientid_confirm; __be32 status; if (argp->minorversion >= 1) @@ -1365,10 +1409,13 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s /* Also used for NVERIFY */ static __be32 -nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_verify *verify = &u->verify; __be32 *p, status; + memset(verify, 0, sizeof(*verify)); + status = nfsd4_decode_bitmap4(argp, verify->ve_bmval, ARRAY_SIZE(verify->ve_bmval)); if (status) @@ -1390,8 +1437,9 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify } static __be32 -nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +nfsd4_decode_write(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_write *write = &u->write; __be32 status; status = nfsd4_decode_stateid4(argp, &write->wr_stateid); @@ -1408,12 +1456,17 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen)) return nfserr_bad_xdr; + write->wr_bytes_written = 0; + write->wr_how_written = 0; + memset(&write->wr_verifier, 0, sizeof(write->wr_verifier)); return nfs_ok; } static __be32 -nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_release_lockowner *rlockowner = 
&u->release_lockowner; __be32 status; if (argp->minorversion >= 1) @@ -1430,18 +1483,24 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel return nfs_ok; } -static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl; + memset(bc, 0, sizeof(*bc)); if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0) return nfserr_bad_xdr; return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); } -static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; u32 use_conn_in_rdma_mode; __be32 status; + memset(bcts, 0, sizeof(*bcts)); status = nfsd4_decode_sessionid4(argp, &bcts->sessionid); if (status) return status; @@ -1579,10 +1638,12 @@ nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, - struct nfsd4_exchange_id *exid) + union nfsd4_op_u *u) { + struct nfsd4_exchange_id *exid = &u->exchange_id; __be32 status; + memset(exid, 0, sizeof(*exid)); status = nfsd4_decode_verifier4(argp, &exid->verifier); if (status) return status; @@ -1631,10 +1692,12 @@ nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, - struct nfsd4_create_session *sess) + union nfsd4_op_u *u) { + struct nfsd4_create_session *sess = &u->create_session; __be32 status; + memset(sess, 0, sizeof(*sess)); status = nfsd4_decode_clientid4(argp, &sess->clientid); if (status) return status; @@ -1650,34 +1713,34 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, return status; if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0) return nfserr_bad_xdr; - status = nfsd4_decode_cb_sec(argp, &sess->cb_sec); - if (status) - return status; - - return nfs_ok; + return nfsd4_decode_cb_sec(argp, &sess->cb_sec); } static __be32 nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_session *destroy_session) + union nfsd4_op_u *u) { + struct nfsd4_destroy_session *destroy_session = &u->destroy_session; return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid); } static __be32 nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, - struct nfsd4_free_stateid *free_stateid) + union nfsd4_op_u *u) { + struct nfsd4_free_stateid *free_stateid = &u->free_stateid; return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid); } #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, - struct nfsd4_getdeviceinfo *gdev) + union nfsd4_op_u *u) { + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; __be32 status; + memset(gdev, 0, sizeof(*gdev)); status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid); if (status) return status; @@ -1694,10 +1757,12 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutcommit *lcp) + union nfsd4_op_u *u) { + struct nfsd4_layoutcommit *lcp = &u->layoutcommit; __be32 *p, status; + memset(lcp, 0, sizeof(*lcp)); if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0) return nfserr_bad_xdr; if 
(xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0) @@ -1729,10 +1794,12 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutget *lgp) + union nfsd4_op_u *u) { + struct nfsd4_layoutget *lgp = &u->layoutget; __be32 status; + memset(lgp, 0, sizeof(*lgp)); if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0) @@ -1756,8 +1823,10 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; + memset(lrp, 0, sizeof(*lrp)); if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0) @@ -1769,17 +1838,21 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, #endif /* CONFIG_NFSD_PNFS */ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo_no_name *sin) + union nfsd4_op_u *u) { + struct nfsd4_secinfo_no_name *sin = &u->secinfo_no_name; if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0) return nfserr_bad_xdr; + + sin->sin_exp = NULL; return nfs_ok; } static __be32 nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { + struct nfsd4_sequence *seq = &u->sequence; __be32 *p, status; status = nfsd4_decode_sessionid4(argp, &seq->sessionid); @@ -1793,16 +1866,20 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, seq->maxslots = be32_to_cpup(p++); seq->cachethis = be32_to_cpup(p); + seq->status_flags = 0; return nfs_ok; } static __be32 -nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; __be32 status; u32 i; + memset(test_stateid, 0, sizeof(*test_stateid)); if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0) return nfserr_bad_xdr; @@ -1822,14 +1899,16 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta } static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_clientid *dc) + union nfsd4_op_u *u) { + struct nfsd4_destroy_clientid *dc = &u->destroy_clientid; return nfsd4_decode_clientid4(argp, &dc->clientid); } static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, - struct nfsd4_reclaim_complete *rc) + union nfsd4_op_u *u) { + struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0) return nfserr_bad_xdr; return nfs_ok; @@ -1837,8 +1916,9 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, - struct nfsd4_fallocate *fallocate) + union nfsd4_op_u *u) { + struct nfsd4_fallocate *fallocate = &u->allocate; __be32 status; status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid); @@ -1894,12 +1974,14 @@ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +nfsd4_decode_copy(struct nfsd4_compoundargs *argp, union 
nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; u32 consecutive, i, count, sync; struct nl4_server *ns_dummy; __be32 status; + memset(copy, 0, sizeof(*copy)); status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid); if (status) return status; @@ -1951,10 +2033,12 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) static __be32 nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, - struct nfsd4_copy_notify *cn) + union nfsd4_op_u *u) { + struct nfsd4_copy_notify *cn = &u->copy_notify; __be32 status; + memset(cn, 0, sizeof(*cn)); cn->cpn_src = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_src)); if (cn->cpn_src == NULL) return nfserr_jukebox; @@ -1970,14 +2054,18 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; + os->count = 0; + os->status = 0; return nfsd4_decode_stateid4(argp, &os->stateid); } static __be32 -nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_seek *seek = &u->seek; __be32 status; status = nfsd4_decode_stateid4(argp, &seek->seek_stateid); @@ -1988,12 +2076,15 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0) return nfserr_bad_xdr; + seek->seek_eof = 0; + seek->seek_pos = 0; return nfs_ok; } static __be32 -nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_clone *clone = &u->clone; __be32 status; status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid); @@ -2118,11 +2209,13 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) */ static __be32 nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, - struct nfsd4_getxattr *getxattr) + union nfsd4_op_u *u) { + struct nfsd4_getxattr *getxattr = &u->getxattr; __be32 status; u32 maxcount; + memset(getxattr, 0, sizeof(*getxattr)); status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name); if (status) return status; @@ -2131,17 +2224,19 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); getxattr->getxa_len = maxcount; - - return status; + return nfs_ok; } static __be32 nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, - struct nfsd4_setxattr *setxattr) + union nfsd4_op_u *u) { + struct nfsd4_setxattr *setxattr = &u->setxattr; u32 flags, maxcount, size; __be32 status; + memset(setxattr, 0, sizeof(*setxattr)); + if (xdr_stream_decode_u32(argp->xdr, &flags) < 0) return nfserr_bad_xdr; @@ -2176,10 +2271,13 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, - struct nfsd4_listxattrs *listxattrs) + union nfsd4_op_u *u) { + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; u32 maxcount; + memset(listxattrs, 0, sizeof(*listxattrs)); + if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0) return nfserr_bad_xdr; @@ -2205,112 +2303,114 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp, - struct nfsd4_removexattr *removexattr) + union nfsd4_op_u *u) { + struct nfsd4_removexattr *removexattr = &u->removexattr; + memset(removexattr, 0,
sizeof(*removexattr)); return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name); } static __be32 -nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfs_ok; } static __be32 -nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfserr_notsupp; } -typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u); static const nfsd4_dec nfsd4_dec_ops[] = { - [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, - [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, - [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, - [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, - [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, - [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, - [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, - [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, - [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, - [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, - [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, - [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, - [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, - [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, - [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh, - [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_READ] = (nfsd4_dec)nfsd4_decode_read, - [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, - [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, - [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, - [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, - [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, - [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, - [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, - [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, - [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, + [OP_ACCESS] = nfsd4_decode_access, + [OP_CLOSE] = nfsd4_decode_close, + [OP_COMMIT] = nfsd4_decode_commit, + [OP_CREATE] = nfsd4_decode_create, + [OP_DELEGPURGE] = nfsd4_decode_notsupp, + [OP_DELEGRETURN] = nfsd4_decode_delegreturn, + [OP_GETATTR] = nfsd4_decode_getattr, + [OP_GETFH] = nfsd4_decode_noop, + [OP_LINK] = nfsd4_decode_link, + [OP_LOCK] = nfsd4_decode_lock, + [OP_LOCKT] = nfsd4_decode_lockt, + [OP_LOCKU] = nfsd4_decode_locku, + [OP_LOOKUP] = nfsd4_decode_lookup, + [OP_LOOKUPP] = nfsd4_decode_noop, + [OP_NVERIFY] = nfsd4_decode_verify, + [OP_OPEN] = nfsd4_decode_open, + [OP_OPENATTR] = nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade, + [OP_PUTFH] = nfsd4_decode_putfh, + [OP_PUTPUBFH] = nfsd4_decode_putpubfh, + [OP_PUTROOTFH] = nfsd4_decode_noop, + [OP_READ] = nfsd4_decode_read, + [OP_READDIR] = nfsd4_decode_readdir, + [OP_READLINK] = nfsd4_decode_noop, + [OP_REMOVE] = nfsd4_decode_remove, + [OP_RENAME] = nfsd4_decode_rename, + [OP_RENEW] = nfsd4_decode_renew, 
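/*
 * Note on the surrounding table: every remaining entry shares the
 * (struct nfsd4_compoundargs *, union nfsd4_op_u *) prototype, so the
 * old (nfsd4_dec) casts visible on the '-' lines can simply be dropped.
 */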
+ [OP_RESTOREFH] = nfsd4_decode_noop, + [OP_SAVEFH] = nfsd4_decode_noop, + [OP_SECINFO] = nfsd4_decode_secinfo, + [OP_SETATTR] = nfsd4_decode_setattr, + [OP_SETCLIENTID] = nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = nfsd4_decode_verify, + [OP_WRITE] = nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_decode_release_lockowner, /* new operations for NFSv4.1 */ - [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, - [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BACKCHANNEL_CTL] = nfsd4_decode_backchannel_ctl, + [OP_BIND_CONN_TO_SESSION] = nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = nfsd4_decode_free_stateid, + [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_decode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] = nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_notsupp, + [OP_LAYOUTGET] = nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = nfsd4_decode_notsupp, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, - [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, - [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + [OP_SECINFO_NO_NAME] = nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_decode_sequence, + [OP_SET_SSV] = nfsd4_decode_notsupp, + [OP_TEST_STATEID] = nfsd4_decode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = nfsd4_decode_destroy_clientid, + [OP_RECLAIM_COMPLETE] = nfsd4_decode_reclaim_complete, /* new operations for NFSv4.2 */ - [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, - [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify, - [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, - 
[OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read, - [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, - [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, + [OP_ALLOCATE] = nfsd4_decode_fallocate, + [OP_COPY] = nfsd4_decode_copy, + [OP_COPY_NOTIFY] = nfsd4_decode_copy_notify, + [OP_DEALLOCATE] = nfsd4_decode_fallocate, + [OP_IO_ADVISE] = nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = nfsd4_decode_offload_status, + [OP_OFFLOAD_STATUS] = nfsd4_decode_offload_status, + [OP_READ_PLUS] = nfsd4_decode_read, + [OP_SEEK] = nfsd4_decode_seek, + [OP_WRITE_SAME] = nfsd4_decode_notsupp, + [OP_CLONE] = nfsd4_decode_clone, /* RFC 8276 extended atributes operations */ - [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr, - [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr, - [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs, - [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr, + [OP_GETXATTR] = nfsd4_decode_getxattr, + [OP_SETXATTR] = nfsd4_decode_setxattr, + [OP_LISTXATTRS] = nfsd4_decode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_decode_removexattr, }; static inline bool @@ -2357,22 +2457,15 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0) return false; - if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0) + if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0) return false; - - /* - * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS - * here, so we return success at the xdr level so that - * nfsd4_proc can handle this is an NFS-level error. - */ - if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND) - return true; + argp->opcnt = min_t(u32, argp->client_opcnt, + NFSD_MAX_OPS_PER_COMPOUND); if (argp->opcnt > ARRAY_SIZE(argp->iops)) { - argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops)); if (!argp->ops) { argp->ops = argp->iops; - dprintk("nfsd: couldn't allocate room for COMPOUND\n"); return false; } } @@ -2774,9 +2867,10 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 } -static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) +static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino) { struct path path = exp->ex_path; + struct kstat stat; int err; path_get(&path); @@ -2784,8 +2878,10 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) if (path.dentry != path.mnt->mnt_root) break; } - err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); path_put(&path); + if (!err) + *pino = stat.ino; return err; } @@ -3282,22 +3378,21 @@ out_acl: *p++ = cpu_to_be32(stat.btime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - struct kstat parent_stat; u64 ino = stat.ino; p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; /* - * Get parent's attributes if not ignoring crossmount - * and this is the root of a cross-mounted filesystem. + * Get ino of mountpoint in parent filesystem, if not ignoring + * crossmount and this is the root of a cross-mounted + * filesystem. 
*/ if (ignore_crossmnt == 0 && dentry == exp->ex_path.mnt->mnt_root) { - err = get_parent_attributes(exp, &parent_stat); + err = nfsd4_get_mounted_on_ino(exp, &ino); if (err) goto out_nfserr; - ino = parent_stat.ino; } p = xdr_encode_hyper(p, ino); } @@ -3594,8 +3689,10 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) } static __be32 -nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_access *access = &u->access; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3607,8 +3704,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ return 0; } -static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3624,8 +3723,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, } static __be32 -nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_close *close = &u->close; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &close->cl_stateid); @@ -3633,8 +3734,10 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 -nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_commit *commit = &u->commit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3647,8 +3750,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3661,8 +3766,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_getattr *getattr = &u->getattr; struct svc_fh *fhp = getattr->ga_fhp; struct xdr_stream *xdr = resp->xdr; @@ -3671,8 +3778,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } static __be32 -nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct svc_fh **fhpp = &u->getfh; struct xdr_stream *xdr = resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; @@ -3726,8 +3835,10 @@ again: } static __be32 -nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_lock *lock = &u->lock; struct xdr_stream *xdr = resp->xdr; if (!nfserr) @@ -3739,8 +3850,10 @@ 
nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo } static __be32 -nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_lockt *lockt = &u->lockt; struct xdr_stream *xdr = resp->xdr; if (nfserr == nfserr_denied) @@ -3749,8 +3862,10 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l } static __be32 -nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_locku *locku = &u->locku; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &locku->lu_stateid); @@ -3758,8 +3873,10 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 -nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_link *link = &u->link; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3772,8 +3889,10 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 -nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open *open = &u->open; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3866,16 +3985,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } static __be32 -nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open_confirm *oc = &u->open_confirm; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); } static __be32 -nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open_downgrade *od = &u->open_downgrade; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &od->od_stateid); @@ -3974,8 +4097,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); unsigned long maxcount; struct xdr_stream *xdr = resp->xdr; @@ -3994,7 +4118,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } if (resp->xdr->buf->page_len && splice_ok) { WARN_ON_ONCE(1); - return nfserr_resource; + return nfserr_serverfault; } xdr_commit_encode(xdr); @@ -4016,8 +4140,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_readlink *readlink = &u->readlink; __be32 *p, *maxcount_p, zero = xdr_zero; struct xdr_stream *xdr = resp->xdr; int length_offset = xdr->buf->len; @@ -4061,8 +4187,10 @@ out_err: } static 
__be32 -nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_readdir *readdir = &u->readdir; int maxcount; int bytes_left; loff_t offset; @@ -4152,8 +4280,10 @@ err_no_verf: } static __be32 -nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_remove *remove = &u->remove; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4165,8 +4295,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_rename *rename = &u->rename; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4248,8 +4380,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo *secinfo = &u->secinfo; struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); @@ -4257,8 +4390,9 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo_no_name *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name; struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); @@ -4269,8 +4403,10 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, * regardless of the error status. 
*/ static __be32 -nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4293,8 +4429,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } static __be32 -nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_setclientid *scd = &u->setclientid; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4317,8 +4455,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n } static __be32 -nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_write *write = &u->write; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4334,8 +4474,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_exchange_id *exid) + union nfsd4_op_u *u) { + struct nfsd4_exchange_id *exid = &u->exchange_id; struct xdr_stream *xdr = resp->xdr; __be32 *p; char *major_id; @@ -4412,8 +4553,9 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_create_session *sess) + union nfsd4_op_u *u) { + struct nfsd4_create_session *sess = &u->create_session; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4465,8 +4607,9 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { + struct nfsd4_sequence *seq = &u->sequence; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4488,8 +4631,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_test_stateid *test_stateid) + union nfsd4_op_u *u) { + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct xdr_stream *xdr = resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; @@ -4509,8 +4653,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getdeviceinfo *gdev) + union nfsd4_op_u *u) { + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; @@ -4565,8 +4710,9 @@ toosmall: static __be32 nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutget *lgp) + union nfsd4_op_u *u) { + struct nfsd4_layoutget *lgp = &u->layoutget; struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; __be32 *p; @@ -4592,8 +4738,9 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutcommit *lcp) + union nfsd4_op_u *u) { + struct 
nfsd4_layoutcommit *lcp = &u->layoutcommit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4613,8 +4760,9 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4699,8 +4847,9 @@ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) static __be32 nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy *copy) + union nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; __be32 *p; nfserr = nfsd42_encode_write_res(resp, ©->cp_res, @@ -4716,8 +4865,9 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4731,156 +4881,83 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof, - loff_t *pos) + struct nfsd4_read *read) { - struct xdr_stream *xdr = resp->xdr; + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); struct file *file = read->rd_nf->nf_file; - int starting_len = xdr->buf->len; - loff_t hole_pos; - __be32 nfserr; - __be32 *p, tmp; - __be64 tmp64; - - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); - if (hole_pos > read->rd_offset) - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); + struct xdr_stream *xdr = resp->xdr; + unsigned long maxcount; + __be32 nfserr, *p; /* Content type, offset, byte count */ p = xdr_reserve_space(xdr, 4 + 8 + 4); if (!p) - return nfserr_resource; + return nfserr_io; + if (resp->xdr->buf->page_len && splice_ok) { + WARN_ON_ONCE(splice_ok); + return nfserr_serverfault; + } - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); - if (read->rd_vlen < 0) - return nfserr_resource; + maxcount = min_t(unsigned long, read->rd_length, + (xdr->buf->buflen - xdr->buf->len)); - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); + if (file->f_op->splice_read && splice_ok) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); - - tmp = htonl(NFS4_CONTENT_DATA); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp64 = cpu_to_be64(read->rd_offset); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); - tmp = htonl(*maxcount); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); - - tmp = xdr_zero; - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, - xdr_pad_size(*maxcount)); - return nfs_ok; -} - -static __be32 -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof) -{ - struct file *file = read->rd_nf->nf_file; - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); - loff_t f_size = 
nfsd4_layoutcommit *lcp = &u->layoutcommit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4613,8 +4760,9 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4699,8 +4847,9 @@ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) static __be32 nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy *copy) + union nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; __be32 *p; nfserr = nfsd42_encode_write_res(resp, &copy->cp_res, @@ -4716,8 +4865,9 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4731,156 +4881,83 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof, - loff_t *pos) + struct nfsd4_read *read) { - struct xdr_stream *xdr = resp->xdr; + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); struct file *file = read->rd_nf->nf_file; - int starting_len = xdr->buf->len; - loff_t hole_pos; - __be32 nfserr; - __be32 *p, tmp; - __be64 tmp64; - - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); - if (hole_pos > read->rd_offset) - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); + struct xdr_stream *xdr = resp->xdr; + unsigned long maxcount; + __be32 nfserr, *p; /* Content type, offset, byte count */ p = xdr_reserve_space(xdr, 4 + 8 + 4); if (!p) - return nfserr_resource; + return nfserr_io; + if (resp->xdr->buf->page_len && splice_ok) { + WARN_ON_ONCE(splice_ok); + return nfserr_serverfault; + } - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); - if (read->rd_vlen < 0) - return nfserr_resource; + maxcount = min_t(unsigned long, read->rd_length, + (xdr->buf->buflen - xdr->buf->len)); - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); + if (file->f_op->splice_read && splice_ok) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); - - tmp = htonl(NFS4_CONTENT_DATA); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp64 = cpu_to_be64(read->rd_offset); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); - tmp = htonl(*maxcount); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); - - tmp = xdr_zero; - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, - xdr_pad_size(*maxcount)); - return nfs_ok; -} - -static __be32 -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof) -{ - struct file *file = read->rd_nf->nf_file; - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); - loff_t f_size = 
i_size_read(file_inode(file)); - unsigned long count; - __be32 *p; - - if (data_pos == -ENXIO) - data_pos = f_size; - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); - count = data_pos - read->rd_offset; - - /* Content type, offset, byte count */ - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); - if (!p) - return nfserr_resource; - *p++ = htonl(NFS4_CONTENT_HOLE); + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); p = xdr_encode_hyper(p, read->rd_offset); - p = xdr_encode_hyper(p, count); + *p = cpu_to_be32(read->rd_length); - *eof = (read->rd_offset + count) >= f_size; - *maxcount = min_t(unsigned long, count, *maxcount); return nfs_ok; } static __be32 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { - unsigned long maxcount, count; + struct nfsd4_read *read = &u->read; + struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; - struct file *file; int starting_len = xdr->buf->len; - int last_segment = xdr->buf->len; - int segments = 0; - __be32 *p, tmp; - bool is_data; - loff_t pos; - u32 eof; + u32 segments = 0; + __be32 *p; if (nfserr) return nfserr; - file = read->rd_nf->nf_file; /* eof flag, segment count */ p = xdr_reserve_space(xdr, 4 + 4); if (!p) - return nfserr_resource; + return nfserr_io; xdr_commit_encode(xdr); - maxcount = min_t(unsigned long, read->rd_length, - (xdr->buf->buflen - xdr->buf->len)); - count = maxcount; - - eof = read->rd_offset >= i_size_read(file_inode(file)); - if (eof) + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); + if (read->rd_eof) goto out; - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); - is_data = pos > read->rd_offset; - - while (count > 0 && !eof) { - maxcount = count; - if (is_data) - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, - segments == 0 ? 
&pos : NULL); - else - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); - if (nfserr) - goto out; - count -= maxcount; - read->rd_offset += maxcount; - is_data = !is_data; - last_segment = xdr->buf->len; - segments++; - } - -out: - if (nfserr && segments == 0) + nfserr = nfsd4_encode_read_plus_data(resp, read); + if (nfserr) { xdr_truncate_encode(xdr, starting_len); - else { - if (nfserr) { - xdr_truncate_encode(xdr, last_segment); - nfserr = nfs_ok; - eof = 0; - } - tmp = htonl(eof); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp = htonl(segments); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + return nfserr; } + segments++; + +out: + p = xdr_encode_bool(p, read->rd_eof); + *p = cpu_to_be32(segments); return nfserr; } static __be32 nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy_notify *cn) + union nfsd4_op_u *u) { + struct nfsd4_copy_notify *cn = &u->copy_notify; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4914,8 +4991,9 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_seek *seek) + union nfsd4_op_u *u) { + struct nfsd4_seek *seek = &u->seek; __be32 *p; p = xdr_reserve_space(resp->xdr, 4 + 8); @@ -4926,7 +5004,8 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *p) { return nfserr; } @@ -4977,8 +5056,9 @@ nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen) static __be32 nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getxattr *getxattr) + union nfsd4_op_u *u) { + struct nfsd4_getxattr *getxattr = &u->getxattr; struct xdr_stream *xdr = resp->xdr; __be32 *p, err; @@ -5001,8 +5081,9 @@ nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_setxattr *setxattr) + union nfsd4_op_u *u) { + struct nfsd4_setxattr *setxattr = &u->setxattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -5042,8 +5123,9 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, static __be32 nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_listxattrs *listxattrs) + union nfsd4_op_u *u) { + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; struct xdr_stream *xdr = resp->xdr; u32 cookie_offset, count_offset, eof; u32 left, xdrleft, slen, count; @@ -5153,8 +5235,9 @@ out: static __be32 nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_removexattr *removexattr) + union nfsd4_op_u *u) { + struct nfsd4_removexattr *removexattr = &u->removexattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -5166,7 +5249,7 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, return 0; } -typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u); /* * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 @@ -5174,93 +5257,93 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); * done in the decoding phase. 
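The table rewrite below is the payoff of the signature change: each entry used to cast a differently-typed encoder to nfsd4_enc, and an indirect call through a function pointer of a mismatched type is undefined behavior in C. It is also exactly the pattern that kernel control-flow-integrity schemes (kCFI/CFI_CLANG) reject at runtime, which is my reading of the motivation, not something this diff states. In miniature:

struct foo { int x; };

typedef int (*handler_t)(void *arg);

/* Old style: the prototype doesn't match, so a cast is required and
 * the eventual indirect call is UB (and a kCFI signature violation).
 */
static int handle_foo(struct foo *f)
{
	return f->x;
}
static const handler_t old_table[] = { (handler_t)handle_foo };

/* New style: the prototype matches the table exactly; the pointer
 * conversion happens inside the function body instead.
 */
static int handle_foo_fixed(void *arg)
{
	struct foo *f = arg;
	return f->x;
}
static const handler_t new_table[] = { handle_foo_fixed };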
*/ static const nfsd4_enc nfsd4_enc_ops[] = { - [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, - [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, - [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, - [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, - [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, - [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, - [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, - [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, - [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, - [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, - [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, - [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, - [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_READ] = (nfsd4_enc)nfsd4_encode_read, - [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, - [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, - [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, - [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, - [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, - [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, - [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, - [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ACCESS] = nfsd4_encode_access, + [OP_CLOSE] = nfsd4_encode_close, + [OP_COMMIT] = nfsd4_encode_commit, + [OP_CREATE] = nfsd4_encode_create, + [OP_DELEGPURGE] = nfsd4_encode_noop, + [OP_DELEGRETURN] = nfsd4_encode_noop, + [OP_GETATTR] = nfsd4_encode_getattr, + [OP_GETFH] = nfsd4_encode_getfh, + [OP_LINK] = nfsd4_encode_link, + [OP_LOCK] = nfsd4_encode_lock, + [OP_LOCKT] = nfsd4_encode_lockt, + [OP_LOCKU] = nfsd4_encode_locku, + [OP_LOOKUP] = nfsd4_encode_noop, + [OP_LOOKUPP] = nfsd4_encode_noop, + [OP_NVERIFY] = nfsd4_encode_noop, + [OP_OPEN] = nfsd4_encode_open, + [OP_OPENATTR] = nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_encode_open_downgrade, + [OP_PUTFH] = nfsd4_encode_noop, + [OP_PUTPUBFH] = nfsd4_encode_noop, + [OP_PUTROOTFH] = nfsd4_encode_noop, + [OP_READ] = nfsd4_encode_read, + [OP_READDIR] = nfsd4_encode_readdir, + [OP_READLINK] = nfsd4_encode_readlink, + [OP_REMOVE] = nfsd4_encode_remove, + [OP_RENAME] = nfsd4_encode_rename, + [OP_RENEW] = nfsd4_encode_noop, + [OP_RESTOREFH] = nfsd4_encode_noop, + [OP_SAVEFH] = nfsd4_encode_noop, + [OP_SECINFO] = nfsd4_encode_secinfo, + [OP_SETATTR] = nfsd4_encode_setattr, + [OP_SETCLIENTID] = nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_encode_noop, + [OP_VERIFY] = nfsd4_encode_noop, + [OP_WRITE] = nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_encode_noop, /* NFSv4.1 operations */ - [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - 
[OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BACKCHANNEL_CTL] = nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = nfsd4_encode_noop, + [OP_FREE_STATEID] = nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_encode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = nfsd4_encode_noop, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_noop, + [OP_LAYOUTGET] = nfsd4_encode_noop, + [OP_LAYOUTRETURN] = nfsd4_encode_noop, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, - [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, - [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_encode_sequence, + [OP_SET_SSV] = nfsd4_encode_noop, + [OP_TEST_STATEID] = nfsd4_encode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = nfsd4_encode_noop, /* NFSv4.2 operations */ - [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, - [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify, - [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, - [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus, - [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, - [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, - [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ALLOCATE] = nfsd4_encode_noop, + [OP_COPY] = nfsd4_encode_copy, + [OP_COPY_NOTIFY] = nfsd4_encode_copy_notify, + [OP_DEALLOCATE] = nfsd4_encode_noop, + [OP_IO_ADVISE] = nfsd4_encode_noop, + [OP_LAYOUTERROR] = nfsd4_encode_noop, + [OP_LAYOUTSTATS] = nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = nfsd4_encode_offload_status, + [OP_READ_PLUS] = nfsd4_encode_read_plus, + [OP_SEEK] = nfsd4_encode_seek, + [OP_WRITE_SAME] = nfsd4_encode_noop, + [OP_CLONE] = nfsd4_encode_noop, /* RFC 8276 extended atributes operations */ - [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr, - [OP_SETXATTR] = 
(nfsd4_enc)nfsd4_encode_setxattr, - [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs, - [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr, + [OP_GETXATTR] = nfsd4_encode_getxattr, + [OP_SETXATTR] = nfsd4_encode_setxattr, + [OP_LISTXATTRS] = nfsd4_encode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_encode_removexattr, }; /* @@ -5394,7 +5477,7 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp) struct nfsd4_compoundargs *args = rqstp->rq_argp; if (args->ops != args->iops) { - kfree(args->ops); + vfree(args->ops); args->ops = args->iops; } while (args->to_free) { @@ -5423,12 +5506,8 @@ bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct xdr_buf *buf = xdr->buf; __be32 *p; - WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + - buf->tail[0].iov_len); - /* * Send buffer space for the following items is reserved * at the top of nfsd4_proc_compound(). diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 9b31e1103e7b..3e64a3d50a1c 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -604,9 +604,10 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) * scraping this file for info should test the labels to ensure they're * getting the correct field. */ -static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) +int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) { - struct nfsd_net *nn = m->private; + struct nfsd_net *nn = net_generic(file_inode(m->file)->i_sb->s_fs_info, + nfsd_net_id); seq_printf(m, "max entries: %u\n", nn->max_drc_entries); seq_printf(m, "num entries: %u\n", @@ -626,11 +627,3 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; } - -int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file) -{ - struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info, - nfsd_net_id); - - return single_open(file, nfsd_reply_cache_stats_show, nn); -} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 917fa1892fd2..d1e581a60480 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -185,17 +185,7 @@ static int export_features_show(struct seq_file *m, void *v) return 0; } -static int export_features_open(struct inode *inode, struct file *file) -{ - return single_open(file, export_features_show, NULL); -} - -static const struct file_operations export_features_operations = { - .open = export_features_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(export_features); #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) static int supported_enctypes_show(struct seq_file *m, void *v) @@ -204,17 +194,7 @@ static int supported_enctypes_show(struct seq_file *m, void *v) return 0; } -static int supported_enctypes_open(struct inode *inode, struct file *file) -{ - return single_open(file, supported_enctypes_show, NULL); -} - -static const struct file_operations supported_enctypes_ops = { - .open = supported_enctypes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(supported_enctypes); #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ static const struct file_operations pool_stats_operations = { @@ -224,19 +204,9 @@ static const struct file_operations pool_stats_operations = { .release = nfsd_pool_stats_release, }; -static const struct file_operations reply_cache_stats_operations = { - .open = 
nfsd_reply_cache_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats); -static const struct file_operations filecache_ops = { - .open = nfsd_file_cache_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats); /*----------------------------------------------------------------------------*/ /* @@ -611,7 +581,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { +#ifdef CONFIG_NFSD_V2 case 2: +#endif case 3: nfsd_vers(nn, num, cmd); break; @@ -631,7 +603,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) } break; default: - return -EINVAL; + /* Ignore requests to disable non-existent versions */ + if (cmd == NFSD_SET) + return -EINVAL; } vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); @@ -1365,7 +1339,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) /* Per-export io stats use same ops as exports file */ [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", - &export_features_operations, S_IRUGO}, + &export_features_fops, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_FO_UnlockFS] = {"unlock_filesystem", @@ -1374,14 +1348,16 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, - [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO}, + [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", + &nfsd_reply_cache_stats_fops, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, - [NFSD_Filecache] = {"filecache", &filecache_ops, S_IRUGO}, + [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) - [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, + [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", + &supported_enctypes_fops, S_IRUGO}, #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, @@ -1481,16 +1457,19 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; - retval = nfsd_reply_cache_init(nn); + retval = nfsd4_init_leases_net(nn); if (retval) goto out_drc_error; - nfsd4_init_leases_net(nn); - + retval = nfsd_reply_cache_init(nn); + if (retval) + goto out_cache_error; get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); return 0; +out_cache_error: + nfsd4_leases_net_shutdown(nn); out_drc_error: nfsd_idmap_shutdown(net); out_idmap_error: @@ -1507,6 +1486,7 @@ static __net_exit void nfsd_exit_net(struct net *net) nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); + 
nfsd4_leases_net_shutdown(nn); } static struct pernet_operations nfsd_net_ops = { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 57a468ed85c3..93b42ef9ed91 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -64,8 +64,7 @@ struct readdir_cd { extern struct svc_program nfsd_program; -extern const struct svc_version nfsd_version2, nfsd_version3, - nfsd_version4; +extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; @@ -164,6 +163,7 @@ char * nfs4_recoverydir(void); bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); int nfsd4_create_laundry_wq(void); void nfsd4_destroy_laundry_wq(void); +bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -179,6 +179,11 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) } static inline int nfsd4_create_laundry_wq(void) { return 0; }; static inline void nfsd4_destroy_laundry_wq(void) {}; +static inline bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, + struct inode *inode) +{ + return false; +} #endif /* @@ -343,6 +348,7 @@ void nfsd_lockd_shutdown(void); #define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */ #define NFSD_CLIENT_MAX_TRIM_PER_RUN 128 #define NFS4_CLIENTS_PER_GB 1024 +#define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */ /* * The following attributes are currently not supported by the NFSv4 server: @@ -498,7 +504,8 @@ extern void unregister_cld_notifier(void); extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); #endif -extern void nfsd4_init_leases_net(struct nfsd_net *nn); +extern int nfsd4_init_leases_net(struct nfsd_net *nn); +extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn); #else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) @@ -506,7 +513,8 @@ static inline int nfsd4_is_junction(struct dentry *dentry) return 0; } -static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {}; +static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; }; +static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {}; #define register_cld_notifier() 0 #define unregister_cld_notifier() do { } while(0) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index a5b71526cee0..8c52b6c9d31a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -392,14 +392,8 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) skip_pseudoflavor_check: /* Finally, check access permissions. 
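Back in the nfscache.c and nfsctl.c hunks above, four hand-rolled open/read/llseek/release instances collapse into DEFINE_SHOW_ATTRIBUTE(name), which generates a name_open() and a name_fops from an existing name_show(). Expanded (slightly simplified from include/linux/seq_file.h), DEFINE_SHOW_ATTRIBUTE(export_features) produces:

static int export_features_open(struct inode *inode, struct file *file)
{
	return single_open(file, export_features_show, inode->i_private);
}

static const struct file_operations export_features_fops = {
	.owner		= THIS_MODULE,
	.open		= export_features_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

That is also why nfsd_reply_cache_stats_show() loses its private-data plumbing: the macro passes inode->i_private rather than an arbitrary cookie, so the show routine now digs the nfsd_net out of the superblock itself.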
*/ error = nfsd_permission(rqstp, exp, dentry, access); - - if (error) { - dprintk("fh_verify: %pd2 permission failure, " - "acc=%x, error=%d\n", - dentry, - access, ntohl(error)); - } out: + trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) nfsd_stats_fh_stale_inc(exp); return error; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index c3ae6414fc5c..513e028b0bbe 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -220,7 +220,7 @@ __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); static __inline__ struct svc_fh * -fh_copy(struct svc_fh *dst, struct svc_fh *src) +fh_copy(struct svc_fh *dst, const struct svc_fh *src) { WARN_ON(src->fh_dentry); @@ -229,7 +229,7 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src) } static inline void -fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src) { dst->fh_size = src->fh_size; memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size); @@ -243,7 +243,8 @@ fh_init(struct svc_fh *fhp, int maxsize) return fhp; } -static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_size != fh2->fh_size) return false; @@ -252,7 +253,8 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) return true; } -static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_fsid_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_fsid_type != fh2->fh_fsid_type) return false; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 7381972f1677..a5570cf75f3f 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -185,6 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) argp->count, argp->offset); argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); + argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); v = 0; len = argp->count; @@ -210,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - return rpc_drop_reply; + __set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } @@ -245,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - return rpc_drop_reply; + __set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } @@ -390,9 +391,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) resp->status = nfs_ok; if (!inode) { /* File doesn't exist. 
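The nfsd_proc_read()/nfsd_proc_write() hunks above change how a dropped reply is signalled: instead of returning the special rpc_drop_reply value, a procedure sets the RQ_DROPME flag on the request and returns rpc_success, and nfsd_dispatch() (see the nfssvc.c hunk later in this diff) tests only the flag. Roughly, as a sketch with a per-procedure resp as stand-in:

	/* In a procedure handler: mark the request instead of returning
	 * the magic rpc_drop_reply status.
	 */
	if (resp->status == nfserr_jukebox)
		__set_bit(RQ_DROPME, &rqstp->rq_flags);
	return rpc_success;	/* the dispatcher decides whether to reply */

	/* In nfsd_dispatch(): the flag is the single source of truth. */
	*statp = proc->pc_func(rqstp);
	if (test_bit(RQ_DROPME, &rqstp->rq_flags))
		goto out_update_drop;

One bit test in the dispatcher replaces comparing the return value against a magic __be32, and the drop decision survives even after the procedure's real status has been consumed by the XDR layer.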
Create it and set attrs */ - resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name, - argp->len, &attrs, type, rdev, - newfhp); + resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type, + rdev, newfhp); } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); @@ -567,24 +567,15 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, struct xdr_buf *buf = &resp->dirlist; struct xdr_stream *xdr = &resp->xdr; - count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp)); - memset(buf, 0, sizeof(*buf)); /* Reserve room for the NULL ptr & eof flag (-2 words) */ - buf->buflen = count - XDR_UNIT * 2; + buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), (u32)PAGE_SIZE); + buf->buflen -= XDR_UNIT * 2; buf->pages = rqstp->rq_next_page; rqstp->rq_next_page++; - /* This is xdr_init_encode(), but it assumes that - * the head kvec has already been consumed. */ - xdr_set_scratch_buffer(xdr, NULL, 0); - xdr->buf = buf; - xdr->page_ptr = buf->pages; - xdr->iov = NULL; - xdr->p = page_address(*buf->pages); - xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); - xdr->rqst = NULL; + xdr_init_encode_pages(xdr, buf, buf->pages, NULL); } /* @@ -646,6 +637,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, @@ -657,6 +649,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, @@ -668,6 +661,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_sattrargs), + .pc_argzero = sizeof(struct nfsd_sattrargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, @@ -678,6 +672,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, @@ -689,6 +684,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_diropres, .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_argzero = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+AT, @@ -699,6 +695,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_readlinkres, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, @@ -710,6 +707,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_readres, .pc_release = nfssvc_release_readres, .pc_argsize = sizeof(struct nfsd_readargs), + .pc_argzero = sizeof(struct nfsd_readargs), 
.pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, @@ -720,6 +718,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, @@ -731,6 +730,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_writeargs), + .pc_argzero = sizeof(struct nfsd_writeargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, @@ -742,6 +742,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_diropres, .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_createargs), + .pc_argzero = sizeof(struct nfsd_createargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, @@ -752,6 +753,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_diropargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_argzero = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -762,6 +764,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_renameargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_renameargs), + .pc_argzero = sizeof(struct nfsd_renameargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -772,6 +775,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_linkargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_linkargs), + .pc_argzero = sizeof(struct nfsd_linkargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -782,6 +786,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_symlinkargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_symlinkargs), + .pc_argzero = sizeof(struct nfsd_symlinkargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -793,6 +798,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_diropres, .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_createargs), + .pc_argzero = sizeof(struct nfsd_createargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, @@ -803,6 +809,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_diropargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_argzero = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -813,6 +820,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_readdirargs, .pc_encode = nfssvc_encode_readdirres, .pc_argsize = sizeof(struct nfsd_readdirargs), + .pc_argzero = sizeof(struct nfsd_readdirargs), .pc_ressize = sizeof(struct nfsd_readdirres), .pc_cachetype = RC_NOCACHE, .pc_name = "READDIR", @@ -822,6 +830,7 @@ static 
const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_statfsres, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_statfsres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+5, @@ -839,65 +848,3 @@ const struct svc_version nfsd_version2 = { .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS2_SVC_XDRSIZE, }; - -/* - * Map errnos to NFS errnos. - */ -__be32 -nfserrno (int errno) -{ - static struct { - __be32 nfserr; - int syserr; - } nfs_errtbl[] = { - { nfs_ok, 0 }, - { nfserr_perm, -EPERM }, - { nfserr_noent, -ENOENT }, - { nfserr_io, -EIO }, - { nfserr_nxio, -ENXIO }, - { nfserr_fbig, -E2BIG }, - { nfserr_stale, -EBADF }, - { nfserr_acces, -EACCES }, - { nfserr_exist, -EEXIST }, - { nfserr_xdev, -EXDEV }, - { nfserr_mlink, -EMLINK }, - { nfserr_nodev, -ENODEV }, - { nfserr_notdir, -ENOTDIR }, - { nfserr_isdir, -EISDIR }, - { nfserr_inval, -EINVAL }, - { nfserr_fbig, -EFBIG }, - { nfserr_nospc, -ENOSPC }, - { nfserr_rofs, -EROFS }, - { nfserr_mlink, -EMLINK }, - { nfserr_nametoolong, -ENAMETOOLONG }, - { nfserr_notempty, -ENOTEMPTY }, -#ifdef EDQUOT - { nfserr_dquot, -EDQUOT }, -#endif - { nfserr_stale, -ESTALE }, - { nfserr_jukebox, -ETIMEDOUT }, - { nfserr_jukebox, -ERESTARTSYS }, - { nfserr_jukebox, -EAGAIN }, - { nfserr_jukebox, -EWOULDBLOCK }, - { nfserr_jukebox, -ENOMEM }, - { nfserr_io, -ETXTBSY }, - { nfserr_notsupp, -EOPNOTSUPP }, - { nfserr_toosmall, -ETOOSMALL }, - { nfserr_serverfault, -ESERVERFAULT }, - { nfserr_serverfault, -ENFILE }, - { nfserr_io, -EREMOTEIO }, - { nfserr_stale, -EOPENSTALE }, - { nfserr_io, -EUCLEAN }, - { nfserr_perm, -ENOKEY }, - { nfserr_no_grace, -ENOGRACE}, - }; - int i; - - for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { - if (nfs_errtbl[i].syserr == errno) - return nfs_errtbl[i].nfserr; - } - WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); - return nfserr_io; -} - diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 4bb5baa17040..56fba1cba3af 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -91,8 +91,12 @@ unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { +# if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, +# endif +# if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, +# endif }; #define NFSD_ACL_MINVERS 2 @@ -116,7 +120,9 @@ static struct svc_stat nfsd_acl_svcstats = { #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[] = { +#if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, +#endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, @@ -799,7 +805,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) if (nrservs == 0 && nn->nfsd_serv == NULL) goto out; - strlcpy(nn->nfsd_name, utsname()->nodename, + strscpy(nn->nfsd_name, utsname()->nodename, sizeof(nn->nfsd_name)); error = nfsd_create_serv(net); @@ -1054,7 +1060,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) svcxdr_init_encode(rqstp); *statp = proc->pc_func(rqstp); - if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) + if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index aba8520b4b8b..caf6355b18fa 100644 --- a/fs/nfsd/nfsxdr.c +++ 
b/fs/nfsd/nfsxdr.c @@ -338,10 +338,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (args->len > NFSSVC_MAXBLKSIZE_V2) return false; - if (!xdr_stream_subsegment(xdr, &args->payload, args->len)) - return false; - return true; + return xdr_stream_subsegment(xdr, &args->payload, args->len); } bool diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ae596dbf8667..e94634d30591 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -57,11 +57,11 @@ typedef struct { } stateid_t; typedef struct { - stateid_t stid; + stateid_t cs_stid; #define NFS4_COPY_STID 1 #define NFS4_COPYNOTIFY_STID 2 - unsigned char sc_type; - refcount_t sc_count; + unsigned char cs_type; + refcount_t cs_count; } copy_stateid_t; struct nfsd4_callback { @@ -175,7 +175,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 /* Maximum number of operations per session compound */ -#define NFSD_MAX_OPS_PER_COMPOUND 16 +#define NFSD_MAX_OPS_PER_COMPOUND 50 /* Maximum session per slot cache size */ #define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ @@ -368,6 +368,7 @@ struct nfs4_client { #define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) +#define NFSD4_CLIENT_CB_RECALL_ANY (6) unsigned long cl_flags; const struct cred *cl_cb_cred; struct rpc_clnt *cl_cb_client; @@ -411,6 +412,10 @@ struct nfs4_client { unsigned int cl_state; atomic_t cl_delegs_in_recall; + + struct nfsd4_cb_recall_any *cl_ra; + time64_t cl_ra_time; + struct list_head cl_ra_cblist; }; /* struct nfs4_client_reset @@ -536,16 +541,13 @@ struct nfs4_clnt_odstate { * inode can have multiple filehandles associated with it, so there is * (potentially) a many to one relationship between this struct and struct * inode. - * - * These are hashed by filehandle in the file_hashtbl, which is protected by - * the global state_lock spinlock. 
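The struct change just below removes the comment's premise: fi_hash lived in a fixed-size global file_hashtbl guarded by the state_lock spinlock, while the new fi_rlist member puts nfs4_file into a resizable rhltable, where multiple files hashing to one bucket (aliased filehandles) hang off a single rhlist and lookups run under RCU. A hypothetical lookup sketch, assuming nfs4_file keeps its fi_fhandle copy of the handle (the real table, key and params live in fs/nfsd/nfs4state.c, outside this excerpt):

static struct rhltable nfs4_file_rhltable;	/* assumed definition */

static struct nfs4_file *find_file(const struct knfsd_fh *fh,
				   const struct rhashtable_params *params)
{
	struct nfs4_file *fi, *found = NULL;
	struct rhlist_head *tmp, *list;

	rcu_read_lock();
	list = rhltable_lookup(&nfs4_file_rhltable, fh, *params);
	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
		if (fh_match(&fi->fi_fhandle, fh) &&
		    refcount_inc_not_zero(&fi->fi_ref)) {
			found = fi;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

The const-qualification of fh_match()/fh_fsid_match() in the nfsfh.h hunk above is what lets a lookup like this take const pointers throughout.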
*/ struct nfs4_file { refcount_t fi_ref; struct inode * fi_inode; bool fi_aliased; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash on fi_fhandle */ + struct rhlist_head fi_rlist; struct list_head fi_stateids; union { struct list_head fi_delegations; @@ -639,6 +641,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_OFFLOAD, NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, + NFSPROC4_CLNT_CB_RECALL_ANY, }; /* Returns true iff a is later than b: */ @@ -692,12 +695,11 @@ extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); -extern void nfsd4_run_cb(struct nfsd4_callback *cb); +extern bool nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); -extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index a8c5a02a84f0..777e24e5da33 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -32,7 +32,7 @@ struct svc_stat nfsd_svcstats = { .program = &nfsd_program, }; -static int nfsd_proc_show(struct seq_file *seq, void *v) +static int nfsd_show(struct seq_file *seq, void *v) { int i; @@ -72,17 +72,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) return 0; } -static int nfsd_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, nfsd_proc_show, NULL); -} - -static const struct proc_ops nfsd_proc_ops = { - .proc_open = nfsd_proc_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = single_release, -}; +DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); int nfsd_percpu_counters_init(struct percpu_counter counters[], int num) { diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 9ebd67d461f9..c852ae8eaf37 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,9 +9,12 @@ #define _NFSD_TRACE_H #include <linux/tracepoint.h> +#include <linux/sunrpc/xprt.h> +#include <trace/misc/nfs.h> #include "export.h" #include "nfsfh.h" +#include "xdr4.h" #define NFSD_TRACE_PROC_RES_FIELDS \ __field(unsigned int, netns_ino) \ @@ -84,19 +87,26 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }) TRACE_EVENT(nfsd_compound, - TP_PROTO(const struct svc_rqst *rqst, - u32 args_opcnt), - TP_ARGS(rqst, args_opcnt), + TP_PROTO( + const struct svc_rqst *rqst, + const char *tag, + u32 taglen, + u32 opcnt + ), + TP_ARGS(rqst, tag, taglen, opcnt), TP_STRUCT__entry( __field(u32, xid) - __field(u32, args_opcnt) + __field(u32, opcnt) + __string_len(tag, tag, taglen) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); - __entry->args_opcnt = args_opcnt; + __entry->opcnt = opcnt; + __assign_str_len(tag, tag, taglen); ), - TP_printk("xid=0x%08x opcnt=%u", - __entry->xid, __entry->args_opcnt) + TP_printk("xid=0x%08x opcnt=%u tag=%s", + __entry->xid, __entry->opcnt, __get_str(tag) + ) ) TRACE_EVENT(nfsd_compound_status, @@ -195,7 +205,7 @@ TRACE_EVENT(nfsd_fh_verify, __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) __field(u32, xid) __field(u32, fh_hash) - 
__field(void *, inode) + __field(const void *, inode) __field(unsigned long, type) __field(unsigned long, access) ), @@ -211,13 +221,58 @@ TRACE_EVENT(nfsd_fh_verify, __entry->type = type; __entry->access = access; ), - TP_printk("xid=0x%08x fh_hash=0x%08x inode=%p type=%s access=%s", - __entry->xid, __entry->fh_hash, __entry->inode, + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s", + __entry->xid, __entry->fh_hash, show_fs_file_type(__entry->type), show_nfsd_may_flags(__entry->access) ) ); +TRACE_EVENT_CONDITION(nfsd_fh_verify_err, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + int access, + __be32 error + ), + TP_ARGS(rqstp, fhp, type, access, error), + TP_CONDITION(error), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __sockaddr(server, rqstp->rq_xprt->xpt_remotelen) + __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) + __field(u32, xid) + __field(u32, fh_hash) + __field(const void *, inode) + __field(unsigned long, type) + __field(unsigned long, access) + __field(int, error) + ), + TP_fast_assign( + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local, + rqstp->rq_xprt->xpt_locallen); + __assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + if (fhp->fh_dentry) + __entry->inode = d_inode(fhp->fh_dentry); + else + __entry->inode = NULL; + __entry->type = type; + __entry->access = access; + __entry->error = be32_to_cpu(error); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s error=%d", + __entry->xid, __entry->fh_hash, + show_fs_file_type(__entry->type), + show_nfsd_may_flags(__entry->access), + __entry->error + ) +); + DECLARE_EVENT_CLASS(nfsd_fh_err_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -489,6 +544,29 @@ DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err); #include "filecache.h" #include "vfs.h" +TRACE_EVENT(nfsd_delegret_wakeup, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct inode *inode, + long timeo + ), + TP_ARGS(rqstp, inode, timeo), + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, inode) + __field(long, timeo) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->inode = inode; + __entry->timeo = timeo; + ), + TP_printk("xid=0x%08x inode=%p%s", + __entry->xid, __entry->inode, + __entry->timeo == 0 ? 
" (timed out)" : "" + ) +); + DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), TP_ARGS(stp), @@ -529,6 +607,7 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_return); DEFINE_STATEID_EVENT(deleg_recall); DECLARE_EVENT_CLASS(nfsd_stateseqid_class, @@ -561,6 +640,61 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ DEFINE_STATESEQID_EVENT(preprocess); DEFINE_STATESEQID_EVENT(open_confirm); +TRACE_DEFINE_ENUM(NFS4_OPEN_STID); +TRACE_DEFINE_ENUM(NFS4_LOCK_STID); +TRACE_DEFINE_ENUM(NFS4_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_CLOSED_STID); +TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID); + +#define show_stid_type(x) \ + __print_flags(x, "|", \ + { NFS4_OPEN_STID, "OPEN" }, \ + { NFS4_LOCK_STID, "LOCK" }, \ + { NFS4_DELEG_STID, "DELEG" }, \ + { NFS4_CLOSED_STID, "CLOSED" }, \ + { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \ + { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \ + { NFS4_LAYOUT_STID, "LAYOUT" }) + +DECLARE_EVENT_CLASS(nfsd_stid_class, + TP_PROTO( + const struct nfs4_stid *stid + ), + TP_ARGS(stid), + TP_STRUCT__entry( + __field(unsigned long, sc_type) + __field(int, sc_count) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + const stateid_t *stp = &stid->sc_stateid; + + __entry->sc_type = stid->sc_type; + __entry->sc_count = refcount_read(&stid->sc_count); + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s", + __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation, + __entry->sc_count, show_stid_type(__entry->sc_type) + ) +); + +#define DEFINE_STID_EVENT(name) \ +DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ + TP_PROTO(const struct nfs4_stid *stid), \ + TP_ARGS(stid)) + +DEFINE_STID_EVENT(revoke); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), @@ -742,7 +876,8 @@ DEFINE_CLID_EVENT(confirmed_r); __print_flags(val, "|", \ { 1 << NFSD_FILE_HASHED, "HASHED" }, \ { 1 << NFSD_FILE_PENDING, "PENDING" }, \ - { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}) + { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \ + { 1 << NFSD_FILE_GC, "GC" }) DECLARE_EVENT_CLASS(nfsd_file_class, TP_PROTO(struct nfsd_file *nf), @@ -774,10 +909,11 @@ DEFINE_EVENT(nfsd_file_class, name, \ TP_PROTO(struct nfsd_file *nf), \ TP_ARGS(nf)) -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); +DEFINE_NFSD_FILE_EVENT(nfsd_file_free); DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); DEFINE_NFSD_FILE_EVENT(nfsd_file_put); -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose); +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue); TRACE_EVENT(nfsd_file_alloc, TP_PROTO( @@ -968,35 +1104,6 @@ TRACE_EVENT(nfsd_file_open, __entry->nf_file) ) -DECLARE_EVENT_CLASS(nfsd_file_search_class, - TP_PROTO( - const struct inode *inode, - unsigned int count - ), - TP_ARGS(inode, count), - TP_STRUCT__entry( - __field(const struct inode *, inode) - __field(unsigned int, count) - ), - TP_fast_assign( - __entry->inode = inode; - __entry->count = count; - ), - TP_printk("inode=%p count=%u", - __entry->inode, __entry->count) -); - -#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \ 
-DEFINE_EVENT(nfsd_file_search_class, name, \ - TP_PROTO( \ - const struct inode *inode, \ - unsigned int count \ - ), \ - TP_ARGS(inode, count)) - -DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync); -DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode); - TRACE_EVENT(nfsd_file_is_cached, TP_PROTO( const struct inode *inode, @@ -1074,7 +1181,6 @@ DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced); -DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_hashed); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed); DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class, @@ -1106,6 +1212,53 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); +TRACE_EVENT(nfsd_file_close, + TP_PROTO( + const struct inode *inode + ), + TP_ARGS(inode), + TP_STRUCT__entry( + __field(const void *, inode) + ), + TP_fast_assign( + __entry->inode = inode; + ), + TP_printk("inode=%p", + __entry->inode + ) +); + +TRACE_EVENT(nfsd_file_fsync, + TP_PROTO( + const struct nfsd_file *nf, + int ret + ), + TP_ARGS(nf, ret), + TP_STRUCT__entry( + __field(void *, nf_inode) + __field(int, nf_ref) + __field(int, ret) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->ret = ret; + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d", + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file, __entry->ret + ) +); + #include "cache.h" TRACE_DEFINE_ENUM(RC_DROPIT); @@ -1399,6 +1552,92 @@ TRACE_EVENT(nfsd_cb_offload, __entry->fh_hash, __entry->count, __entry->status) ); +TRACE_EVENT(nfsd_cb_recall_any, + TP_PROTO( + const struct nfsd4_cb_recall_any *ra + ), + TP_ARGS(ra), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, keep) + __field(unsigned long, bmval0) + __sockaddr(addr, ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = ra->ra_cb.cb_clp->cl_clientid.cl_boot; + __entry->cl_id = ra->ra_cb.cb_clp->cl_clientid.cl_id; + __entry->keep = ra->ra_keep; + __entry->bmval0 = ra->ra_bmval[0]; + __assign_sockaddr(addr, &ra->ra_cb.cb_clp->cl_addr, + ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen); + ), + TP_printk("addr=%pISpc client %08x:%08x keep=%u bmval0=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->keep, show_rca_mask(__entry->bmval0) + ) +); + +DECLARE_EVENT_CLASS(nfsd_cb_done_class, + TP_PROTO( + const stateid_t *stp, + const struct rpc_task *task + ), + TP_ARGS(stp, task), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + __field(int, status) + ), + TP_fast_assign( + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + __entry->status = task->tk_status; + ), + TP_printk("client %08x:%08x stateid %08x:%08x status=%d", + __entry->cl_boot, __entry->cl_id, __entry->si_id, + __entry->si_generation, __entry->status + ) +); + +#define 
DEFINE_NFSD_CB_DONE_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_done_class, name, \ + TP_PROTO( \ + const stateid_t *stp, \ + const struct rpc_task *task \ + ), \ + TP_ARGS(stp, task)) + +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_recall_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done); + +TRACE_EVENT(nfsd_cb_recall_any_done, + TP_PROTO( + const struct nfsd4_callback *cb, + const struct rpc_task *task + ), + TP_ARGS(cb, task), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(int, status) + ), + TP_fast_assign( + __entry->status = task->tk_status; + __entry->cl_boot = cb->cb_clp->cl_clientid.cl_boot; + __entry->cl_id = cb->cb_clp->cl_clientid.cl_id; + ), + TP_printk("client %08x:%08x status=%d", + __entry->cl_boot, __entry->cl_id, __entry->status + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fc17b0ac8729..4c3a0d84043c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,6 +49,69 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP +/** + * nfserrno - Map Linux errnos to NFS errnos + * @errno: POSIX(-ish) error code to be mapped + * + * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If + * it's an error we don't expect, log it once and return nfserr_io. + */ +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, + { nfserr_stale, -EBADF }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_mlink, -EMLINK }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, + { nfserr_dquot, -EDQUOT }, + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EREMOTEIO }, + { nfserr_stale, -EOPENSTALE }, + { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} + /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. @@ -343,8 +406,61 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, return nfserrno(get_write_access(inode)); } -/* - * Set various file attributes. After this call fhp needs an fh_put. +static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) +{ + int host_err; + + if (iap->ia_valid & ATTR_SIZE) { + /* + * RFC5661, Section 18.30.4: + * Changing the size of a file with SETATTR indirectly + * changes the time_modify and change attributes. 
+ * + * (and similar for the older RFCs) + */ + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; + + if (iap->ia_size < 0) + return -EFBIG; + + host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); + if (host_err) + return host_err; + iap->ia_valid &= ~ATTR_SIZE; + + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + return 0; + } + + if (!iap->ia_valid) + return 0; + + iap->ia_valid |= ATTR_CTIME; + return notify_change(&init_user_ns, dentry, iap, NULL); +} + +/** + * nfsd_setattr - Set various file attributes. + * @rqstp: controlling RPC transaction + * @fhp: filehandle of target + * @attr: attributes to set + * @check_guard: set to 1 if guardtime is a valid timestamp + * @guardtime: do not act if ctime.tv_sec does not match this timestamp + * + * This call may adjust the contents of @attr (in particular, this + * call may change the bits in the na_iattr.ia_valid field). + * + * Returns nfs_ok on success, otherwise an NFS status code is + * returned. Caller must release @fhp by calling fh_put in either + * case. */ __be32 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -357,9 +473,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int accmode = NFSD_MAY_SATTR; umode_t ftype = 0; __be32 err; - int host_err = 0; + int host_err; bool get_write_count; bool size_change = (iap->ia_valid & ATTR_SIZE); + int retries; if (iap->ia_valid & ATTR_SIZE) { accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -414,54 +531,24 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, } inode_lock(inode); - if (size_change) { - /* - * RFC5661, Section 18.30.4: - * Changing the size of a file with SETATTR indirectly - * changes the time_modify and change attributes. - * - * (and similar for the older RFCs) - */ - struct iattr size_attr = { - .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, - .ia_size = iap->ia_size, - }; - - host_err = -EFBIG; - if (iap->ia_size < 0) - goto out_unlock; - - host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); - if (host_err) - goto out_unlock; - iap->ia_valid &= ~ATTR_SIZE; - - /* - * Avoid the additional setattr call below if the only other - * attribute that the client sends is the mtime, as we update - * it as part of the size change above. 
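The __nfsd_setattr() split above encodes a subtle optimization: a truncate already carries ATTR_CTIME|ATTR_MTIME, so a client that asked only for size and mtime needs no second notify_change() call. A standalone sketch of that decision, with ATTR_* reduced to illustrative bit flags and notify_change() stubbed out:

#include <stdio.h>

#define ATTR_MODE  0x001
#define ATTR_SIZE  0x008
#define ATTR_MTIME 0x040
#define ATTR_CTIME 0x080   /* illustrative values, not the kernel's */

struct iattr { unsigned int ia_valid; long long ia_size; };

static int notify_change_stub(struct iattr *attr)
{
        printf("notify_change: valid=0x%x\n", attr->ia_valid);
        return 0;
}

static int do_setattr(struct iattr *iap)
{
        int err;

        if (iap->ia_valid & ATTR_SIZE) {
                /* a truncate implicitly updates ctime and mtime */
                struct iattr size_attr = {
                        .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
                        .ia_size = iap->ia_size,
                };
                if (iap->ia_size < 0)
                        return -1;
                err = notify_change_stub(&size_attr);
                if (err)
                        return err;
                iap->ia_valid &= ~ATTR_SIZE;
                /* mtime was already updated above; nothing left to do */
                if ((iap->ia_valid & ~ATTR_MTIME) == 0)
                        return 0;
        }
        if (!iap->ia_valid)
                return 0;
        iap->ia_valid |= ATTR_CTIME;    /* every explicit setattr bumps ctime */
        return notify_change_stub(iap);
}

int main(void)
{
        struct iattr a = { .ia_valid = ATTR_SIZE | ATTR_MTIME, .ia_size = 4096 };
        return do_setattr(&a);  /* only one notify_change call is made */
}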
- */ - if ((iap->ia_valid & ~ATTR_MTIME) == 0) - goto out_unlock; - } - - if (iap->ia_valid) { - iap->ia_valid |= ATTR_CTIME; - host_err = notify_change(&init_user_ns, dentry, iap, NULL); + for (retries = 1;;) { + host_err = __nfsd_setattr(dentry, iap); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, inode)) + break; } - -out_unlock: if (attr->na_seclabel && attr->na_seclabel->len) attr->na_labelerr = security_inode_setsecctx(dentry, attr->na_seclabel->data, attr->na_seclabel->len); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) attr->na_aclerr = set_posix_acl(&init_user_ns, - inode, ACL_TYPE_ACCESS, + dentry, ACL_TYPE_ACCESS, attr->na_pacl); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode)) attr->na_aclerr = set_posix_acl(&init_user_ns, - inode, ACL_TYPE_DEFAULT, + dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); inode_unlock(inode); if (size_change) @@ -572,8 +659,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src, src_pos, dst, dst_pos, - count, 0); + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, + COPY_FILE_SPLICE); return ret; } @@ -847,10 +934,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct svc_rqst *rqstp = sd->u.data; struct page *page = buf->page; // may be a compound one unsigned offset = buf->offset; + struct page *last_page; - page += offset / PAGE_SIZE; - for (int i = sd->len; i > 0; i -= PAGE_SIZE) - svc_rqst_replace_page(rqstp, page++); + last_page = page + (offset + sd->len - 1) / PAGE_SIZE; + for (page += offset / PAGE_SIZE; page <= last_page; page++) + svc_rqst_replace_page(rqstp, page); if (rqstp->rq_res.page_len == 0) // first call rqstp->rq_res.page_base = offset % PAGE_SIZE; rqstp->rq_res.page_len += sd->len; @@ -918,7 +1006,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, READ, vec, vlen, *count); + iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count); host_err = vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1008,7 +1096,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && !use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); + iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); @@ -1060,7 +1148,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err; trace_nfsd_read_start(rqstp, fhp, offset, *count); - err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); + err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; @@ -1092,7 +1180,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, trace_nfsd_write_start(rqstp, fhp, offset, *cnt); - err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf); + err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf); if (err) goto out; @@ -1108,6 +1196,7 @@ out: * nfsd_commit - Commit pending writes to stable storage * @rqstp: RPC request being processed * @fhp: NFS filehandle + * @nf: target file * @offset: raw offset from beginning of file * @count: raw count of bytes to sync * @verf: filled in with the server's 
current write verifier @@ -1124,19 +1213,13 @@ out: * An nfsstat value in network byte order. */ __be32 -nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, - u32 count, __be32 *verf) +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, + u64 offset, u32 count, __be32 *verf) { + __be32 err = nfs_ok; u64 maxbytes; loff_t start, end; struct nfsd_net *nn; - struct nfsd_file *nf; - __be32 err; - - err = nfsd_file_acquire(rqstp, fhp, - NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); - if (err) - goto out; /* * Convert the client-provided (offset, count) range to a @@ -1177,8 +1260,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, } else nfsd_copy_write_verifier(verf, nn); - nfsd_file_put(nf); -out: return err; } @@ -1255,7 +1336,7 @@ nfsd_check_ignore_resizing(struct iattr *iap) /* The parent directory should already be locked: */ __be32 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, struct nfsd_attrs *attrs, + struct nfsd_attrs *attrs, int type, dev_t rdev, struct svc_fh *resfhp) { struct dentry *dentry, *dchild; @@ -1280,7 +1361,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode &= ~current_umask(); err = 0; - host_err = 0; switch (type) { case S_IFREG: host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); @@ -1382,8 +1462,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out_unlock; fh_fill_pre_attrs(fhp); - err = nfsd_create_locked(rqstp, fhp, fname, flen, attrs, type, - rdev, resfhp); + err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); fh_fill_post_attrs(fhp); out_unlock: inode_unlock(dentry->d_inode); @@ -1673,7 +1752,15 @@ retry: .new_dir = tdir, .new_dentry = ndentry, }; - host_err = vfs_rename(&rd); + int retries; + + for (retries = 1;;) { + host_err = vfs_rename(&rd); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry))) + break; + } if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) @@ -1757,9 +1844,18 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, fh_fill_pre_attrs(fhp); if (type != S_IFDIR) { + int retries; + if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) nfsd_close_cached_files(rdentry); - host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL); + + for (retries = 1;;) { + host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, rinode)) + break; + } } else { host_err = vfs_rmdir(&init_user_ns, dirp, rdentry); } @@ -1814,7 +1910,7 @@ struct readdir_data { int full; }; -static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, +static bool nfsd_buffered_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1826,7 +1922,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); if (buf->used + reclen > PAGE_SIZE) { buf->full = 1; - return -EINVAL; + return false; } de->namlen = namlen; @@ -1836,7 +1932,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, memcpy(de->name, name, namlen); buf->used += reclen; - return 0; + return true; } static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index c95cd414b4bb..dbdfef7ae85b 100644 --- 
a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -60,6 +60,7 @@ static inline void nfsd_attrs_free(struct nfsd_attrs *attrs) posix_acl_release(attrs->na_dpacl); } +__be32 nfserrno (int errno); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, @@ -79,8 +80,8 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, u64 count, bool sync); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct nfsd_attrs *attrs, - int type, dev_t rdev, struct svc_fh *res); + struct nfsd_attrs *attrs, int type, dev_t rdev, + struct svc_fh *res); __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct nfsd_attrs *attrs, int type, dev_t rdev, struct svc_fh *res); @@ -88,7 +89,8 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct svc_fh *resfhp, struct nfsd_attrs *iap); __be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, - u64 offset, u32 count, __be32 *verf); + struct nfsd_file *nf, u64 offset, u32 count, + __be32 *verf); #ifdef CONFIG_NFSD_V4 __be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, void **bufp, int *lenp); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 96267258e629..4fd2cf6d1d2d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -717,13 +717,13 @@ struct nfsd4_compoundargs { struct svcxdr_tmpbuf *to_free; struct svc_rqst *rqstp; - u32 taglen; char * tag; + u32 taglen; u32 minorversion; + u32 client_opcnt; u32 opcnt; struct nfsd4_op *ops; struct nfsd4_op iops[8]; - int cachetype; }; struct nfsd4_compoundres { @@ -732,8 +732,8 @@ struct nfsd4_compoundres { struct svc_rqst * rqstp; __be32 *statusp; - u32 taglen; char * tag; + u32 taglen; u32 opcnt; struct nfsd4_compound_state cstate; @@ -888,12 +888,18 @@ struct nfsd4_operation { u32 op_flags; char *op_name; /* Try to get response size before operation */ - u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *); + u32 (*op_rsize_bop)(const struct svc_rqst *rqstp, + const struct nfsd4_op *op); void (*op_get_currentstateid)(struct nfsd4_compound_state *, union nfsd4_op_u *); void (*op_set_currentstateid)(struct nfsd4_compound_state *, union nfsd4_op_u *); }; +struct nfsd4_cb_recall_any { + struct nfsd4_callback ra_cb; + u32 ra_keep; + u32 ra_bmval[1]; +}; #endif diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 547cf07cf4e0..0d39af1b00a0 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -48,3 +48,9 @@ #define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define NFS4_enc_cb_recall_any_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 1 + 1) +#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 9f4d9432d38a..b9d15c3df3cc 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1668,8 +1668,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? 
nilfs_btree_node_get_key(node, nchildren - 2) : 0; - if (bh != NULL) - brelse(bh); + brelse(bh); return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } @@ -1717,8 +1716,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *btree, ptrs[i] = le64_to_cpu(dptrs[i]); } - if (bh != NULL) - brelse(bh); + brelse(bh); return nitems; } diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 3b55e239705f..9930fa901039 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -111,6 +111,13 @@ static void nilfs_dat_commit_free(struct inode *dat, kunmap_atomic(kaddr); nilfs_dat_commit_entry(dat, req); + + if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) { + nilfs_error(dat->i_sb, + "state inconsistency probably due to duplicate use of vblocknr = %llu", + (unsigned long long)req->pr_entry_nr); + return; + } nilfs_palloc_commit_free_entry(dat, req); } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 67f63cfeade5..232dd7b6cca1 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -328,6 +328,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) struct inode *inode; struct nilfs_inode_info *ii; struct nilfs_root *root; + struct buffer_head *bh; int err = -ENOMEM; ino_t ino; @@ -343,11 +344,25 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) ii->i_state = BIT(NILFS_I_NEW); ii->i_root = root; - err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); if (unlikely(err)) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + if (unlikely(ino < NILFS_USER_INO)) { + nilfs_warn(sb, + "inode bitmap is inconsistent for reserved inodes"); + do { + brelse(bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + } while (ino < NILFS_USER_INO); + + nilfs_info(sb, "repaired inode bitmap for reserved inodes"); + } + ii->i_bh = bh; + atomic64_inc(&root->inodes_count); inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; @@ -440,6 +455,8 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) + return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) return -ESTALE; /* this inode is deleted */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3267e96c256c..39b7eea2642a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -480,41 +480,36 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff) { - unsigned int i; + unsigned int i, nr_folios; pgoff_t index; - unsigned int nblocks_in_page; unsigned long length = 0; - sector_t b; - struct pagevec pvec; - struct page *page; + struct folio_batch fbatch; + struct folio *folio; if (inode->i_mapping->nrpages == 0) return 0; index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); - nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, - pvec.pages); - if (pvec.nr == 0) + nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX, + &fbatch); + if (nr_folios == 0) return length; - if (length > 0 && pvec.pages[0]->index > index) - goto out; - - b = 
pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits); i = 0; do { - page = pvec.pages[i]; + folio = fbatch.folios[i]; - lock_page(page); - if (page_has_buffers(page)) { + folio_lock(folio); + if (folio_buffers(folio)) { struct buffer_head *bh, *head; + sector_t b; - bh = head = page_buffers(page); + b = folio->index << (PAGE_SHIFT - inode->i_blkbits); + bh = head = folio_buffers(folio); do { if (b < start_blk) continue; @@ -529,21 +524,17 @@ repeat: } else { if (length > 0) goto out_locked; - - b += nblocks_in_page; } - unlock_page(page); + folio_unlock(folio); - } while (++i < pagevec_count(&pvec)); + } while (++i < nr_folios); - index = page->index + 1; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; out_locked: - unlock_page(page); -out: - pagevec_release(&pvec); + folio_unlock(folio); + folio_batch_release(&fbatch); return length; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0afe0832c754..3335ef352915 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -317,7 +317,7 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; - if (!sci || !sci->sc_flush_request) + if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request) return; set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); @@ -875,9 +875,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - } else - WARN_ON(err == -EINVAL || err == -ENOENT); - + } else if (err == -EINVAL || err == -ENOENT) { + nilfs_error(sci->sc_super, + "checkpoint creation failed due to metadata corruption."); + err = -EIO; + } return err; } @@ -891,7 +893,11 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, &raw_cp, &bh_cp); if (unlikely(err)) { - WARN_ON(err == -EINVAL || err == -ENOENT); + if (err == -EINVAL || err == -ENOENT) { + nilfs_error(sci->sc_super, + "checkpoint finalization failed due to metadata corruption."); + err = -EIO; + } goto failed_ibh; } raw_cp->cp_snapshot_list.ssl_next = 0; @@ -2235,16 +2241,14 @@ int nilfs_construct_segment(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; - int err; - if (!sci) + if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; /* A call inside transactions causes a deadlock. */ BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); - err = nilfs_segctor_sync(sci); - return err; + return nilfs_segctor_sync(sci); } /** @@ -2276,7 +2280,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, struct nilfs_transaction_info ti; int err = 0; - if (!sci) + if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; nilfs_transaction_lock(sb, &ti, 0); @@ -2772,11 +2776,12 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) if (nilfs->ns_writer) { /* - * This happens if the filesystem was remounted - * read/write after nilfs_error degenerated it into a - * read-only mount. + * This happens if the filesystem is made read-only by + * __nilfs_error or nilfs_remount and then remounted + * read/write. In these cases, reuse the existing + * writer. 
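The nilfs_find_uncommitted_extent() rewrite above is one instance of the tree-wide pagevec-to-folio_batch conversion. Its control flow reduces to a standard skeleton: grab a batch of contiguous folios, walk them under folio_lock, release the batch, repeat. A kernel-style sketch of just that skeleton (it assumes the kernel headers and will not build standalone; the per-folio work is elided):

static void walk_contig(struct address_space *mapping, pgoff_t index)
{
        struct folio_batch fbatch;
        unsigned int i, nr;

        folio_batch_init(&fbatch);
        for (;;) {
                nr = filemap_get_folios_contig(mapping, &index, ULONG_MAX,
                                               &fbatch);
                if (!nr)
                        break;          /* hit a hole or the end of file */
                for (i = 0; i < nr; i++) {
                        struct folio *folio = fbatch.folios[i];

                        folio_lock(folio);
                        /* ... per-folio work goes here ... */
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);   /* drop the batch's references */
                cond_resched();
        }
}

Note that filemap_get_folios_contig() advances the index for the caller, which is why the converted nilfs code no longer recomputes it from the last page by hand.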
*/ - nilfs_detach_log_writer(sb); + return 0; } nilfs->ns_writer = nilfs_segctor_new(sb, root); @@ -2786,10 +2791,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL); err = nilfs_segctor_start_thread(nilfs->ns_writer); - if (err) { - kfree(nilfs->ns_writer); - nilfs->ns_writer = NULL; - } + if (unlikely(err)) + nilfs_detach_log_writer(sb); + return err; } diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 77ff8e95421f..dc359b56fdfa 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; + void *kaddr; + struct nilfs_segment_usage *su; int ret; + down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (!ret) { mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr); brelse(bh); } + up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index ba108f915391..6edb6e0dd61f 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1133,8 +1133,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; if (*flags & SB_RDONLY) { - /* Shutting down log writer */ - nilfs_detach_log_writer(sb); sb->s_flags |= SB_RDONLY; /* diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 3b4a079c9617..2064e6473d30 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/random.h> +#include <linux/log2.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" @@ -193,6 +194,34 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, } /** + * nilfs_get_blocksize - get block size from raw superblock data + * @sb: super block instance + * @sbp: superblock raw data buffer + * @blocksize: place to store block size + * + * nilfs_get_blocksize() calculates the block size from the block size + * exponent information written in @sbp and stores it in @blocksize, + * or aborts with an error message if it's too large. + * + * Return Value: On success, 0 is returned. If the block size is too + * large, -EINVAL is returned. 
+ */ +static int nilfs_get_blocksize(struct super_block *sb, + struct nilfs_super_block *sbp, int *blocksize) +{ + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + + if (unlikely(shift_bits > + ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) { + nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB", + shift_bits); + return -EINVAL; + } + *blocksize = BLOCK_SIZE << shift_bits; + return 0; +} + +/** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released * @sb: super block instance used to recover past segment @@ -245,11 +274,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); /* verify consistency between two super blocks */ - blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + err = nilfs_get_blocksize(sb, sbp[0], &blocksize); + if (err) + goto scan_error; + if (blocksize != nilfs->ns_blocksize) { nilfs_warn(sb, "blocksize differs between two super blocks (%d != %d)", blocksize, nilfs->ns_blocksize); + err = -EINVAL; goto scan_error; } @@ -443,11 +476,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) return crc == le32_to_cpu(sbp->s_sum); } -static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +/** + * nilfs_sb2_bad_offset - check the location of the second superblock + * @sbp: superblock raw data buffer + * @offset: byte offset of second superblock calculated from device size + * + * nilfs_sb2_bad_offset() checks if the position on the second + * superblock is valid or not based on the filesystem parameters + * stored in @sbp. If @offset points to a location within the segment + * area, or if the parameters themselves are not normal, it is + * determined to be invalid. + * + * Return Value: true if invalid, false if valid. 
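nilfs_get_blocksize() above exists to guard a left shift by an on-disk, attacker-controllable exponent before computing BLOCK_SIZE << shift. The same check in plain C, with illustrative limits (1 KiB base block, 64 KiB ceiling; the real bounds come from the nilfs2 headers):

#include <stdio.h>

#define BLOCK_SIZE_BITS 10                      /* 1 KiB base block */
#define BLOCK_SIZE      (1 << BLOCK_SIZE_BITS)
#define MAX_BLOCK_BITS  16                      /* 64 KiB ceiling (illustrative) */

static int blocksize_from_exponent(unsigned int shift, int *blocksize)
{
        /* an untrusted exponent must not shift past the ceiling */
        if (shift > MAX_BLOCK_BITS - BLOCK_SIZE_BITS) {
                fprintf(stderr, "too large blocksize: 2 ^ %u KiB\n", shift);
                return -1;
        }
        *blocksize = BLOCK_SIZE << shift;
        return 0;
}

int main(void)
{
        int bs;

        if (blocksize_from_exponent(2, &bs) == 0)
                printf("blocksize = %d\n", bs);         /* 4096 */
        return blocksize_from_exponent(40, &bs);        /* rejected: would overflow */
}

Without the bound, a crafted superblock exponent turns the shift into undefined behavior (a shift-out-of-bounds), which is exactly the failure mode the new helper closes off.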
+ */ +static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { - return offset < ((le64_to_cpu(sbp->s_nsegments) * - le32_to_cpu(sbp->s_blocks_per_segment)) << - (le32_to_cpu(sbp->s_log_block_size) + 10)); + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + u64 nsegments = le64_to_cpu(sbp->s_nsegments); + u64 index; + + if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS || + shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS) + return true; + + index = offset >> (shift_bits + BLOCK_SIZE_BITS); + do_div(index, blocks_per_segment); + return index < nsegments; } static void nilfs_release_super_block(struct the_nilfs *nilfs) @@ -586,9 +641,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; - blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); - if (blocksize < NILFS_MIN_BLOCK_SIZE || - blocksize > NILFS_MAX_BLOCK_SIZE) { + err = nilfs_get_blocksize(sb, sbp, &blocksize); + if (err) + goto failed_sbh; + + if (blocksize < NILFS_MIN_BLOCK_SIZE) { nilfs_err(sb, "couldn't mount because of unsupported filesystem blocksize %d", blocksize); @@ -690,9 +747,7 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { unsigned long ncleansegs; - down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); - up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; return 0; } diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index cd7d09a569ff..a2a15bc4df28 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -18,7 +18,7 @@ #include "fanotify.h" -static bool fanotify_path_equal(struct path *p1, struct path *p2) +static bool fanotify_path_equal(const struct path *p1, const struct path *p2) { return p1->mnt == p2->mnt && p1->dentry == p2->dentry; } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 1d9f11255c64..57f51a9a3015 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -452,13 +452,7 @@ static inline bool fanotify_is_error_event(u32 mask) return mask & FAN_FS_ERROR; } -static inline bool fanotify_event_has_path(struct fanotify_event *event) -{ - return event->type == FANOTIFY_EVENT_TYPE_PATH || - event->type == FANOTIFY_EVENT_TYPE_PATH_PERM; -} - -static inline struct path *fanotify_event_path(struct fanotify_event *event) +static inline const struct path *fanotify_event_path(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_PATH) return &FANOTIFY_PE(event)->path; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f0e49a406ffa..4546da4a54f9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -249,7 +249,7 @@ out: return event; } -static int create_fd(struct fsnotify_group *group, struct path *path, +static int create_fd(struct fsnotify_group *group, const struct path *path, struct file **file) { int client_fd; @@ -619,7 +619,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, char __user *buf, size_t count) { struct fanotify_event_metadata metadata; - struct path *path = fanotify_event_path(event); + const struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); unsigned int 
pidfd_mode = info_mode & FAN_REPORT_PIDFD; @@ -1553,7 +1553,7 @@ static int fanotify_test_fid(struct dentry *dentry) } static int fanotify_events_supported(struct fsnotify_group *group, - struct path *path, __u64 mask, + const struct path *path, __u64 mask, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 87d8a50ee803..fde74eb333cc 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -76,10 +76,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); -/* allocate and destroy and event holder to attach events to notification/access queues */ -extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); -extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); - extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/nsfs.c b/fs/nsfs.c index 800c1d0eb0d0..3506f6074288 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -28,7 +28,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = dentry->d_fsdata; - return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + return dynamic_dname(buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); } diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 52615e6090e1..a3865bc4a0c6 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -594,17 +594,37 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { u8 *mrec_end = (u8 *)ctx->mrec + le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || - name_end > mrec_end) + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) + break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) break; + ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || a->type == AT_END)) return -ENOENT; if (unlikely(!a->length)) break; + + /* check whether ATTR_RECORD's length wrap */ + if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) + break; + /* check whether ATTR_RECORD's length is within bounds */ + if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) + break; + if (a->type != type) continue; /* diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 58b660dbbee9..c481b14e4fd9 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -527,12 +527,12 @@ err_out: goto out; } -static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) +static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) { lock_buffer(bh); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - return submit_bh(REQ_OP_READ, bh); + submit_bh(REQ_OP_READ, bh); } /** diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index db0f1995aedd..08c659332e26 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi) goto err_out; } + /* Sanity check offset to the first attribute */ + if (le16_to_cpu(m->attrs_offset) >= 
le32_to_cpu(m->bytes_allocated)) { + ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", + le16_to_cpu(m->attrs_offset)); + goto err_out; + } + /* Need this to sanity check attribute list references to $MFT. */ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 5ae8de09b271..001f4e053c85 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2092,7 +2092,8 @@ get_ctx_vol_failed: // TODO: Initialize security. /* Get the extended system files' directory inode. */ vol->extend_ino = ntfs_iget(sb, FILE_Extend); - if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) { + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || + !S_ISDIR(vol->extend_ino->i_mode)) { if (!IS_ERR(vol->extend_ino)) iput(vol->extend_ino); ntfs_error(sb, "Failed to load $Extend."); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 511e58f2b0f8..e5399ebc3a2b 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -707,7 +707,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(mnt_userns, inode, attr); if (mode != inode->i_mode) { - err = ntfs_acl_chmod(mnt_userns, inode); + err = ntfs_acl_chmod(mnt_userns, dentry); if (err) goto out; @@ -1160,7 +1160,7 @@ const struct inode_operations ntfs_file_inode_operations = { .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, }; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index e61545b9772e..c6eb371a3695 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3798,7 +3798,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) } log_init_pg_hdr(log, page_size, page_size, 1, 1); - log_create(log, l_size, 0, get_random_int(), false, false); + log_create(log, l_size, 0, get_random_u32(), false, false); log->ra = ra; @@ -3872,7 +3872,7 @@ check_restart_area: /* Do some checks based on whether we have a valid log page. 
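The ntfs_attr_find() hunk above shows the full checklist for trusting an on-disk record: the header must fit in the buffer, every offset the header carries must stay inside the buffer, and offset-plus-length arithmetic must not wrap. A freestanding sketch of the same validation against an invented record layout (index arithmetic sidesteps the pointer-wrap checks the kernel version needs; alignment handling is elided):

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

struct rec_hdr {                        /* invented on-disk record layout */
        uint32_t length;                /* total record length, header included */
        uint16_t name_offset;           /* offset of the name within the record */
        uint8_t  name_length;
};

static bool record_valid(const uint8_t *buf, size_t buf_len, size_t rec_off)
{
        const struct rec_hdr *r;
        size_t name_end;

        /* the header itself must fit in the buffer */
        if (rec_off > buf_len || buf_len - rec_off < sizeof(*r))
                return false;
        r = (const struct rec_hdr *)(buf + rec_off);

        /* the record length must cover the header and stay in bounds */
        if (r->length < sizeof(*r) || r->length > buf_len - rec_off)
                return false;

        /* the name must lie entirely inside the record */
        name_end = (size_t)r->name_offset + r->name_length;
        if (name_end > r->length)
                return false;
        return true;
}

Checking "offset exceeds remaining length" rather than "offset + length exceeds end" is the key trick: it keeps every comparison in already-validated, non-wrapping territory.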
*/ if (!rst_info.valid_page) { - open_log_count = get_random_int(); + open_log_count = get_random_u32(); goto init_log_instance; } open_log_count = le32_to_cpu(ra2->open_log_count); @@ -4023,7 +4023,7 @@ find_oldest: memcpy(ra->clients, Add2Ptr(ra2, t16), le16_to_cpu(ra2->ra_len) - t16); - log->current_openlog_count = get_random_int(); + log->current_openlog_count = get_random_u32(); ra->open_log_count = cpu_to_le32(log->current_openlog_count); log->ra_size = offsetof(struct RESTART_AREA, clients) + sizeof(struct CLIENT_REC); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 31bc94f87940..20b953871574 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -636,17 +636,9 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); set_bh_page(bh, page, off); - - lock_buffer(bh); - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(REQ_OP_READ, bh); - - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; + err = bh_read(bh, 0); + if (err < 0) goto out; - } zero_user_segment(page, off + voff, off + block_size); } } @@ -2069,8 +2061,6 @@ const struct inode_operations ntfs_link_inode_operations = { .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, - .set_acl = ntfs_set_acl, }; const struct address_space_operations ntfs_aops = { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 0e72d2067804..c8db35e2ae17 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -371,7 +371,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create -> * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock */ - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) { err = PTR_ERR(p); @@ -598,7 +598,7 @@ const struct inode_operations ntfs_dir_inode_operations = { .mknod = ntfs_mknod, .rename = ntfs_rename, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .setattr = ntfs3_setattr, .getattr = ntfs_getattr, @@ -611,7 +611,7 @@ const struct inode_operations ntfs_special_inode_operations = { .setattr = ntfs3_setattr, .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, }; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 5cefcfa52118..0e051c5595a2 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -857,7 +857,7 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); -int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, struct inode *dir); @@ -866,7 +866,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, #define ntfs_set_acl NULL #endif -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode); +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry); int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); diff --git 
a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index cfd59bb0f9de..616df209feea 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -652,71 +652,10 @@ out: /* * ntfs_set_acl - inode_operations::set_acl */ -int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); -} - -static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, void *buffer, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - acl = ntfs_get_acl(inode, type, false); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) - return -ENODATA; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return err; -} - -static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, const void *value, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - - if (!value) { - acl = NULL; - } else { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - err = posix_acl_valid(&init_user_ns, acl); - if (err) - goto release_and_out; - } - } - - err = ntfs_set_acl(mnt_userns, inode, acl, type); - -release_and_out: - posix_acl_release(acl); - return err; + return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false); } /* @@ -758,8 +697,9 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, /* * ntfs_acl_chmod - Helper for ntfs3_setattr(). */ -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; if (!(sb->s_flags & SB_POSIXACL)) @@ -768,7 +708,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - return posix_acl_chmod(mnt_userns, inode, inode->i_mode); + return posix_acl_chmod(mnt_userns, dentry, inode->i_mode); } /* @@ -884,19 +824,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) || - !strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) { - /* TODO: init_user_ns? */ - err = ntfs_xattr_get_acl( - &init_user_ns, inode, - strlen(name) == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - buffer, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, strlen(name), buffer, size, NULL); @@ -1009,18 +936,6 @@ set_new_fa: goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) || - !strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) { - err = ntfs_xattr_set_acl( - mnt_userns, inode, - strlen(name) == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - value, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. 
*/ err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0); @@ -1110,7 +1025,7 @@ static bool ntfs_xattr_user_list(struct dentry *dentry) } // clang-format off -static const struct xattr_handler ntfs_xattr_handler = { +static const struct xattr_handler ntfs_other_xattr_handler = { .prefix = "", .get = ntfs_getxattr, .set = ntfs_setxattr, @@ -1118,7 +1033,11 @@ static const struct xattr_handler ntfs_xattr_handler = { }; const struct xattr_handler *ntfs_xattr_handlers[] = { - &ntfs_xattr_handler, +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &ntfs_other_xattr_handler, NULL, }; // clang-format on diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 23a72a423955..9f19cf9a5a9f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -260,12 +260,13 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; int status, had_lock; struct ocfs2_lock_holder oh; + struct inode *inode = d_inode(dentry); had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); if (had_lock < 0) diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 95a57c888ab6..a897c4e41b26 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -17,7 +17,7 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index af4157f61927..1d65f6ef00ca 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -636,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); *wait_bh++=bh; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index b13d344d40b6..60b97c92e2b2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg) /* negotiate timeout must be less than write timeout. */ schedule_delayed_work(®->hr_nego_timeout_work, msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); - memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap)); + bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES); } static void o2hb_disarm_timeout(struct o2hb_region *reg) @@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work) if (reg->hr_last_hb_status) return; - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); /* lowest node as master node to make negotiate decision. 
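The o2hb and dlm hunks here replace memset/memcmp/memcpy on node bitmaps with bitmap_zero/bitmap_equal/bitmap_copy, which take a bit count instead of a byte count. The behavioral difference matters: memcmp also compares the unused tail bits of the final word, whereas bitmap_equal() masks them off. A userspace re-creation of that pitfall, assuming kernel-style BITS_TO_LONGS sizing:

#include <stdio.h>
#include <string.h>

#define NBITS 40        /* not a multiple of the word size: there is a tail */
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static int bitmap_equal_bits(const unsigned long *a, const unsigned long *b,
                             unsigned int nbits)
{
        unsigned int i, whole = nbits / BITS_PER_LONG, rem = nbits % BITS_PER_LONG;

        for (i = 0; i < whole; i++)
                if (a[i] != b[i])
                        return 0;
        if (rem) {
                unsigned long mask = (1UL << rem) - 1;  /* ignore tail bits */
                if ((a[whole] ^ b[whole]) & mask)
                        return 0;
        }
        return 1;
}

int main(void)
{
        unsigned long a[BITS_TO_LONGS(NBITS)] = { 0 };
        unsigned long b[BITS_TO_LONGS(NBITS)] = { 0 };

        /* set a bit beyond NBITS, in the unused tail of the array */
        b[50 / BITS_PER_LONG] |= 1UL << (50 % BITS_PER_LONG);
        printf("memcmp says %s\n", memcmp(a, b, sizeof(a)) ? "different" : "equal");
        printf("bitmap_equal says %s\n",
               bitmap_equal_bits(a, b, NBITS) ? "equal" : "different");
        return 0;
}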
*/ master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); @@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work) config_item_name(®->hr_item), reg->hr_bdev); set_bit(master_node, reg->hr_nego_node_bitmap); } - if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, - sizeof(reg->hr_nego_node_bitmap))) { + if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, + O2NM_MAX_NODES)) { /* check negotiate bitmap every second to do timeout * approve decision. */ @@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * live nodes heartbeat on it. In other words, the region has been * added to all nodes. */ - if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, - sizeof(o2hb_live_node_bitmap))) + if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + O2NM_MAX_NODES)) goto unlock; printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", @@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * If a node is not configured but is in the livemap, we still need * to read the slot so as to be able to remove it from the livemap. */ - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); i = -1; while ((i = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { @@ -1437,11 +1437,11 @@ void o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); - memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); - memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); - memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES); + bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); o2hb_dependent_users = 0; @@ -1450,23 +1450,21 @@ void o2hb_init(void) /* if we're already in a callback then we're already serialized by the sem */ static void o2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) + unsigned int bits) { - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &o2hb_live_node_bitmap, bytes); + bitmap_copy(map, o2hb_live_node_bitmap, bits); } /* * get a map of all nodes that are heartbeating in any regions */ -void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +void o2hb_fill_node_map(unsigned long *map, unsigned int bits) { /* callers want to serialize this map and callbacks so that they * can trust that they don't miss nodes coming to the party */ down_read(&o2hb_callback_sem); spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(map, bytes); + o2hb_fill_node_map_from_callback(map, bits); spin_unlock(&o2hb_live_lock); up_read(&o2hb_callback_sem); } @@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num) unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, @@ -2477,7 +2475,7 @@ int 
o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 1d4100abf6f8..8ef8c1b9eeb7 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid, void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, - unsigned bytes); + unsigned int bits); void o2hb_exit(void); void o2hb_init(void); int o2hb_check_node_heartbeating_no_sem(u8 node_num); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 7524994e3199..35c05c18de59 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len) unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; int i = -1, out = 0; - o2net_fill_node_map(map, sizeof(map)); + o2net_fill_node_map(map, O2NM_MAX_NODES); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 27fee68f860a..2f61d39e4e50 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes) return -EINVAL; read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES); read_unlock(&cluster->cl_nodes_lock); return 0; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index f660c0dbdb63..a07b24d170f2 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -900,7 +900,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { struct kvec vec = { .iov_len = len, .iov_base = data, }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len); return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } @@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, } /* Get a map of all nodes to which this node is currently connected to */ -void o2net_fill_node_map(unsigned long *map, unsigned bytes) +void o2net_fill_node_map(unsigned long *map, unsigned int bits) { struct o2net_sock_container *sc; int node, ret; - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); + bitmap_zero(map, bits); for (node = 0; node < O2NM_MAX_NODES; ++node) { if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) continue; @@ -1604,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work) sc->sc_sock = sock; /* freed by sc_kref_release */ sock->sk->sk_allocation = GFP_ATOMIC; + sock->sk->sk_use_task_frag = false; myaddr.sin_family = AF_INET; myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 81c3d65d68fe..694471fc46b8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2032,7 +2032,7 @@ struct ocfs2_empty_dir_priv { unsigned seen_other; unsigned dx_dir; }; -static int 
ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, +static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, int name_len, loff_t pos, u64 ino, unsigned type) { @@ -2052,7 +2052,7 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, */ if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { p->seen_dot = 1; - return 0; + return true; } if (name_len == 2 && !strncmp("..", name, 2) && @@ -2060,13 +2060,13 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, p->seen_dot_dot = 1; if (p->dx_dir && p->seen_dot) - return 1; + return false; - return 0; + return true; } p->seen_other = 1; - return 1; + return false; } static int ocfs2_empty_dir_dx(struct inode *inode, diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fd2022712167..20f790a47484 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err) static inline void dlm_node_iter_init(unsigned long *map, struct dlm_node_iter *iter) { - memcpy(iter->node_map, map, sizeof(iter->node_map)); + bitmap_copy(iter->node_map, map, O2NM_MAX_NODES); iter->curnode = -1; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c4eccd499db8..5c04dde99981 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm, spin_lock(&dlm->spinlock); /* For now, we restart the process if the node maps have * changed at all */ - ret = memcmp(ctxt->live_map, dlm->live_nodes_map, - sizeof(dlm->live_nodes_map)); + ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map, + O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); if (ret) @@ -1604,13 +1604,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) /* group sem locking should work for us here -- we're already * registered for heartbeat events so filling this should be * atomic wrt getting those handlers called. */ - o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); - memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); - + bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES); __dlm_set_joining_node(dlm, dlm->node_num); - spin_unlock(&dlm->spinlock); node = -1; @@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) * yes_resp_map. Copy that into our domain map and send a join * assert message to clean up everyone elses state. 
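ocfs2_empty_dir_filldir() and the other filldir callbacks in this series switch from returning int (0 = continue, non-zero = stop) to bool (true = continue, false = stop), following the updated dir_context actor contract. A minimal userspace model of that iteration contract, with invented names:

#include <stdbool.h>
#include <stdio.h>

struct dir_ctx;         /* simplified stand-in for struct dir_context */
typedef bool (*actor_t)(struct dir_ctx *, const char *name);

struct dir_ctx { actor_t actor; int seen; };

static bool count_entries(struct dir_ctx *ctx, const char *name)
{
        if (name[0] == '.')
                return true;            /* skip this entry, keep iterating */
        if (++ctx->seen >= 2)
                return false;           /* enough: tell the iterator to stop */
        return true;
}

static void iterate(struct dir_ctx *ctx, const char **names, int n)
{
        for (int i = 0; i < n; i++)
                if (!ctx->actor(ctx, names[i]))
                        break;          /* the actor asked us to stop early */
}

int main(void)
{
        const char *names[] = { ".", "..", "a", "b", "c" };
        struct dir_ctx ctx = { .actor = count_entries };

        iterate(&ctx, names, 5);
        printf("saw %d entries before stopping\n", ctx.seen);   /* 2 */
        return 0;
}

The bool convention removes the old ambiguity where different callers treated the non-zero return as either an errno or a bare stop flag.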
*/ spin_lock(&dlm->spinlock); - memcpy(dlm->domain_map, ctxt->yes_resp_map, - sizeof(ctxt->yes_resp_map)); + bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES); set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); @@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", dlm->recovery_map, &(dlm->recovery_map[0])); - memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); - memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); - memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES); + bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES); + bitmap_zero(dlm->domain_map, O2NM_MAX_NODES); dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 227da5b1b6ab..d610da8e2f24 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->type = type; INIT_HLIST_NODE(&mle->master_hash_node); INIT_LIST_HEAD(&mle->hb_events); - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); spin_lock_init(&mle->spinlock); init_waitqueue_head(&mle->wq); atomic_set(&mle->woken, 0); kref_init(&mle->mle_refs); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; mle->inuse = 0; @@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, atomic_inc(&dlm->mle_cur_count[mle->type]); /* copy off the node_map and register hb callbacks on our copy */ - memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); - memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES); + bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES); clear_bit(dlm->node_num, mle->vote_map); clear_bit(dlm->node_num, mle->node_map); @@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); - memset(res->refmap, 0, sizeof(res->refmap)); + bitmap_zero(res->refmap, O2NM_MAX_NODES); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -1036,10 +1036,10 @@ recheck: spin_lock(&mle->spinlock); m = mle->master; - map_changed = (memcmp(mle->vote_map, mle->node_map, - sizeof(mle->vote_map)) != 0); - voting_done = (memcmp(mle->vote_map, mle->response_map, - sizeof(mle->vote_map)) == 0); + map_changed = !bitmap_equal(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); + voting_done = bitmap_equal(mle->vote_map, mle->response_map, + O2NM_MAX_NODES); /* restart if we hit any errors */ if (map_changed) { @@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* now blank out everything, as if we had never * contacted anyone */ - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); /* reset the vote_map to the current node_map */ - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); + bitmap_copy(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); /* put myself into the maybe map */ if (mle->type != DLM_MLE_BLOCK) set_bit(dlm->node_num, mle->maybe_map); @@ -2094,7 +2094,7 @@ static void 
dlm_assert_master_worker(struct dlm_work_item *item, void *data) flags = item->u.am.flags; spin_lock(&dlm->spinlock); - memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); clear_bit(dlm->node_num, nodemap); @@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = 0; } - memset(iter.node_map, 0, sizeof(iter.node_map)); + bitmap_zero(iter.node_map, O2NM_MAX_NODES); set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 52ad342fec3e..50da8af988c1 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_reco_node_data *ndata; spin_lock(&dlm->spinlock); - memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES); /* nodes can only be removed (by dying) after dropping * this lock, and death will be trapped later, so this should do */ spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9c67edd215d5..5c60b6bc85bf 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && should_remove_suid(file->f_path.dentry)) { + if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. 
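ocfs2 here moves from the dentry-based should_remove_suid() to setattr_should_drop_suidgid(), which also consults the mount's user namespace. The mode logic the two helpers share can be written standalone: setuid is always stripped on write, setgid only when group-execute is set, and only regular files are affected (the kernel additionally skips the strip when the writer holds CAP_FSETID; that check is elided here):

#include <sys/stat.h>
#include <stdio.h>

/* Simplified analogue of the kernel's suid/sgid strip-on-write policy. */
static int suidgid_bits_to_drop(mode_t mode)
{
        int kill = 0;

        if (!S_ISREG(mode))
                return 0;               /* only regular files are stripped */
        if (mode & S_ISUID)
                kill = S_ISUID;         /* setuid is always dropped on write */
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
                kill |= S_ISGID;        /* setgid only if group-exec is set */
        return kill;                    /* kernel also checks CAP_FSETID here */
}

int main(void)
{
        printf("%o\n", suidgid_bits_to_drop(S_IFREG | S_ISUID | 0755)); /* 4000 */
        printf("%o\n", suidgid_bits_to_drop(S_IFREG | S_ISGID | 0710)); /* 2000 */
        printf("%o\n", suidgid_bits_to_drop(S_IFREG | S_ISGID | 0700)); /* 0 */
        return 0;
}

The setgid-without-group-exec case is deliberately left alone because that bit pattern means mandatory locking, not privilege, on Linux.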
*/ - if (should_remove_suid(dentry)) { + if (setattr_should_drop_suidgid(&init_user_ns, inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, @@ -2712,7 +2712,7 @@ const struct inode_operations ocfs2_file_iops = { .permission = ocfs2_permission, .listxattr = ocfs2_listxattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, .fileattr_get = ocfs2_fileattr_get, .fileattr_set = ocfs2_fileattr_set, @@ -2722,7 +2722,7 @@ const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index fa87d89cf754..3fb98b4569a2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -157,7 +157,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, replay_map->rm_state = REPLAY_DONE; } -static void ocfs2_free_replay_slots(struct ocfs2_super *osb) +void ocfs2_free_replay_slots(struct ocfs2_super *osb) { struct ocfs2_replay_map *replay_map = osb->replay_map; @@ -2057,7 +2057,7 @@ struct ocfs2_orphan_filldir_priv { enum ocfs2_orphan_reco_type orphan_reco_type; }; -static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, +static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, int name_len, loff_t pos, u64 ino, unsigned type) { @@ -2066,21 +2066,21 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, struct inode *iter; if (name_len == 1 && !strncmp(".", name, 1)) - return 0; + return true; if (name_len == 2 && !strncmp("..", name, 2)) - return 0; + return true; /* do not include dio entry in case of orphan scan */ if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, OCFS2_DIO_ORPHAN_PREFIX_LEN))) - return 0; + return true; /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); if (IS_ERR(iter)) - return 0; + return true; if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, OCFS2_DIO_ORPHAN_PREFIX_LEN)) @@ -2090,7 +2090,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, * happen concurrently with unlink/rename */ if (OCFS2_I(iter)->ip_next_orphan) { iput(iter); - return 0; + return true; } trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); @@ -2099,7 +2099,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, OCFS2_I(iter)->ip_next_orphan = p->head; p->head = iter; - return 0; + return true; } static int ocfs2_queue_orphans(struct ocfs2_super *osb, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 969d0aa28718..41c382f68529 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -150,6 +150,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +void ocfs2_free_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. 
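The dlm hunks above replace raw memcpy()/memset()/memcmp() on DECLARE_BITMAP() node maps with bitmap_copy()/bitmap_zero()/bitmap_equal(), which take the length in bits (O2NM_MAX_NODES) rather than in bytes. Below is a minimal userspace sketch of why the two forms are equivalent for these fixed-size maps; the bitmap_zero()/bitmap_copy() here are simplified stand-ins for the kernel helpers in <linux/bitmap.h>, and the map contents are invented for the demonstration:

#include <assert.h>
#include <string.h>

#define O2NM_MAX_NODES 255
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(nr) (((nr) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]

/* Simplified stand-ins for the kernel's bitmap helpers: both operate on
 * whole unsigned longs, so for a fixed-size map they touch exactly the
 * same bytes as the memset()/memcpy() calls they replace. */
static void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
	memset(dst, 0, BITS_TO_LONGS(nbits) * sizeof(unsigned long));
}

static void bitmap_copy(unsigned long *dst, const unsigned long *src,
			unsigned int nbits)
{
	memcpy(dst, src, BITS_TO_LONGS(nbits) * sizeof(unsigned long));
}

int main(void)
{
	DECLARE_BITMAP(domain_map, O2NM_MAX_NODES);
	DECLARE_BITMAP(node_map, O2NM_MAX_NODES);

	bitmap_zero(domain_map, O2NM_MAX_NODES);
	domain_map[0] = 0x5;	/* pretend nodes 0 and 2 are in the domain */

	/* Same effect as memcpy(node_map, domain_map, sizeof(node_map)),
	 * but the size is expressed in bits, matching the declaration. */
	bitmap_copy(node_map, domain_map, O2NM_MAX_NODES);
	assert(memcmp(node_map, domain_map, sizeof(node_map)) == 0);
	return 0;
}

The conversion is behaviour-preserving: it only moves the size bookkeeping from sizeof() arithmetic to the bit count the maps were declared with.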
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 961d1cf54388..a8fd51afb794 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -232,6 +232,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns,
 	handle_t *handle = NULL;
 	struct ocfs2_super *osb;
 	struct ocfs2_dinode *dirfe;
+	struct ocfs2_dinode *fe = NULL;
 	struct buffer_head *new_fe_bh = NULL;
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
@@ -382,6 +383,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns,
 		goto leave;
 	}
 
+	fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
 	if (S_ISDIR(mode)) {
 		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
 					    new_fe_bh, data_ac, meta_ac);
@@ -454,8 +456,11 @@ roll_back:
leave:
 	if (status < 0 && did_quota_inode)
 		dquot_free_inode(inode);
-	if (handle)
+	if (handle) {
+		if (status < 0 && fe)
+			ocfs2_set_links_count(fe, 0);
 		ocfs2_commit_trans(osb, handle);
+	}
 
 	ocfs2_inode_unlock(dir, 1);
 	if (did_block_signals)
@@ -632,18 +637,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		return status;
 	}
 
-	status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+	return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
 				    parent_fe_bh, handle, inode_ac,
 				    fe_blkno, suballoc_loc, suballoc_bit);
-	if (status < 0) {
-		u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
-		int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
-				inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
-		if (tmp)
-			mlog_errno(tmp);
-	}
-
-	return status;
 }
 
 static int ocfs2_mkdir(struct user_namespace *mnt_userns,
@@ -2028,8 +2024,11 @@ bail:
 					ocfs2_clusters_to_bytes(osb->sb, 1));
 	if (status < 0 && did_quota_inode)
 		dquot_free_inode(inode);
-	if (handle)
+	if (handle) {
+		if (status < 0 && fe)
+			ocfs2_set_links_count(fe, 0);
 		ocfs2_commit_trans(osb, handle);
+	}
 
 	ocfs2_inode_unlock(dir, 1);
 	if (did_block_signals)
@@ -2916,7 +2915,7 @@ const struct inode_operations ocfs2_dir_iops = {
 	.permission = ocfs2_permission,
 	.listxattr = ocfs2_listxattr,
 	.fiemap = ocfs2_fiemap,
-	.get_acl = ocfs2_iop_get_acl,
+	.get_inode_acl = ocfs2_iop_get_acl,
 	.set_acl = ocfs2_iop_set_acl,
 	.fileattr_get = ocfs2_fileattr_get,
 	.fileattr_set = ocfs2_fileattr_set,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740b64238312..a503c553bab2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
 	u32 nlink = le16_to_cpu(di->i_links_count);
 	u32 hi = le16_to_cpu(di->i_links_count_hi);
 
-	if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
-		nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+	nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
 
 	return nlink;
 }
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 638d875eccc7..7aebdbf5cc0a 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -527,7 +527,7 @@ struct ocfs2_extent_block
 * value -1 (0xFFFF) is OCFS2_INVALID_SLOT.  This marks a slot empty.
 */
 struct ocfs2_slot_map {
-/*00*/	__le16 sm_slots[0];
+/*00*/	DECLARE_FLEX_ARRAY(__le16, sm_slots);
 /*
 * Actual on-disk size is one block.  OCFS2_MAX_SLOTS is 255,
 * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize.
@@ -548,7 +548,7 @@ struct ocfs2_extended_slot
 * i_size.
 */
 struct ocfs2_slot_map_extended {
-/*00*/	struct ocfs2_extended_slot se_slots[0];
+/*00*/	DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots);
 /*
 * Actual size is i_size of the slot_map system file.  It should
 * match s_max_slots * sizeof(struct ocfs2_extended_slot)
@@ -727,7 +727,7 @@ struct ocfs2_dinode {
 		struct ocfs2_extent_list i_list;
 		struct ocfs2_truncate_log i_dealloc;
 		struct ocfs2_inline_data i_data;
-		__u8 i_symlink[0];
+		DECLARE_FLEX_ARRAY(__u8, i_symlink);
 	} id2;
 /* Actual on-disk size is one block */
 };
@@ -892,7 +892,7 @@ struct ocfs2_group_desc
/*30*/	struct ocfs2_block_check bg_check;	/* Error checking */
 	__le64 bg_reserved2;
/*40*/	union {
-		__u8 bg_bitmap[0];
+		DECLARE_FLEX_ARRAY(__u8, bg_bitmap);
 		struct {
 			/*
 			 * Block groups may be discontiguous when
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 1358981e80a3..623db358b1ef 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2614,7 +2614,7 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
 }
 
 /*
- * Calculate out the start and number of virtual clusters we need to to CoW.
+ * Calculate out the start and number of virtual clusters we need to CoW.
 *
 * cpos is vitual start cluster position we want to do CoW in a
 * file and write_len is the cluster length.
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 88f75f7f02d7..c973c03f6fd8 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -273,17 +273,17 @@ static int o2cb_cluster_check(void)
 	 */
#define O2CB_MAP_STABILIZE_COUNT 60
 	for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
-		o2hb_fill_node_map(hbmap, sizeof(hbmap));
+		o2hb_fill_node_map(hbmap, O2NM_MAX_NODES);
 		if (!test_bit(node_num, hbmap)) {
 			printk(KERN_ERR "o2cb: %s heartbeat has not been "
 			       "started.\n", (o2hb_global_heartbeat_active() ?
 					      "Global" : "Local"));
 			return -EINVAL;
 		}
-		o2net_fill_node_map(netmap, sizeof(netmap));
+		o2net_fill_node_map(netmap, O2NM_MAX_NODES);
 		/* Force set the current node to allow easy compare */
 		set_bit(node_num, netmap);
-		if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+		if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES))
 			return 0;
 		if (i < O2CB_MAP_STABILIZE_COUNT - 1)
 			msleep(1000);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a75e2b7d67f5..64e6ddcfe329 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -991,7 +991,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 	lc->oc_type = NO_CONTROLD;
 
 	rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
-			       DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
+			       DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
 			       &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
 	if (rc) {
 		if (rc == -EEXIST || rc == -EPROTO)
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index dd77b7aaabf5..a8d5ca98fa57 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -334,10 +334,10 @@ int ocfs2_cluster_connect(const char *stack_name,
 		goto out;
 	}
 
-	strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
+	strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
 	new_conn->cc_namelen = grouplen;
 	if (cluster_name_len)
-		strlcpy(new_conn->cc_cluster_name, cluster_name,
+		strscpy(new_conn->cc_cluster_name, cluster_name,
 			CLUSTER_NAME_MAX + 1);
 	new_conn->cc_cluster_name_len = cluster_name_len;
 	new_conn->cc_recovery_handler = recovery_handler;
@@ -669,6 +669,8 @@ static struct ctl_table_header *ocfs2_table_header;
 
 static int __init ocfs2_stack_glue_init(void)
 {
+	int ret;
+
 	strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
 
 	ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table);
@@ -678,7 +680,11 @@ static int __init ocfs2_stack_glue_init(void)
 		return -ENOMEM; /* or something. */
 	}
 
-	return ocfs2_sysfs_init();
+	ret = ocfs2_sysfs_init();
+	if (ret)
+		unregister_sysctl_table(ocfs2_table_header);
+
+	return ret;
 }
 
 static void __exit ocfs2_stack_glue_exit(void)
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 5805a03d100b..9c74eace3adc 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -106,7 +106,7 @@ int ocfs2_claim_clusters(handle_t *handle,
 			 u32 *cluster_start,
 			 u32 *num_clusters);
 /*
- * Use this variant of ocfs2_claim_clusters to specify a maxiumum
+ * Use this variant of ocfs2_claim_clusters to specify a maximum
 * number of clusters smaller than the allocation reserved.
 */
 int __ocfs2_claim_clusters(handle_t *handle,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e2cc9eec287c..0b0e6a132101 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1159,6 +1159,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
out_dismount:
 	atomic_set(&osb->vol_state, VOLUME_DISABLED);
 	wake_up(&osb->osb_mount_event);
+	ocfs2_free_replay_slots(osb);
 	ocfs2_dismount_volume(sb, 1);
 	goto out;
 
@@ -1764,9 +1765,7 @@ static int ocfs2_get_sector(struct super_block *sb,
 	if (!buffer_dirty(*bh))
 		clear_buffer_uptodate(*bh);
 	unlock_buffer(*bh);
-	ll_rw_block(REQ_OP_READ, 1, bh);
-	wait_on_buffer(*bh);
-	if (!buffer_uptodate(*bh)) {
+	if (bh_read(*bh, 0) < 0) {
 		mlog_errno(-EIO);
 		brelse(*bh);
 		*bh = NULL;
@@ -1824,12 +1823,14 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	status = ocfs2_truncate_log_init(osb);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out_system_inodes;
+		goto out_check_volume;
 	}
 
 	ocfs2_super_unlock(osb, 1);
 	return 0;
 
+out_check_volume:
+	ocfs2_free_replay_slots(osb);
out_system_inodes:
 	if (osb->local_alloc_state == OCFS2_LA_ENABLED)
 		ocfs2_shutdown_local_alloc(osb);
@@ -2221,7 +2222,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto out_journal;
 	}
 
-	strlcpy(osb->vol_label, di->id2.i_super.s_label,
+	strscpy(osb->vol_label, di->id2.i_super.s_label,
 		OCFS2_MAX_VOL_LABEL_LEN);
 	osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
 	osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index fa7fe2393ff6..3a5b4b88a583 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -294,11 +294,6 @@ static void omfs_readahead(struct readahead_control *rac)
 	mpage_readahead(rac, omfs_get_block);
 }
 
-static int omfs_writepage(struct page *page, struct writeback_control *wbc)
-{
-	return block_write_full_page(page, omfs_get_block, wbc);
-}
-
 static int omfs_writepages(struct address_space *mapping,
 		struct writeback_control *wbc)
 {
@@ -375,10 +370,10 @@ const struct address_space_operations omfs_aops = {
 	.invalidate_folio = block_invalidate_folio,
 	.read_folio = omfs_read_folio,
 	.readahead = omfs_readahead,
-	.writepage = omfs_writepage,
 	.writepages = omfs_writepages,
 	.write_begin = omfs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = omfs_bmap,
+	.migrate_folio = buffer_migrate_folio,
 };
diff --git a/fs/open.c b/fs/open.c
index cf7e5c350a54..82c1a28b3308 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -54,7 +54,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
 	}
 
 	/* Remove suid, sgid, and file capabilities on truncate too */
-	ret = dentry_needs_remove_privs(dentry);
+	ret = dentry_needs_remove_privs(mnt_userns, dentry);
 	if (ret < 0)
 		return ret;
 	if (ret)
@@ -188,7 +188,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 	if (IS_APPEND(file_inode(f.file)))
 		goto out_putf;
 	sb_start_write(inode->i_sb);
-	error = security_path_truncate(&f.file->f_path);
+	error = security_file_truncate(f.file);
 	if (!error)
 		error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
 				    ATTR_MTIME | ATTR_CTIME, f.file);
@@ -723,10 +723,10 @@ retry_deleg:
 		return -EINVAL;
 	if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
 		return -EINVAL;
-	if (!S_ISDIR(inode->i_mode))
-		newattrs.ia_valid |=
-			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
 	inode_lock(inode);
+	if (!S_ISDIR(inode->i_mode))
+		newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
+				     setattr_should_drop_sgid(mnt_userns, inode);
 	/* Continue to send actual fs values, not the mount values. */
 	error = security_path_chown(
 		path,
@@ -842,7 +842,9 @@ static int do_dentry_open(struct file *f,
 		return 0;
 	}
 
-	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+		i_readcount_inc(inode);
+	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
 		error = get_write_access(inode);
 		if (unlikely(error))
 			goto cleanup_file;
@@ -882,8 +884,6 @@ static int do_dentry_open(struct file *f,
 		goto cleanup_all;
 	}
 	f->f_mode |= FMODE_OPENED;
-	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
-		i_readcount_inc(inode);
 	if ((f->f_mode & FMODE_READ) &&
 	     likely(f->f_op->read || f->f_op->read_iter))
 		f->f_mode |= FMODE_CAN_READ;
@@ -937,10 +937,7 @@ cleanup_all:
 		if (WARN_ON_ONCE(error > 0))
 			error = -EINVAL;
 		fops_put(f->f_op);
-		if (f->f_mode & FMODE_WRITER) {
-			put_write_access(inode);
-			__mnt_drop_write(f->f_path.mnt);
-		}
+		put_file_access(f);
cleanup_file:
 	path_put(&f->f_path);
 	f->f_path.mnt = NULL;
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 605e5a3506ec..c5da2091cefb 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -64,8 +64,7 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu)
 	return acl;
 }
 
-static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl,
-			      int type)
+int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	int error = 0;
 	void *value = NULL;
@@ -119,12 +118,13 @@ out:
 	return error;
 }
 
-int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
 		     struct posix_acl *acl, int type)
 {
 	int error;
 	struct iattr iattr;
 	int rc;
+	struct inode *inode = d_inode(dentry);
 
 	memset(&iattr, 0, sizeof iattr);
@@ -153,46 +153,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 	rc = __orangefs_set_acl(inode, acl, type);
 
 	if (!rc && (iattr.ia_valid == ATTR_MODE))
-		rc = __orangefs_setattr(inode, &iattr);
+		rc = __orangefs_setattr_mode(dentry, &iattr);
 
 	return rc;
 }
-
-int orangefs_init_acl(struct inode *inode, struct inode *dir)
-{
-	struct posix_acl *default_acl, *acl;
-	umode_t mode = inode->i_mode;
-	struct iattr iattr;
-	int error = 0;
-
-	error = posix_acl_create(dir, &mode, &default_acl, &acl);
-	if (error)
-		return error;
-
-	if (default_acl) {
-		error = __orangefs_set_acl(inode, default_acl,
-					   ACL_TYPE_DEFAULT);
-		posix_acl_release(default_acl);
-	} else {
-		inode->i_default_acl = NULL;
-	}
-
-	if (acl) {
-		if (!error)
-			error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
-		posix_acl_release(acl);
-	} else {
-		inode->i_acl = NULL;
-	}
-
-	/* If mode of the inode was changed, then do a forcible ->setattr */
-	if (mode != inode->i_mode) {
-		memset(&iattr, 0, sizeof iattr);
-		inode->i_mode = mode;
-		iattr.ia_mode = mode;
-		iattr.ia_valid |= ATTR_MODE;
-		__orangefs_setattr(inode, &iattr);
-	}
-
-	return error;
-}
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index e2c2699d8016..9cacce5d55c1 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -398,7 +398,7 @@ static int orangefs_dir_release(struct inode *inode, struct file *file)
 const struct file_operations orangefs_dir_operations = {
 	.llseek = orangefs_dir_llseek,
 	.read = generic_read_dir,
-	.iterate = orangefs_dir_iterate,
+	.iterate_shared = orangefs_dir_iterate,
 	.open = orangefs_dir_open,
 	.release = orangefs_dir_release
 };
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 86810e5d7914..167fa43b24f9 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -273,7 +273,6 @@ out:
 		gossip_debug(GOSSIP_FILE_DEBUG,
 			     "%s(%pU): PUT buffer_index %d\n",
 			     __func__, handle, buffer_index);
-		buffer_index = -1;
 	}
 	op_release(new_op);
 	return ret;
@@ -417,9 +416,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
 	 * readahead cache (if any); this forces an expensive refresh of
 	 * data for the next caller of mmap (or 'get_block' accesses)
 	 */
-	if (file_inode(file) &&
-	    file_inode(file)->i_mapping &&
-	    mapping_nrpages(&file_inode(file)->i_data)) {
+	if (mapping_nrpages(file->f_mapping)) {
 		if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
 			gossip_debug(GOSSIP_INODE_DEBUG,
 				     "calling flush_racache on %pU\n",
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 7a8c0c6e698d..4df560894386 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -53,7 +53,7 @@ static int orangefs_writepage_locked(struct page *page,
 	bv.bv_len = wlen;
 	bv.bv_offset = off % PAGE_SIZE;
 	WARN_ON(wlen == 0);
-	iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);
+	iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen);
 
 	ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
 	    len, wr, NULL, NULL);
@@ -112,7 +112,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
 		else
 			ow->bv[i].bv_offset = 0;
 	}
-	iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
+	iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len);
 
 	WARN_ON(ow->off >= len);
 	if (ow->off + ow->len > len)
@@ -270,7 +270,7 @@ static void orangefs_readahead(struct readahead_control *rac)
 	offset = readahead_pos(rac);
 	i_pages = &rac->mapping->i_pages;
 
-	iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
+	iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac));
 
 	/* read in the pages. */
 	if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
@@ -303,7 +303,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
 	bv.bv_page = &folio->page;
 	bv.bv_len = folio_size(folio);
 	bv.bv_offset = 0;
-	iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio));
+	iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio));
 
 	ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
 	    folio_size(folio), inode->i_size, NULL, NULL, file);
@@ -530,7 +530,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
 	size_t count = iov_iter_count(iter);
 	ssize_t total_count = 0;
 	ssize_t ret = -EINVAL;
-	int i = 0;
 
 	gossip_debug(GOSSIP_FILE_DEBUG,
 		"%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
@@ -556,7 +555,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
 	while (iov_iter_count(iter)) {
 		size_t each_count = iov_iter_count(iter);
 		size_t amt_complete;
-		i++;
 
 		/* how much to transfer in this loop iteration */
 		if (each_count > orangefs_bufmap_size_query())
@@ -828,15 +826,23 @@ again:
 	spin_unlock(&inode->i_lock);
 	mark_inode_dirty(inode);
 
-	if (iattr->ia_valid & ATTR_MODE)
-		/* change mod on a file that has ACLs */
-		ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
-
 	ret = 0;
out:
 	return ret;
 }
 
+int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr)
+{
+	int ret;
+	struct inode *inode = d_inode(dentry);
+
+	ret = __orangefs_setattr(inode, iattr);
+	/* change mode on a file that has ACLs */
+	if (!ret && (iattr->ia_valid & ATTR_MODE))
+		ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
+
+	return ret;
+}
+
 /*
 * Change attributes of an object referenced by dentry.
 */
@@ -849,7 +855,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	ret = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (ret)
 		goto out;
-	ret = __orangefs_setattr(d_inode(dentry), iattr);
+	ret = __orangefs_setattr_mode(dentry, iattr);
 	sync_inode_metadata(d_inode(dentry), 1);
out:
 	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
@@ -967,7 +973,7 @@ static int orangefs_fileattr_set(struct user_namespace *mnt_userns,
 
 /* ORANGEFS2 implementation of VFS inode operations for files */
 static const struct inode_operations orangefs_file_inode_operations = {
-	.get_acl = orangefs_get_acl,
+	.get_inode_acl = orangefs_get_acl,
 	.set_acl = orangefs_set_acl,
 	.setattr = orangefs_setattr,
 	.getattr = orangefs_getattr,
@@ -1097,8 +1103,9 @@ struct inode *orangefs_iget(struct super_block *sb,
 * Allocate an inode for a newly created file and insert it into the inode hash.
 */
 struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
-		int mode, dev_t dev, struct orangefs_object_kref *ref)
+		umode_t mode, dev_t dev, struct orangefs_object_kref *ref)
 {
+	struct posix_acl *acl = NULL, *default_acl = NULL;
 	unsigned long hash = orangefs_handle_hash(ref);
 	struct inode *inode;
 	int error;
@@ -1115,6 +1122,10 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error)
+		goto out_iput;
+
 	orangefs_set_inode(inode, ref);
 	inode->i_ino = hash;	/* needed for stat etc */
 
@@ -1125,6 +1136,19 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
 	orangefs_init_iops(inode);
 	inode->i_rdev = dev;
 
+	if (default_acl) {
+		error = __orangefs_set_acl(inode, default_acl,
+					   ACL_TYPE_DEFAULT);
+		if (error)
+			goto out_iput;
+	}
+
+	if (acl) {
+		error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		if (error)
+			goto out_iput;
+	}
+
 	error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
 	if (error < 0)
 		goto out_iput;
@@ -1132,10 +1156,22 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
 	gossip_debug(GOSSIP_INODE_DEBUG,
 		     "Initializing ACL's for inode %pU\n",
 		     get_khandle_from_ino(inode));
-	orangefs_init_acl(inode, dir);
+	if (mode != inode->i_mode) {
+		struct iattr iattr = {
+			.ia_mode = mode,
+			.ia_valid = ATTR_MODE,
+		};
+		inode->i_mode = mode;
+		__orangefs_setattr(inode, &iattr);
+		__posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+	}
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
 	return inode;
 
out_iput:
 	iput(inode);
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
 	return ERR_PTR(error);
 }
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 600e8eee541f..75c1a3dcf68c 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -430,7 +430,7 @@ static int orangefs_rename(struct user_namespace *mnt_userns,
 /* ORANGEFS implementation of VFS inode operations for directories */
 const struct inode_operations orangefs_dir_inode_operations = {
 	.lookup = orangefs_lookup,
-	.get_acl = orangefs_get_acl,
+	.get_inode_acl = orangefs_get_acl,
 	.set_acl = orangefs_set_acl,
 	.create = orangefs_create,
 	.unlink = orangefs_unlink,
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 29eaa4544372..1b508f543384 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -194,15 +194,10 @@ void orangefs_debugfs_init(int debug_mask)
 */
 static void orangefs_kernel_debug_init(void)
 {
-	int rc = -ENOMEM;
-	char *k_buffer = NULL;
+	static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
 
 	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
 
-	k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
-	if (!k_buffer)
-		goto out;
-
 	if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
 		strcpy(k_buffer, kernel_debug_string);
 		strcat(k_buffer, "\n");
@@ -213,15 +208,14 @@ static void orangefs_kernel_debug_init(void)
 
 	debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer,
 			    &kernel_debug_fops);
-
-out:
-	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
 }
 
 
 void orangefs_debugfs_cleanup(void)
 {
 	debugfs_remove_recursive(debug_dir);
+	kfree(debug_help_string);
+	debug_help_string = NULL;
 }
 
 /* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
@@ -297,18 +291,13 @@ static int help_show(struct seq_file *m, void *v)
 /*
 * initialize the client-debug file.
 */
-static int orangefs_client_debug_init(void)
+static void orangefs_client_debug_init(void)
 {
-	int rc = -ENOMEM;
-	char *c_buffer = NULL;
+	static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
 
 	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
 
-	c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
-	if (!c_buffer)
-		goto out;
-
 	if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
 		strcpy(c_buffer, client_debug_string);
 		strcat(c_buffer, "\n");
@@ -322,13 +311,6 @@ static void orangefs_client_debug_init(void)
 					  debug_dir,
 					  c_buffer,
 					  &kernel_debug_fops);
-
-	rc = 0;
-
-out:
-
-	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
-	return rc;
 }
 
 /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
@@ -671,6 +653,7 @@ int orangefs_prepare_debugfs_help_string(int at_boot)
 		memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE);
 		strlcat(debug_help_string, new, string_size);
 		mutex_unlock(&orangefs_help_file_lock);
+		kfree(new);
 	}
 
 	rc = 0;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index b5940ec1836a..6e0cc01b3a14 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -103,13 +103,13 @@ enum orangefs_vfs_op_states {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif
 
-extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
 extern const struct xattr_handler *orangefs_xattr_handlers[];
 
 extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
 extern int orangefs_set_acl(struct user_namespace *mnt_userns,
-			    struct inode *inode, struct posix_acl *acl,
+			    struct dentry *dentry, struct posix_acl *acl,
 			    int type);
+int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 
 /*
 * orangefs data structures
@@ -356,11 +356,12 @@ void fsid_key_table_finalize(void);
 vm_fault_t orangefs_page_mkwrite(struct vm_fault *);
 
 struct inode *orangefs_new_inode(struct super_block *sb,
				 struct inode *dir,
-				 int mode,
+				 umode_t mode,
				 dev_t dev,
				 struct orangefs_object_kref *ref);
 
 int __orangefs_setattr(struct inode *, struct iattr *);
+int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr);
 int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 
 int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index cd7297815f91..5ab741c60b7e 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -141,7 +141,7 @@ static int __init orangefs_init(void)
 		gossip_err("%s: could not initialize device subsystem %d!\n",
 			   __func__,
 			   ret);
-		goto cleanup_device;
+		goto cleanup_sysfs;
 	}
 
 	ret = register_filesystem(&orangefs_fs_type);
@@ -152,11 +152,11 @@ static int __init orangefs_init(void)
 		goto out;
 	}
 
-	orangefs_sysfs_exit();
-
-cleanup_device:
 	orangefs_dev_cleanup();
 
+cleanup_sysfs:
+	orangefs_sysfs_exit();
+
sysfs_init_failed:
 	orangefs_debugfs_cleanup();
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index de80b62553bb..be4ba03a01a0 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -896,9 +896,18 @@ static struct attribute *orangefs_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(orangefs_default);
 
+static struct kobject *orangefs_obj;
+
+static void orangefs_obj_release(struct kobject *kobj)
+{
+	kfree(orangefs_obj);
+	orangefs_obj = NULL;
+}
+
 static struct kobj_type orangefs_ktype = {
 	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_groups = orangefs_default_groups,
+	.release = orangefs_obj_release,
 };
 
 static struct orangefs_attribute acache_hard_limit_attribute =
@@ -934,9 +943,18 @@ static struct attribute *acache_orangefs_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(acache_orangefs_default);
 
+static struct kobject *acache_orangefs_obj;
+
+static void acache_orangefs_obj_release(struct kobject *kobj)
+{
+	kfree(acache_orangefs_obj);
+	acache_orangefs_obj = NULL;
+}
+
 static struct kobj_type acache_orangefs_ktype = {
 	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_groups = acache_orangefs_default_groups,
+	.release = acache_orangefs_obj_release,
 };
 
 static struct orangefs_attribute capcache_hard_limit_attribute =
@@ -972,9 +990,18 @@ static struct attribute *capcache_orangefs_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(capcache_orangefs_default);
 
+static struct kobject *capcache_orangefs_obj;
+
+static void capcache_orangefs_obj_release(struct kobject *kobj)
+{
+	kfree(capcache_orangefs_obj);
+	capcache_orangefs_obj = NULL;
+}
+
 static struct kobj_type capcache_orangefs_ktype = {
 	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_groups = capcache_orangefs_default_groups,
+	.release = capcache_orangefs_obj_release,
 };
 
 static struct orangefs_attribute ccache_hard_limit_attribute =
@@ -1010,9 +1037,18 @@ static struct attribute *ccache_orangefs_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(ccache_orangefs_default);
 
+static struct kobject *ccache_orangefs_obj;
+
+static void ccache_orangefs_obj_release(struct kobject *kobj)
+{
+	kfree(ccache_orangefs_obj);
+	ccache_orangefs_obj = NULL;
+}
+
 static struct kobj_type ccache_orangefs_ktype = {
 	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_groups = ccache_orangefs_default_groups,
+	.release = ccache_orangefs_obj_release,
 };
 
 static struct orangefs_attribute ncache_hard_limit_attribute =
@@ -1048,9 +1084,18 @@ static struct attribute *ncache_orangefs_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(ncache_orangefs_default);
 
+static struct kobject *ncache_orangefs_obj;
+
+static void ncache_orangefs_obj_release(struct kobject *kobj)
+{
+	kfree(ncache_orangefs_obj);
+	ncache_orangefs_obj = NULL;
+}
+
 static struct kobj_type ncache_orangefs_ktype = {
 	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_groups = ncache_orangefs_default_groups,
+	.release = ncache_orangefs_obj_release,
 };
 
 static struct orangefs_attribute pc_acache_attribute =
@@ -1079,9 +1124,18 @@ static struct attribute *pc_orangefs_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(pc_orangefs_default);
 
+static struct kobject *pc_orangefs_obj;
+
+static void pc_orangefs_obj_release(struct kobject *kobj)
+{
+	kfree(pc_orangefs_obj);
+	pc_orangefs_obj = NULL;
+}
+
 static struct kobj_type pc_orangefs_ktype = {
 	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_groups = pc_orangefs_default_groups,
+	.release = pc_orangefs_obj_release,
 };
 
 static struct orangefs_attribute stats_reads_attribute =
@@ -1103,19 +1157,20 @@ static struct attribute *stats_orangefs_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(stats_orangefs_default);
 
+static struct kobject *stats_orangefs_obj;
+
+static void stats_orangefs_obj_release(struct kobject *kobj)
+{
+	kfree(stats_orangefs_obj);
+	stats_orangefs_obj = NULL;
+}
+
 static struct kobj_type stats_orangefs_ktype = {
 	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_groups = stats_orangefs_default_groups,
+	.release = stats_orangefs_obj_release,
 };
 
-static struct kobject *orangefs_obj;
-static struct kobject *acache_orangefs_obj;
-static struct kobject *capcache_orangefs_obj;
-static struct kobject *ccache_orangefs_obj;
-static struct kobject *ncache_orangefs_obj;
-static struct kobject *pc_orangefs_obj;
-static struct kobject *stats_orangefs_obj;
-
 int orangefs_sysfs_init(void)
 {
 	int rc = -EINVAL;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index dd188c7996b3..6708e54b0e30 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -96,7 +96,7 @@ config OVERLAY_FS_XINO_AUTO
	depends on 64BIT
	help
	  If this config option is enabled then overlay filesystems will use
-	  unused high bits in undelying filesystem inode numbers to map all
+	  unused high bits in underlying filesystem inode numbers to map all
	  inodes to a unified address space.  The mapped 64bit inode numbers
	  might not be compatible with applications that expect 32bit inodes.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index fdde6c56cc3d..6e4e65ee050d 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -44,7 +44,36 @@ static bool ovl_must_copy_xattr(const char *name)
 	       !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
 }
 
-int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *new)
+static int ovl_copy_acl(struct ovl_fs *ofs, const struct path *path,
+			struct dentry *dentry, const char *acl_name)
+{
+	int err;
+	struct posix_acl *clone, *real_acl = NULL;
+
+	real_acl = ovl_get_acl_path(path, acl_name, false);
+	if (!real_acl)
+		return 0;
+
+	if (IS_ERR(real_acl)) {
+		err = PTR_ERR(real_acl);
+		if (err == -ENODATA || err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+
+	clone = posix_acl_clone(real_acl, GFP_KERNEL);
+	posix_acl_release(real_acl); /* release original acl */
+	if (!clone)
+		return -ENOMEM;
+
+	err = ovl_do_set_acl(ofs, dentry, acl_name, clone);
+
+	/* release cloned acl */
+	posix_acl_release(clone);
+	return err;
+}
+
+int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new)
 {
 	struct dentry *old = oldpath->dentry;
 	ssize_t list_size, size, value_size = 0;
@@ -93,6 +122,15 @@ int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *
 			error = 0;
 			continue; /* Discard */
 		}
+
+		if (is_posix_acl_xattr(name)) {
+			error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name);
+			if (!error)
+				continue;
+			/* POSIX ACLs must be copied. */
+			break;
+		}
+
retry:
 		size = ovl_do_getxattr(oldpath, name, value, value_size);
 		if (size == -ERANGE)
@@ -132,8 +170,8 @@ out:
 	return error;
 }
 
-static int ovl_copy_fileattr(struct inode *inode, struct path *old,
-			     struct path *new)
+static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
+			     const struct path *new)
 {
 	struct fileattr oldfa = { .flags_valid = true };
 	struct fileattr newfa = { .flags_valid = true };
@@ -193,11 +231,11 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
 	return ovl_real_fileattr_set(new, &newfa);
 }
 
-static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
-			    struct path *new, loff_t len)
+static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
+			    struct file *new_file, loff_t len)
 {
+	struct path datapath;
 	struct file *old_file;
-	struct file *new_file;
 	loff_t old_pos = 0;
 	loff_t new_pos = 0;
 	loff_t cloned;
@@ -206,23 +244,18 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
	bool skip_hole = false;
 	int error = 0;
 
-	if (len == 0)
-		return 0;
+	ovl_path_lowerdata(dentry, &datapath);
+	if (WARN_ON(datapath.dentry == NULL))
+		return -EIO;
 
-	old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
+	old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY);
 	if (IS_ERR(old_file))
 		return PTR_ERR(old_file);
 
-	new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
-	if (IS_ERR(new_file)) {
-		error = PTR_ERR(new_file);
-		goto out_fput;
-	}
-
 	/* Try to use clone_file_range to clone up within the same fs */
 	cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
 	if (cloned == len)
-		goto out;
+		goto out_fput;
 	/* Couldn't clone, so now we try to copy the data */
 
 	/* Check if lower fs supports seek operation */
@@ -282,10 +315,8 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
 		len -= bytes;
 	}
-out:
 	if (!error && ovl_should_sync(ofs))
 		error = vfs_fsync(new_file, 0);
-	fput(new_file);
out_fput:
 	fput(old_file);
 	return error;
@@ -556,30 +587,31 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
 	return err;
 }
 
-static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
+static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp)
 {
 	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
-	struct inode *inode = d_inode(c->dentry);
-	struct path upperpath, datapath;
+	struct file *new_file;
 	int err;
 
-	ovl_path_upper(c->dentry, &upperpath);
-	if (WARN_ON(upperpath.dentry != NULL))
-		return -EIO;
+	if (!S_ISREG(c->stat.mode) || c->metacopy || !c->stat.size)
+		return 0;
 
-	upperpath.dentry = temp;
+	new_file = ovl_path_open(temp, O_LARGEFILE | O_WRONLY);
+	if (IS_ERR(new_file))
+		return PTR_ERR(new_file);
 
-	/*
-	 * Copy up data first and then xattrs. Writing data after
-	 * xattrs will remove security.capability xattr automatically.
-	 */
-	if (S_ISREG(c->stat.mode) && !c->metacopy) {
-		ovl_path_lowerdata(c->dentry, &datapath);
-		err = ovl_copy_up_data(ofs, &datapath, &upperpath,
-				       c->stat.size);
-		if (err)
-			return err;
-	}
+	err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size);
+	fput(new_file);
+
+	return err;
+}
+
+static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
+{
+	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+	struct inode *inode = d_inode(c->dentry);
+	struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = temp };
+	int err;
 
 	err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp);
 	if (err)
@@ -662,6 +694,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
 	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
 	struct inode *inode;
 	struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
+	struct path path = { .mnt = ovl_upper_mnt(ofs) };
 	struct dentry *temp, *upper;
 	struct ovl_cu_creds cc;
 	int err;
@@ -688,7 +721,16 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
 	if (IS_ERR(temp))
 		goto unlock;
 
-	err = ovl_copy_up_inode(c, temp);
+	/*
+	 * Copy up data first and then xattrs. Writing data after
+	 * xattrs will remove security.capability xattr automatically.
+	 */
+	path.dentry = temp;
+	err = ovl_copy_up_data(c, &path);
+	if (err)
+		goto cleanup;
+
+	err = ovl_copy_up_metadata(c, temp);
 	if (err)
 		goto cleanup;
 
@@ -732,6 +774,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
 	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
 	struct inode *udir = d_inode(c->destdir);
 	struct dentry *temp, *upper;
+	struct file *tmpfile;
 	struct ovl_cu_creds cc;
 	int err;
 
@@ -739,15 +782,22 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
 	if (err)
 		return err;
 
-	temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
+	tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
 	ovl_revert_cu_creds(&cc);
 
-	if (IS_ERR(temp))
-		return PTR_ERR(temp);
+	if (IS_ERR(tmpfile))
+		return PTR_ERR(tmpfile);
+
+	temp = tmpfile->f_path.dentry;
+	if (!c->metacopy && c->stat.size) {
+		err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size);
+		if (err)
+			return err;
+	}
 
-	err = ovl_copy_up_inode(c, temp);
+	err = ovl_copy_up_metadata(c, temp);
 	if (err)
-		goto out_dput;
+		goto out_fput;
 
 	inode_lock_nested(udir, I_MUTEX_PARENT);
 
@@ -761,16 +811,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
 	inode_unlock(udir);
 
 	if (err)
-		goto out_dput;
+		goto out_fput;
 
 	if (!c->metacopy)
 		ovl_set_upperdata(d_inode(c->dentry));
-	ovl_inode_update(d_inode(c->dentry), temp);
+	ovl_inode_update(d_inode(c->dentry), dget(temp));
 
-	return 0;
-
-out_dput:
-	dput(temp);
+out_fput:
+	fput(tmpfile);
 	return err;
 }
 
@@ -872,7 +920,7 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
 	return true;
 }
 
-static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value)
+static ssize_t ovl_getxattr_value(const struct path *path, char *name, char **value)
 {
 	ssize_t res;
 	char *buf;
@@ -899,7 +947,7 @@ static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value)
 static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
 {
 	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
-	struct path upperpath, datapath;
+	struct path upperpath;
 	int err;
 	char *capability = NULL;
 	ssize_t cap_size;
@@ -908,10 +956,6 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
 	if (WARN_ON(upperpath.dentry == NULL))
 		return -EIO;
 
-	ovl_path_lowerdata(c->dentry, &datapath);
-	if (WARN_ON(datapath.dentry == NULL))
-		return -EIO;
-
 	if (c->stat.size) {
 		err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS,
 						    &capability);
@@ -919,7 +963,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
 			goto out;
 	}
 
-	err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size);
+	err = ovl_copy_up_data(c, &upperpath);
 	if (err)
 		goto out_free;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6b03457f72bb..f61e37f4c8ff 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -435,28 +435,12 @@ out:
 }
 
 static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry,
-			     const char *name, const struct posix_acl *acl)
+			     const char *acl_name, struct posix_acl *acl)
 {
-	void *buffer;
-	size_t size;
-	int err;
-
 	if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
 		return 0;
 
-	size = posix_acl_xattr_size(acl->a_count);
-	buffer = kmalloc(size, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	if (err < 0)
-		goto out_free;
-
-	err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE);
-out_free:
-	kfree(buffer);
-	return err;
+	return ovl_do_set_acl(ofs, upperdentry, acl_name, acl);
 }
 
 static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
@@ -592,28 +576,42 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
 		goto out_revert_creds;
 	}
 
-	err = -ENOMEM;
-	override_cred = prepare_creds();
-	if (override_cred) {
+	if (!attr->hardlink) {
+		err = -ENOMEM;
+		override_cred = prepare_creds();
+		if (!override_cred)
+			goto out_revert_creds;
+		/*
+		 * In the creation cases(create, mkdir, mknod, symlink),
+		 * ovl should transfer current's fs{u,g}id to underlying
+		 * fs. Because underlying fs want to initialize its new
+		 * inode owner using current's fs{u,g}id. And in this
+		 * case, the @inode is a new inode that is initialized
+		 * in inode_init_owner() to current's fs{u,g}id. So use
+		 * the inode's i_{u,g}id to override the cred's fs{u,g}id.
+		 *
+		 * But in the other hardlink case, ovl_link() does not
+		 * create a new inode, so just use the ovl mounter's
+		 * fs{u,g}id.
+		 */
 		override_cred->fsuid = inode->i_uid;
 		override_cred->fsgid = inode->i_gid;
-		if (!attr->hardlink) {
-			err = security_dentry_create_files_as(dentry,
-					attr->mode, &dentry->d_name, old_cred,
-					override_cred);
-			if (err) {
-				put_cred(override_cred);
-				goto out_revert_creds;
-			}
+		err = security_dentry_create_files_as(dentry,
+				attr->mode, &dentry->d_name, old_cred,
+				override_cred);
+		if (err) {
+			put_cred(override_cred);
+			goto out_revert_creds;
 		}
 		put_cred(override_creds(override_cred));
 		put_cred(override_cred);
-
-		if (!ovl_dentry_is_whiteout(dentry))
-			err = ovl_create_upper(dentry, inode, attr);
-		else
-			err = ovl_create_over_whiteout(dentry, inode, attr);
 	}
+
+	if (!ovl_dentry_is_whiteout(dentry))
+		err = ovl_create_upper(dentry, inode, attr);
+	else
+		err = ovl_create_over_whiteout(dentry, inode, attr);
+
out_revert_creds:
 	revert_creds(old_cred);
 	return err;
@@ -1311,7 +1309,9 @@ const struct inode_operations ovl_dir_inode_operations = {
 	.permission = ovl_permission,
 	.getattr = ovl_getattr,
 	.listxattr = ovl_listxattr,
+	.get_inode_acl = ovl_get_inode_acl,
 	.get_acl = ovl_get_acl,
+	.set_acl = ovl_set_acl,
 	.update_time = ovl_update_time,
 	.fileattr_get = ovl_fileattr_get,
 	.fileattr_set = ovl_fileattr_set,
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index e065a5b9a442..a25bb3453dde 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -339,7 +339,7 @@ out_iput:
 	return dentry;
 }
 
-/* Get the upper or lower dentry in stach whose on layer @idx */
+/* Get the upper or lower dentry in stack whose on layer @idx */
 static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
@@ -463,7 +463,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
 
 	/* Get connected upper overlay dir from index */
 	if (index) {
-		struct dentry *upper = ovl_index_upper(ofs, index);
+		struct dentry *upper = ovl_index_upper(ofs, index, true);
 
 		dput(index);
 		if (IS_ERR_OR_NULL(upper))
@@ -739,7 +739,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
 
 	/* Then try to get a connected upper dir by index */
 	if (index && d_is_dir(index)) {
-		struct dentry *upper = ovl_index_upper(ofs, index);
+		struct dentry *upper = ovl_index_upper(ofs, index, true);
 
 		err = PTR_ERR(upper);
 		if (IS_ERR_OR_NULL(upper))
@@ -796,7 +796,7 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type)
 		return ERR_PTR(-ENOMEM);
 
 	/* Copy unaligned inner fh into aligned buffer */
-	memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET);
+	memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET);
 	return fh;
 }
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index daff601b5c41..c9d0c362c7ef 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -34,11 +34,11 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
 		return 'm';
 }
 
-/* No atime modificaton nor notify on underlying */
+/* No atime modification nor notify on underlying */
#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
 
 static struct file *ovl_open_realfile(const struct file *file,
-				      struct path *realpath)
+				      const struct path *realpath)
 {
 	struct inode *realinode = d_inode(realpath->dentry);
 	struct inode *inode = file_inode(file);
@@ -96,6 +96,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
 
 	spin_lock(&file->f_lock);
 	file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags;
+	file->f_iocb_flags = iocb_flags(file);
 	spin_unlock(&file->f_lock);
 
 	return 0;
@@ -517,9 +518,16 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
 	const struct cred *old_cred;
 	int ret;
 
+	inode_lock(inode);
+	/* Update mode */
+	ovl_copyattr(inode);
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out_unlock;
+
 	ret = ovl_real_fdget(file, &real);
 	if (ret)
-		return ret;
+		goto out_unlock;
 
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
 	ret = vfs_fallocate(real.file, mode, offset, len);
@@ -530,6 +538,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
 
 	fdput(real);
 
+out_unlock:
+	inode_unlock(inode);
+
 	return ret;
 }
 
@@ -567,14 +578,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
 	const struct cred *old_cred;
 	loff_t ret;
 
+	inode_lock(inode_out);
+	if (op != OVL_DEDUPE) {
+		/* Update mode */
+		ovl_copyattr(inode_out);
+		ret = file_remove_privs(file_out);
+		if (ret)
+			goto out_unlock;
+	}
+
 	ret = ovl_real_fdget(file_out, &real_out);
 	if (ret)
-		return ret;
+		goto out_unlock;
 
 	ret = ovl_real_fdget(file_in, &real_in);
 	if (ret) {
 		fdput(real_out);
-		return ret;
+		goto out_unlock;
 	}
 
 	old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
@@ -603,6 +623,9 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
 	fdput(real_in);
 	fdput(real_out);
 
+out_unlock:
+	inode_unlock(inode_out);
+
 	return ret;
 }
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 0fbcb590af84..ee6dfa577c93 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -14,6 +14,8 @@
#include <linux/fileattr.h>
#include <linux/security.h>
#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
 
#include "overlayfs.h"
 
@@ -460,7 +462,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
 * of the POSIX ACLs retrieved from the lower layer to this function to not
 * alter the POSIX ACLs for the underlying filesystem.
 */
-static void ovl_idmap_posix_acl(struct inode *realinode,
+static void ovl_idmap_posix_acl(const struct inode *realinode,
				struct user_namespace *mnt_userns,
				struct posix_acl *acl)
 {
@@ -485,6 +487,64 @@ static void ovl_idmap_posix_acl(struct inode *realinode,
 }
 
 /*
+ * The @noperm argument is used to skip permission checking and is a temporary
+ * measure. Quoting Miklos from an earlier discussion:
+ *
+ * > So there are two paths to getting an acl:
+ * > 1) permission checking and 2) retrieving the value via getxattr(2).
+ * > This is a similar situation as reading a symlink vs. following it.
+ * > When following a symlink overlayfs always reads the link on the
+ * > underlying fs just as if it was a readlink(2) call, calling
+ * > security_inode_readlink() instead of security_inode_follow_link().
+ * > This is logical: we are reading the link from the underlying storage,
+ * > and following it on overlayfs.
+ * >
+ * > Applying the same logic to acl: we do need to call the
+ * > security_inode_getxattr() on the underlying fs, even if just want to
+ * > check permissions on overlay. This is currently not done, which is an
+ * > inconsistency.
+ * >
+ * > Maybe adding the check to ovl_get_acl() is the right way to go, but
+ * > I'm a little afraid of a performance regression. Will look into that.
+ *
+ * Until we have made a decision allow this helper to take the @noperm
+ * argument. We should hopefully be able to remove it soon.
+ */
+struct posix_acl *ovl_get_acl_path(const struct path *path,
+				   const char *acl_name, bool noperm)
+{
+	struct posix_acl *real_acl, *clone;
+	struct user_namespace *mnt_userns;
+	struct inode *realinode = d_inode(path->dentry);
+
+	mnt_userns = mnt_user_ns(path->mnt);
+
+	if (noperm)
+		real_acl = get_inode_acl(realinode, posix_acl_type(acl_name));
+	else
+		real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name);
+	if (IS_ERR_OR_NULL(real_acl))
+		return real_acl;
+
+	if (!is_idmapped_mnt(path->mnt))
+		return real_acl;
+
+	/*
+	 * We cannot alter the ACLs returned from the relevant layer as that
+	 * would alter the cached values filesystem wide for the lower
+	 * filesystem. Instead we can clone the ACLs and then apply the
+	 * relevant idmapping of the layer.
+	 */
+	clone = posix_acl_clone(real_acl, GFP_KERNEL);
+	posix_acl_release(real_acl); /* release original acl */
+	if (!clone)
+		return ERR_PTR(-ENOMEM);
+
+	ovl_idmap_posix_acl(realinode, mnt_userns, clone);
+	return clone;
+}
+
+/*
 * When the relevant layer is an idmapped mount we need to take the idmapping
 * of the layer into account and translate any ACL_{GROUP,USER} values
 * according to the idmapped mount.
@@ -495,10 +555,12 @@ static void ovl_idmap_posix_acl(struct inode *realinode,
 *
 * This is obviously only relevant when idmapped layers are used.
 */
-struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
+struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns,
+				 struct inode *inode, int type,
+				 bool rcu, bool noperm)
 {
 	struct inode *realinode = ovl_inode_real(inode);
-	struct posix_acl *acl, *clone;
+	struct posix_acl *acl;
 	struct path realpath;
 
 	if (!IS_POSIXACL(realinode))
@@ -512,40 +574,115 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
 	}
 
 	if (rcu) {
+		/*
+		 * If the layer is idmapped drop out of RCU path walk
+		 * so we can clone the ACLs.
+		 */
+		if (is_idmapped_mnt(realpath.mnt))
+			return ERR_PTR(-ECHILD);
+
 		acl = get_cached_acl_rcu(realinode, type);
 	} else {
 		const struct cred *old_cred;
 
 		old_cred = ovl_override_creds(inode->i_sb);
-		acl = get_acl(realinode, type);
+		acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
 		revert_creds(old_cred);
 	}
-	/*
-	 * If there are no POSIX ACLs, or we encountered an error,
-	 * or the layer isn't idmapped we don't need to do anything.
-	 */
-	if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl))
-		return acl;
+
+	return acl;
+}
+
+static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
+				 struct posix_acl *acl, int type)
+{
+	int err;
+	struct path realpath;
+	const char *acl_name;
+	const struct cred *old_cred;
+	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+	struct dentry *upperdentry = ovl_dentry_upper(dentry);
+	struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+
+	err = ovl_want_write(dentry);
+	if (err)
+		return err;
 
 	/*
-	 * We only get here if the layer is idmapped. So drop out of RCU path
-	 * walk so we can clone the ACLs. There's no need to release the ACLs
-	 * since get_cached_acl_rcu() doesn't take a reference on the ACLs.
+	 * If ACL is to be removed from a lower file, check if it exists in
+	 * the first place before copying it up.
 	 */
-	if (rcu)
-		return ERR_PTR(-ECHILD);
+	acl_name = posix_acl_xattr_name(type);
+	if (!acl && !upperdentry) {
+		struct posix_acl *real_acl;
 
-	clone = posix_acl_clone(acl, GFP_KERNEL);
-	if (!clone)
-		clone = ERR_PTR(-ENOMEM);
+		ovl_path_lower(dentry, &realpath);
+		old_cred = ovl_override_creds(dentry->d_sb);
+		real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry,
+				       acl_name);
+		revert_creds(old_cred);
+		if (IS_ERR(real_acl)) {
+			err = PTR_ERR(real_acl);
+			goto out_drop_write;
+		}
+		posix_acl_release(real_acl);
+	}
+
+	if (!upperdentry) {
+		err = ovl_copy_up(dentry);
+		if (err)
+			goto out_drop_write;
+
+		realdentry = ovl_dentry_upper(dentry);
+	}
+
+	old_cred = ovl_override_creds(dentry->d_sb);
+	if (acl)
+		err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
 	else
-		ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone);
+		err = ovl_do_remove_acl(ofs, realdentry, acl_name);
+	revert_creds(old_cred);
+
+	/* copy c/mtime */
+	ovl_copyattr(inode);
+
+out_drop_write:
+	ovl_drop_write(dentry);
+	return err;
+}
+
+int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct posix_acl *acl, int type)
+{
+	int err;
+	struct inode *inode = d_inode(dentry);
+	struct dentry *workdir = ovl_workdir(dentry);
+	struct inode *realinode = ovl_inode_real(inode);
+
+	if (!IS_POSIXACL(d_inode(workdir)))
+		return -EOPNOTSUPP;
+	if (!realinode->i_op->set_acl)
+		return -EOPNOTSUPP;
+	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+		return acl ? -EACCES : 0;
+	if (!inode_owner_or_capable(&init_user_ns, inode))
+		return -EPERM;
+
 	/*
-	 * Since we're not in RCU path walk we always need to release the
-	 * original ACLs.
+	 * Check if sgid bit needs to be cleared (actual setacl operation will
+	 * be done with mounter's capabilities and so that won't do it for us).
 	 */
-	posix_acl_release(acl);
-	return clone;
+	if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS &&
+	    !in_group_p(inode->i_gid) &&
+	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
+		struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
+
+		err = ovl_setattr(&init_user_ns, dentry, &iattr);
+		if (err)
+			return err;
+	}
+
+	return ovl_set_or_remove_acl(dentry, inode, acl, type);
 }
#endif
 
@@ -588,7 +725,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 * Introducing security_inode_fileattr_get/set() hooks would solve this issue
 * properly.
 */
-static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
+static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
				 bool set)
 {
 	struct file *file;
@@ -610,7 +747,7 @@ static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
 	return err;
 }
 
-int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
 {
 	int err;
 
@@ -685,7 +822,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
 	}
 }
 
-int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
 {
 	int err;
 
@@ -721,7 +858,9 @@ static const struct inode_operations ovl_file_inode_operations = {
 	.permission = ovl_permission,
 	.getattr = ovl_getattr,
 	.listxattr = ovl_listxattr,
+	.get_inode_acl = ovl_get_inode_acl,
 	.get_acl = ovl_get_acl,
+	.set_acl = ovl_set_acl,
 	.update_time = ovl_update_time,
 	.fiemap = ovl_fiemap,
 	.fileattr_get = ovl_fileattr_get,
@@ -741,7 +880,9 @@ static const struct inode_operations ovl_special_inode_operations = {
 	.permission = ovl_permission,
 	.getattr = ovl_getattr,
 	.listxattr = ovl_listxattr,
+	.get_inode_acl = ovl_get_inode_acl,
 	.get_acl = ovl_get_acl,
+	.set_acl = ovl_set_acl,
 	.update_time = ovl_update_time,
 };
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 69dc577974f8..46753134533a 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -26,7 +26,7 @@ struct ovl_lookup_data {
	bool metacopy;
 };
 
-static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d,
+static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d,
			      size_t prelen, const char *post)
 {
 	int res;
@@ -194,7 +194,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
 	return real;
 }
 
-static bool ovl_is_opaquedir(struct ovl_fs *ofs, struct path *path)
+static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
 {
 	return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
 }
@@ -487,7 +487,8 @@ fail:
 }
 
 /* Get upper dentry from index */
-struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
+			       bool connected)
 {
 	struct ovl_fh *fh;
 	struct dentry *upper;
@@ -499,7 +500,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
 	if (IS_ERR_OR_NULL(fh))
 		return ERR_CAST(fh);
 
-	upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true);
+	upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected);
 	kfree(fh);
 
 	if (IS_ERR_OR_NULL(upper))
@@ -572,7 +573,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
	 * directly from the index dentry, but for dir index we first need to
	 * decode the upper directory.
	 */
-	upper = ovl_index_upper(ofs, index);
+	upper = ovl_index_upper(ofs, index, false);
 	if (IS_ERR_OR_NULL(upper)) {
 		err = PTR_ERR(upper);
 		/*
@@ -1085,6 +1086,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			.mnt = ovl_upper_mnt(ofs),
 		};
 
+		/*
+		 * It's safe to assign upperredirect here: the previous
+		 * assignment of happens only if upperdentry is non-NULL, and
+		 * this one only if upperdentry is NULL.
+		 */
 		upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
 		if (IS_ERR(upperredirect)) {
 			err = PTR_ERR(upperredirect);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 87759165d32b..1df7f850ff3b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -8,6 +8,8 @@
#include <linux/uuid.h>
#include <linux/fs.h>
#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
 
#include "ovl_entry.h"
 
#undef pr_fmt
@@ -108,7 +110,7 @@ struct ovl_fh {
	u8 padding[3];	/* make sure fb.fid is 32bit aligned */
	union {
		struct ovl_fb fb;
-		u8 buf[0];
+		DECLARE_FLEX_ARRAY(u8, buf);
	};
 } __packed;
 
@@ -208,7 +210,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs,
 	return err;
 }
 
-static inline ssize_t ovl_do_getxattr(struct path *path, const char *name,
+static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name,
				      void *value, size_t size)
 {
 	int err, len;
@@ -238,7 +240,7 @@ static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs,
 }
 
 static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs,
-					struct path *path,
+					const struct path *path,
					enum ovl_xattr ox, void *value,
					size_t size)
 {
@@ -250,7 +252,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
				  size_t size, int flags)
 {
 	int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name,
-			       (void *)value, size, flags);
+			       value, size, flags);
 
 	pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
 		 dentry, name, min((int)size, 48), value, size, flags, err);
@@ -278,6 +280,18 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
 	return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox));
 }
 
+static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry,
+				 const char *acl_name, struct posix_acl *acl)
+{
+	return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl);
+}
+
+static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
+				    const char *acl_name)
+{
+	return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name);
+}
+
 static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
				struct dentry *olddentry, struct inode *newdir,
				struct dentry *newdentry, unsigned int flags)
@@ -310,14 +324,16 @@ static inline int ovl_do_whiteout(struct ovl_fs *ofs,
 	return err;
 }
 
-static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs,
-					    struct dentry *dentry, umode_t mode)
+static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs,
+					  struct dentry *dentry, umode_t mode)
 {
-	struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0);
-	int err = PTR_ERR_OR_ZERO(ret);
+	struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry };
+	struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode,
+					     O_LARGEFILE | O_WRONLY, current_cred());
+	int err = PTR_ERR_OR_ZERO(file);
 
 	pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
-	return ret;
+	return file;
 }
 
 static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
@@ -399,15 +415,15 @@ const char *ovl_dentry_get_redirect(struct dentry *dentry);
 void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
 void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
 void ovl_dir_modified(struct dentry *dentry, bool impurity);
-u64 ovl_dentry_version_get(struct dentry *dentry);
+u64 ovl_inode_version_get(struct inode *inode);
 bool ovl_is_whiteout(struct dentry *dentry);
-struct file *ovl_path_open(struct path *path, int flags);
+struct 
file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path, +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox); -bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path); +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *upperdentry) @@ -430,9 +446,9 @@ bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path); +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_is_metacopy_dentry(struct dentry *dentry); -char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding); +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); int ovl_sync_status(struct ovl_fs *ofs); static inline void ovl_set_flag(unsigned long flag, struct inode *inode) @@ -523,7 +539,8 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, struct dentry *real, bool is_upper, bool set); -struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, + bool connected); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name); @@ -556,7 +573,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); void ovl_dir_cache_free(struct inode *inode); -int ovl_check_d_type_supported(struct path *realpath); +int ovl_check_d_type_supported(const struct path *realpath); int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct ovl_fs *ofs); @@ -568,9 +585,9 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs); * lower dir was removed under it and possibly before it was rotated from upper * to lower layer. 
*/ -static inline bool ovl_dir_is_real(struct dentry *dir) +static inline bool ovl_dir_is_real(struct inode *dir) { - return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); + return !ovl_test_flag(OVL_WHITEOUTS, dir); } /* inode.c */ @@ -592,9 +609,33 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, + bool rcu, bool noperm); +static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type, + bool rcu) +{ + return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true); +} +static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); +} +int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); +struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm); #else -#define ovl_get_acl NULL +#define ovl_get_inode_acl NULL +#define ovl_get_acl NULL +#define ovl_set_acl NULL +static inline struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, + bool noperm) +{ + return NULL; +} #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); @@ -673,8 +714,8 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, extern const struct file_operations ovl_file_operations; int __init ovl_aio_request_cache_init(void); void ovl_aio_request_cache_destroy(void); -int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa); -int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa); +int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); +int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); int ovl_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); @@ -683,7 +724,7 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns, int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); -int ovl_copy_xattr(struct super_block *sb, struct path *path, struct dentry *new); +int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 78f62cc1797b..8cd2b9947de1 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -170,7 +170,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, return p; } -static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, +static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { @@ -179,22 +179,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, struct ovl_cache_entry *p; if (ovl_cache_entry_find_link(name, len, &newp, &parent)) - return 0; + return true; p = ovl_cache_entry_new(rdd, name, len, ino, d_type); if (p == NULL) { rdd->err = 
-ENOMEM; - return -ENOMEM; + return false; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); - return 0; + return true; } -static int ovl_fill_lowest(struct ovl_readdir_data *rdd, +static bool ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -211,7 +211,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd, list_add_tail(&p->l_node, &rdd->middle); } - return rdd->err; + return rdd->err == 0; } void ovl_cache_free(struct list_head *list) @@ -235,22 +235,22 @@ void ovl_dir_cache_free(struct inode *inode) } } -static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode) { struct ovl_dir_cache *cache = od->cache; WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { - if (ovl_dir_cache(d_inode(dentry)) == cache) - ovl_set_dir_cache(d_inode(dentry), NULL); + if (ovl_dir_cache(inode) == cache) + ovl_set_dir_cache(inode, NULL); ovl_cache_free(&cache->entries); kfree(cache); } } -static int ovl_fill_merge(struct dir_context *ctx, const char *name, +static bool ovl_fill_merge(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -264,7 +264,7 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name, return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); } -static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd) +static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { int err; struct ovl_cache_entry *p; @@ -291,7 +291,7 @@ static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd) return err; } -static inline int ovl_dir_read(struct path *realpath, +static inline int ovl_dir_read(const struct path *realpath, struct ovl_readdir_data *rdd) { struct file *realfile; @@ -323,15 +323,15 @@ static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; - struct dentry *dentry = file->f_path.dentry; + struct inode *inode = file_inode(file); bool is_real; - if (cache && ovl_dentry_version_get(dentry) != cache->version) { - ovl_cache_put(od, dentry); + if (cache && ovl_inode_version_get(inode) != cache->version) { + ovl_cache_put(od, inode); od->cache = NULL; od->cursor = NULL; } - is_real = ovl_dir_is_real(dentry); + is_real = ovl_dir_is_real(inode); if (od->is_real != is_real) { /* is_real can only become false when dir is copied up */ if (WARN_ON(is_real)) @@ -394,9 +394,10 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; struct ovl_dir_cache *cache; + struct inode *inode = d_inode(dentry); - cache = ovl_dir_cache(d_inode(dentry)); - if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache = ovl_dir_cache(inode); + if (cache && ovl_inode_version_get(inode) == cache->version) { WARN_ON(!cache->refcount); cache->refcount++; return cache; @@ -418,8 +419,8 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) return ERR_PTR(res); } - cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(d_inode(dentry), cache); + cache->version = ovl_inode_version_get(inode); + ovl_set_dir_cache(inode, cache); return cache; } @@ -455,7 +456,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, * copy up origin, call vfs_getattr() on the overlay entry to make * sure 
that d_ino will be consistent with st_ino from stat(2). */ -static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) +static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p) { struct dentry *dir = path->dentry; @@ -528,7 +529,7 @@ fail: goto out; } -static int ovl_fill_plain(struct dir_context *ctx, const char *name, +static bool ovl_fill_plain(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -540,14 +541,14 @@ static int ovl_fill_plain(struct dir_context *ctx, const char *name, p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; - return -ENOMEM; + return false; } list_add_tail(&p->l_node, rdd->list); - return 0; + return true; } -static int ovl_dir_read_impure(struct path *path, struct list_head *list, +static int ovl_dir_read_impure(const struct path *path, struct list_head *list, struct rb_root *root) { int err; @@ -592,20 +593,21 @@ static int ovl_dir_read_impure(struct path *path, struct list_head *list, return 0; } -static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) +static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) { int res; struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; - cache = ovl_dir_cache(d_inode(dentry)); - if (cache && ovl_dentry_version_get(dentry) == cache->version) + cache = ovl_dir_cache(inode); + if (cache && ovl_inode_version_get(inode) == cache->version) return cache; /* Impure cache is not refcounted, free it here */ - ovl_dir_cache_free(d_inode(dentry)); - ovl_set_dir_cache(d_inode(dentry), NULL); + ovl_dir_cache_free(inode); + ovl_set_dir_cache(inode, NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) @@ -627,13 +629,13 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) OVL_XATTR_IMPURE); ovl_drop_write(dentry); } - ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + ovl_clear_flag(OVL_IMPURE, inode); kfree(cache); return NULL; } - cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(d_inode(dentry), cache); + cache->version = ovl_inode_version_get(inode); + ovl_set_dir_cache(inode, cache); return cache; } @@ -648,7 +650,7 @@ struct ovl_readdir_translate { bool xinowarn; }; -static int ovl_fill_real(struct dir_context *ctx, const char *name, +static bool ovl_fill_real(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -675,7 +677,7 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name, static bool ovl_is_impure_dir(struct file *file) { struct ovl_dir_file *od = file->private_data; - struct inode *dir = d_inode(file->f_path.dentry); + struct inode *dir = file_inode(file); /* * Only upper dir can be impure, but if we are in the middle of @@ -834,7 +836,7 @@ out_unlock: } static struct file *ovl_dir_open_realfile(const struct file *file, - struct path *realpath) + const struct path *realpath) { struct file *res; const struct cred *old_cred; @@ -893,7 +895,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct file *realfile; int err; - err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb)); + err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (err <= 0) return err; @@ -913,7 +915,7 @@ static int ovl_dir_release(struct inode *inode, struct file *file) if (od->cache) { inode_lock(inode); - ovl_cache_put(od, 
file->f_path.dentry); + ovl_cache_put(od, inode); inode_unlock(inode); } fput(od->realfile); @@ -942,7 +944,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return PTR_ERR(realfile); } od->realfile = realfile; - od->is_real = ovl_dir_is_real(file->f_path.dentry); + od->is_real = ovl_dir_is_real(inode); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; @@ -1027,7 +1029,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, inode_unlock(upper->d_inode); } -static int ovl_check_d_type(struct dir_context *ctx, const char *name, +static bool ovl_check_d_type(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1036,19 +1038,19 @@ static int ovl_check_d_type(struct dir_context *ctx, const char *name, /* Even if d_type is not supported, DT_DIR is returned for . and .. */ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) - return 0; + return true; if (d_type != DT_UNKNOWN) rdd->d_type_supported = true; - return 0; + return true; } /* * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values * if error is encountered. */ -int ovl_check_d_type_supported(struct path *realpath) +int ovl_check_d_type_supported(const struct path *realpath) { int err; struct ovl_readdir_data rdd = { @@ -1065,20 +1067,16 @@ int ovl_check_d_type_supported(struct path *realpath) #define OVL_INCOMPATDIR_NAME "incompat" -static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, struct path *path, +static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path, int level) { int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); - struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { - .ctx.actor = ovl_fill_merge, - .dentry = NULL, + .ctx.actor = ovl_fill_plain, .list = &list, - .root = &root, - .is_lowest = false, }; bool incompat = false; @@ -1159,14 +1157,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) struct inode *dir = indexdir->d_inode; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); - struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { - .ctx.actor = ovl_fill_merge, - .dentry = NULL, + .ctx.actor = ovl_fill_plain, .list = &list, - .root = &root, - .is_lowest = false, }; err = ovl_dir_read(&path, &rdd); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ec746d447f1b..85b891152a2c 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include <linux/exportfs.h> +#include <linux/file.h> #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); @@ -138,11 +139,16 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { struct ovl_entry *oe = dentry->d_fsdata; + struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; int ret = 1; - upper = ovl_dentry_upper(dentry); + /* Careful in RCU mode */ + if (!inode) + return -ECHILD; + + upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); @@ -812,13 +818,11 @@ retry: * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. 
*/ - err = ovl_do_removexattr(ofs, work, - XATTR_NAME_POSIX_ACL_DEFAULT); + err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; - err = ovl_do_removexattr(ofs, work, - XATTR_NAME_POSIX_ACL_ACCESS); + err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; @@ -908,7 +912,7 @@ static int ovl_mount_dir(const char *name, struct path *path) return err; } -static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, +static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, const char *name) { struct kstatfs statfs; @@ -1000,70 +1004,6 @@ static unsigned int ovl_split_lowerdirs(char *str) return ctr; } -static int __maybe_unused -ovl_posix_acl_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return ovl_xattr_get(dentry, inode, handler->name, buffer, size); -} - -static int __maybe_unused -ovl_posix_acl_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct dentry *workdir = ovl_workdir(dentry); - struct inode *realinode = ovl_inode_real(inode); - struct posix_acl *acl = NULL; - int err; - - /* Check that everything is OK before copy-up */ - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - err = -EOPNOTSUPP; - if (!IS_POSIXACL(d_inode(workdir))) - goto out_acl_release; - if (!realinode->i_op->set_acl) - goto out_acl_release; - if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) { - err = acl ? -EACCES : 0; - goto out_acl_release; - } - err = -EPERM; - if (!inode_owner_or_capable(&init_user_ns, inode)) - goto out_acl_release; - - posix_acl_release(acl); - - /* - * Check if sgid bit needs to be cleared (actual setacl operation will - * be done with mounter's capabilities and so that won't do it for us). 
- */ - if (unlikely(inode->i_mode & S_ISGID) && - handler->flags == ACL_TYPE_ACCESS && - !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { - struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - - err = ovl_setattr(&init_user_ns, dentry, &iattr); - if (err) - return err; - } - - err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); - return err; - -out_acl_release: - posix_acl_release(acl); - return err; -} - static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) @@ -1096,22 +1036,6 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler, return ovl_xattr_set(dentry, inode, name, value, size, flags); } -static const struct xattr_handler __maybe_unused -ovl_posix_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = ovl_posix_acl_xattr_get, - .set = ovl_posix_acl_xattr_set, -}; - -static const struct xattr_handler __maybe_unused -ovl_posix_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = ovl_posix_acl_xattr_get, - .set = ovl_posix_acl_xattr_set, -}; - static const struct xattr_handler ovl_own_trusted_xattr_handler = { .prefix = OVL_XATTR_TRUSTED_PREFIX, .get = ovl_own_xattr_get, @@ -1132,8 +1056,8 @@ static const struct xattr_handler ovl_other_xattr_handler = { static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL - &ovl_posix_acl_access_xattr_handler, - &ovl_posix_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, @@ -1142,8 +1066,8 @@ static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { static const struct xattr_handler *ovl_user_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL - &ovl_posix_acl_access_xattr_handler, - &ovl_posix_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, @@ -1353,10 +1277,11 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs) } static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, - struct path *workpath) + const struct path *workpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); - struct dentry *temp, *workdir; + struct dentry *workdir; + struct file *tmpfile; bool rename_whiteout; bool d_type; int fh_type; @@ -1392,10 +1317,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); - ofs->tmpfile = !IS_ERR(temp); + tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(tmpfile); if (ofs->tmpfile) - dput(temp); + fput(tmpfile); else pr_warn("upper fs does not support tmpfile.\n"); @@ -1482,7 +1407,7 @@ out: } static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, - struct path *upperpath) + const struct path *upperpath) { int err; struct path workpath = { }; @@ -1525,7 +1450,7 @@ out: } static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, - struct ovl_entry *oe, struct path *upperpath) + struct ovl_entry *oe, const struct path *upperpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *indexdir; diff --git a/fs/overlayfs/util.c 
b/fs/overlayfs/util.c index 87f811c089e4..bde291623c8c 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -463,7 +463,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) * which have been copied up and have origins), so only need to note * changes to impure entries. */ - if (!ovl_dir_is_real(dentry) || impurity) + if (!ovl_dir_is_real(inode) || impurity) OVL_I(inode)->version++; } @@ -475,10 +475,8 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity) ovl_dir_version_inc(dentry, impurity); } -u64 ovl_dentry_version_get(struct dentry *dentry) +u64 ovl_inode_version_get(struct inode *inode) { - struct inode *inode = d_inode(dentry); - WARN_ON(!inode_is_locked(inode)); return OVL_I(inode)->version; } @@ -490,7 +488,7 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } -struct file *ovl_path_open(struct path *path, int flags) +struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); @@ -578,7 +576,7 @@ void ovl_copy_up_end(struct dentry *dentry) ovl_inode_unlock(d_inode(dentry)); } -bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path) +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path) { int res; @@ -591,7 +589,7 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path) return false; } -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path, +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox) { int res; @@ -971,7 +969,7 @@ err: } /* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path) +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) { int res; @@ -1015,7 +1013,7 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding) +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding) { int res; char *s, *next, *buf = NULL; @@ -1104,13 +1102,18 @@ void ovl_copyattr(struct inode *inode) struct path realpath; struct inode *realinode; struct user_namespace *real_mnt_userns; + vfsuid_t vfsuid; + vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); real_mnt_userns = mnt_user_ns(realpath.mnt); - inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); - inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode); + vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + + inode->i_uid = vfsuid_into_kuid(vfsuid); + inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; inode->i_atime = realinode->i_atime; inode->i_mtime = realinode->i_mtime; diff --git a/fs/pipe.c b/fs/pipe.c index 74ae9fafd25a..42c7ff41c2db 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -860,7 +860,7 @@ static struct vfsmount *pipe_mnt __read_mostly; */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + return dynamic_dname(buffer, buflen, "pipe:[%lu]", d_inode(dentry)->i_ino); } diff --git a/fs/pnode.c b/fs/pnode.c index 1106137c747a..468e4e65a615 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -244,7 +244,7 @@ static int propagate_one(struct mount *m) } do { struct mount *parent = 
last_source->mnt_parent;
-			if (last_source == first_source)
+			if (peers(last_source, first_source))
 				break;
 			done = parent->mnt_master == p;
 			if (done && peers(n, parent))
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5af33800743e..d7bc81fc0840 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -24,6 +24,12 @@
 #include <linux/user_namespace.h>
 #include <linux/namei.h>
 #include <linux/mnt_idmapping.h>
+#include <linux/iversion.h>
+#include <linux/security.h>
+#include <linux/evm.h>
+#include <linux/fsnotify.h>
+
+#include "internal.h"
 
 static struct posix_acl **acl_by_type(struct inode *inode, int type)
 {
@@ -63,7 +69,7 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
 	if (acl == ACL_DONT_CACHE) {
 		struct posix_acl *ret;
 
-		ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU);
+		ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU);
 		if (!IS_ERR(ret))
 			acl = ret;
 	}
@@ -105,15 +111,17 @@ void forget_all_cached_acls(struct inode *inode)
 }
 EXPORT_SYMBOL(forget_all_cached_acls);
 
-struct posix_acl *get_acl(struct inode *inode, int type)
+static struct posix_acl *__get_acl(struct user_namespace *mnt_userns,
+				   struct dentry *dentry, struct inode *inode,
+				   int type)
 {
-	void *sentinel;
+	struct posix_acl *sentinel;
 	struct posix_acl **p;
 	struct posix_acl *acl;
 
 	/*
 	 * The sentinel is used to detect when another operation like
-	 * set_cached_acl() or forget_cached_acl() races with get_acl().
+	 * set_cached_acl() or forget_cached_acl() races with get_inode_acl().
 	 * It is guaranteed that is_uncached_acl(sentinel) is true.
 	 */
 
@@ -132,25 +140,27 @@ struct posix_acl *get_acl(struct inode *inode, int type)
 	 * current value of the ACL will not be ACL_NOT_CACHED and so our own
 	 * sentinel will not be set; another task will update the cache. We
 	 * could wait for that other task to complete its job, but it's easier
-	 * to just call ->get_acl to fetch the ACL ourself. (This is going to
-	 * be an unlikely race.)
+	 * to just call ->get_inode_acl to fetch the ACL ourself. (This is
+	 * going to be an unlikely race.)
 	 */
 	cmpxchg(p, ACL_NOT_CACHED, sentinel);
 
 	/*
-	 * Normally, the ACL returned by ->get_acl will be cached.
+	 * Normally, the ACL returned by ->get{_inode}_acl will be cached.
 	 * A filesystem can prevent that by calling
-	 * forget_cached_acl(inode, type) in ->get_acl.
+	 * forget_cached_acl(inode, type) in ->get{_inode}_acl.
 	 *
-	 * If the filesystem doesn't have a get_acl() function at all, we'll
-	 * just create the negative cache entry.
+	 * If the filesystem doesn't have a get{_inode}_acl() function at all,
+	 * we'll just create the negative cache entry.
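+	 *
+	 * To summarize, the order is: the dentry-based ->get_acl() (which
+	 * can take the mount's idmapping into account), then the inode-based
+	 * ->get_inode_acl(), then the cached NULL. A caller that only has an
+	 * inode goes through the inode-based path, e.g. (a sketch, mirroring
+	 * posix_acl_chmod() below):
+	 *
+	 *	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
+	 *	if (IS_ERR(acl))
+	 *		return PTR_ERR(acl);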
 	 */
-	if (!inode->i_op->get_acl) {
+	if (dentry && inode->i_op->get_acl) {
+		acl = inode->i_op->get_acl(mnt_userns, dentry, type);
+	} else if (inode->i_op->get_inode_acl) {
+		acl = inode->i_op->get_inode_acl(inode, type, false);
+	} else {
 		set_cached_acl(inode, type, NULL);
 		return NULL;
 	}
-	acl = inode->i_op->get_acl(inode, type, false);
-
 	if (IS_ERR(acl)) {
 		/*
 		 * Remove our sentinel so that we don't block future attempts
@@ -168,7 +178,12 @@ struct posix_acl *get_acl(struct inode *inode, int type)
 	posix_acl_release(acl);
 	return acl;
 }
-EXPORT_SYMBOL(get_acl);
+
+struct posix_acl *get_inode_acl(struct inode *inode, int type)
+{
+	return __get_acl(&init_user_ns, NULL, inode, type);
+}
+EXPORT_SYMBOL(get_inode_acl);
 
 /*
  * Init a fresh posix_acl
 */
@@ -577,19 +592,20 @@ EXPORT_SYMBOL(__posix_acl_chmod);
 /*
  * posix_acl_chmod - chmod a posix acl
 *
 * @mnt_userns:	user namespace of the mount
- * @inode:	inode to check permissions on
+ * @dentry:	dentry to check permissions on
 * @mode:	the new mode of @inode
 *
- * If the inode has been found through an idmapped mount the user namespace of
+ * If the dentry has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then
 * take care to map the inode according to @mnt_userns before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass init_user_ns.
 */
 int
- posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode,
+ posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry,
 		umode_t mode)
 {
+	struct inode *inode = d_inode(dentry);
 	struct posix_acl *acl;
 	int ret = 0;
@@ -598,7 +614,7 @@ int
 	if (!inode->i_op->set_acl)
 		return -EOPNOTSUPP;
 
-	acl = get_acl(inode, ACL_TYPE_ACCESS);
+	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR_OR_NULL(acl)) {
 		if (acl == ERR_PTR(-EOPNOTSUPP))
 			return 0;
@@ -608,7 +624,7 @@ int
 	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
 	if (ret)
 		return ret;
-	ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS);
+	ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS);
 	posix_acl_release(acl);
 	return ret;
 }
@@ -628,7 +644,7 @@ posix_acl_create(struct inode *dir, umode_t *mode,
 	if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
 		return 0;
 
-	p = get_acl(dir, ACL_TYPE_DEFAULT);
+	p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
 	if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
 		*mode &= ~current_umask();
 		return 0;
@@ -710,9 +726,9 @@ EXPORT_SYMBOL(posix_acl_update_mode);
 
 /*
  * Fix up the uids and gids in posix acl extended attributes in place.
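+ *
+ * After this change the common helper returns -EOPNOTSUPP for an
+ * unsupported version header, 0 for an ACL without entries and the
+ * entry count otherwise; posix_acl_from_xattr() below consumes it
+ * roughly as:
+ *
+ *	count = posix_acl_fix_xattr_common(value, size);
+ *	if (count < 0)
+ *		return ERR_PTR(count);
+ *	if (count == 0)
+ *		return NULL;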
 */
-static int posix_acl_fix_xattr_common(void *value, size_t size)
+static int posix_acl_fix_xattr_common(const void *value, size_t size)
 {
-	struct posix_acl_xattr_header *header = value;
+	const struct posix_acl_xattr_header *header = value;
 	int count;
 
 	if (!header)
@@ -720,149 +736,43 @@ static int posix_acl_fix_xattr_common(void *value, size_t size)
 		return -EINVAL;
 	if (size < sizeof(struct posix_acl_xattr_header))
 		return -EINVAL;
 	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
-		return -EINVAL;
+		return -EOPNOTSUPP;
 
 	count = posix_acl_xattr_count(size);
 	if (count < 0)
 		return -EINVAL;
 	if (count == 0)
-		return -EINVAL;
+		return 0;
 
 	return count;
 }
 
-void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
-				     const struct inode *inode,
-				     void *value, size_t size)
-{
-	struct posix_acl_xattr_header *header = value;
-	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
-	struct user_namespace *fs_userns = i_user_ns(inode);
-	int count;
-	vfsuid_t vfsuid;
-	vfsgid_t vfsgid;
-	kuid_t uid;
-	kgid_t gid;
-
-	if (no_idmapping(mnt_userns, i_user_ns(inode)))
-		return;
-
-	count = posix_acl_fix_xattr_common(value, size);
-	if (count < 0)
-		return;
-
-	for (end = entry + count; entry != end; entry++) {
-		switch (le16_to_cpu(entry->e_tag)) {
-		case ACL_USER:
-			uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
-			vfsuid = make_vfsuid(mnt_userns, fs_userns, uid);
-			entry->e_id = cpu_to_le32(from_kuid(&init_user_ns,
-						vfsuid_into_kuid(vfsuid)));
-			break;
-		case ACL_GROUP:
-			gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
-			vfsgid = make_vfsgid(mnt_userns, fs_userns, gid);
-			entry->e_id = cpu_to_le32(from_kgid(&init_user_ns,
-						vfsgid_into_kgid(vfsgid)));
-			break;
-		default:
-			break;
-		}
-	}
-}
-
-void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
-				     const struct inode *inode,
-				     void *value, size_t size)
-{
-	struct posix_acl_xattr_header *header = value;
-	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
-	struct user_namespace *fs_userns = i_user_ns(inode);
-	int count;
-	vfsuid_t vfsuid;
-	vfsgid_t vfsgid;
-	kuid_t uid;
-	kgid_t gid;
-
-	if (no_idmapping(mnt_userns, i_user_ns(inode)))
-		return;
-
-	count = posix_acl_fix_xattr_common(value, size);
-	if (count < 0)
-		return;
-
-	for (end = entry + count; entry != end; entry++) {
-		switch (le16_to_cpu(entry->e_tag)) {
-		case ACL_USER:
-			uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
-			vfsuid = VFSUIDT_INIT(uid);
-			uid = from_vfsuid(mnt_userns, fs_userns, vfsuid);
-			entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid));
-			break;
-		case ACL_GROUP:
-			gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
-			vfsgid = VFSGIDT_INIT(gid);
-			gid = from_vfsgid(mnt_userns, fs_userns, vfsgid);
-			entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid));
-			break;
-		default:
-			break;
-		}
-	}
-}
-
-static void posix_acl_fix_xattr_userns(
-	struct user_namespace *to, struct user_namespace *from,
-	void *value, size_t size)
-{
-	struct posix_acl_xattr_header *header = value;
-	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
-	int count;
-	kuid_t uid;
-	kgid_t gid;
-
-	count = posix_acl_fix_xattr_common(value, size);
-	if (count < 0)
-		return;
-
-	for (end = entry + count; entry != end; entry++) {
-		switch(le16_to_cpu(entry->e_tag)) {
-		case ACL_USER:
-			uid = make_kuid(from, le32_to_cpu(entry->e_id));
-			entry->e_id = cpu_to_le32(from_kuid(to, uid));
-			break;
-		case ACL_GROUP:
-			gid = make_kgid(from, le32_to_cpu(entry->e_id));
-			entry->e_id = cpu_to_le32(from_kgid(to, gid));
-			break;
-		default:
-			break;
-		}
-	}
-}
-
-void posix_acl_fix_xattr_from_user(void *value, size_t size)
-{
-	struct user_namespace *user_ns = current_user_ns();
-	if (user_ns == &init_user_ns)
-		return;
-	posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
-}
-
-void posix_acl_fix_xattr_to_user(void *value, size_t size)
-{
-	struct user_namespace *user_ns = current_user_ns();
-	if (user_ns == &init_user_ns)
-		return;
-	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
-}
-
-/*
- * Convert from extended attribute to in-memory representation.
+/**
+ * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
+ * @userns: the filesystem's idmapping
+ * @value: the uapi representation of POSIX ACLs
+ * @size: the size of @value
+ *
+ * Filesystems that store POSIX ACLs in the unaltered uapi format should use
+ * posix_acl_from_xattr() when reading them from the backing store and
+ * converting them into the struct posix_acl VFS format. The helper is
+ * specifically intended to be called from the acl inode operation.
+ *
+ * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
+ * in ACL_{GROUP,USER} entries into the idmapping in @userns.
+ *
+ * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
+ * If it did, calling it from the get acl inode operation would return POSIX
+ * ACLs mapped according to an idmapped mount which would mean that the value
+ * couldn't be cached for the filesystem. Idmapped mounts are taken into
+ * account on the fly during permission checking or right at the VFS -
+ * userspace boundary before reporting them to the user.
+ *
+ * Return: Allocated struct posix_acl on success, NULL for a valid header but
+ * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
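+ *
+ * For instance, a filesystem reading an ACL xattr from its backing
+ * store would convert it roughly like so (fs_userns being the
+ * filesystem's idmapping, error handling elided):
+ *
+ *	acl = posix_acl_from_xattr(fs_userns, value, size);
+ *	if (IS_ERR(acl))
+ *		return acl;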
 */
-struct posix_acl *
-posix_acl_from_xattr(struct user_namespace *user_ns,
-		     const void *value, size_t size)
+struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
+				       const void *value, size_t size)
 {
 	const struct posix_acl_xattr_header *header = value;
 	const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end;
@@ -870,16 +780,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns,
 	struct posix_acl *acl;
 	struct posix_acl_entry *acl_e;
 
-	if (!value)
-		return NULL;
-	if (size < sizeof(struct posix_acl_xattr_header))
-		return ERR_PTR(-EINVAL);
-	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
-		return ERR_PTR(-EOPNOTSUPP);
-
-	count = posix_acl_xattr_count(size);
+	count = posix_acl_fix_xattr_common(value, size);
 	if (count < 0)
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(count);
 	if (count == 0)
 		return NULL;
 
@@ -900,16 +803,14 @@ posix_acl_from_xattr(struct user_namespace *user_ns,
 			break;
 
 		case ACL_USER:
-			acl_e->e_uid =
-				make_kuid(user_ns,
-					  le32_to_cpu(entry->e_id));
+			acl_e->e_uid = make_kuid(userns,
+						 le32_to_cpu(entry->e_id));
 			if (!uid_valid(acl_e->e_uid))
 				goto fail;
 			break;
 		case ACL_GROUP:
-			acl_e->e_gid =
-				make_kgid(user_ns,
-					  le32_to_cpu(entry->e_id));
+			acl_e->e_gid = make_kgid(userns,
+						 le32_to_cpu(entry->e_id));
 			if (!gid_valid(acl_e->e_gid))
 				goto fail;
 			break;
@@ -968,35 +869,76 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
 }
 EXPORT_SYMBOL (posix_acl_to_xattr);
 
-static int
-posix_acl_xattr_get(const struct xattr_handler *handler,
-		    struct dentry *unused, struct inode *inode,
-		    const char *name, void *value, size_t size)
-{
-	struct posix_acl *acl;
-	int error;
+/**
+ * vfs_posix_acl_to_xattr - convert from kernel to userspace representation
+ * @idmap: idmap of the mount
+ * @inode: inode the posix acls are set on
+ * @acl: the posix acls as represented by the vfs
+ * @buffer: the buffer into which to convert @acl
+ * @size: size of @buffer
+ *
+ * This converts @acl from the VFS representation in the filesystem idmapping
+ * to the uapi form reportable to userspace. Mount and caller idmappings
+ * are handled appropriately.
+ *
+ * Return: On success, the size of the stored uapi posix acls, on error a
+ * negative errno.
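+ *
+ * do_get_acl() below is the main consumer: it converts the ACLs
+ * returned by vfs_get_acl() before they cross the VFS - userspace
+ * boundary, roughly:
+ *
+ *	error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry),
+ *				       acl, kvalue, size);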
+ */
+static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap,
+				      struct inode *inode,
+				      const struct posix_acl *acl, void *buffer,
+				      size_t size)
-	if (!IS_POSIXACL(inode))
-		return -EOPNOTSUPP;
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
+{
+	struct posix_acl_xattr_header *ext_acl = buffer;
+	struct posix_acl_xattr_entry *ext_entry;
+	struct user_namespace *fs_userns, *caller_userns;
+	struct user_namespace *mnt_userns;
+	ssize_t real_size, n;
+	vfsuid_t vfsuid;
+	vfsgid_t vfsgid;
 
-	acl = get_acl(inode, handler->flags);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl == NULL)
-		return -ENODATA;
+	real_size = posix_acl_xattr_size(acl->a_count);
+	if (!buffer)
+		return real_size;
+	if (real_size > size)
+		return -ERANGE;
 
-	error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
-	posix_acl_release(acl);
+	ext_entry = (void *)(ext_acl + 1);
+	ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
 
-	return error;
+	fs_userns = i_user_ns(inode);
+	caller_userns = current_user_ns();
+	mnt_userns = mnt_idmap_owner(idmap);
+	for (n=0; n < acl->a_count; n++, ext_entry++) {
+		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+		ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
+		ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch(acl_e->e_tag) {
+		case ACL_USER:
+			vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid);
+			ext_entry->e_id = cpu_to_le32(from_kuid(
+				caller_userns, vfsuid_into_kuid(vfsuid)));
+			break;
+		case ACL_GROUP:
+			vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid);
+			ext_entry->e_id = cpu_to_le32(from_kgid(
+				caller_userns, vfsgid_into_kgid(vfsgid)));
+			break;
+		default:
+			ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+			break;
+		}
+	}
+	return real_size;
 }
 
 int
-set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
+set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
 	      int type, struct posix_acl *acl)
 {
+	struct inode *inode = d_inode(dentry);
+
 	if (!IS_POSIXACL(inode))
 		return -EOPNOTSUPP;
 	if (!inode->i_op->set_acl)
@@ -1012,30 +954,10 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
 		if (ret)
 			return ret;
 	}
-	return inode->i_op->set_acl(mnt_userns, inode, acl, type);
+	return inode->i_op->set_acl(mnt_userns, dentry, acl, type);
 }
 EXPORT_SYMBOL(set_posix_acl);
 
-static int
-posix_acl_xattr_set(const struct xattr_handler *handler,
-		    struct user_namespace *mnt_userns,
-		    struct dentry *unused, struct inode *inode,
-		    const char *name, const void *value, size_t size,
-		    int flags)
-{
-	struct posix_acl *acl = NULL;
-	int ret;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-	}
-	ret = set_posix_acl(mnt_userns, inode, handler->flags, acl);
-	posix_acl_release(acl);
-	return ret;
-}
-
 static bool
 posix_acl_xattr_list(struct dentry *dentry)
 {
@@ -1046,8 +968,6 @@ const struct xattr_handler posix_acl_access_xattr_handler = {
 	.name = XATTR_NAME_POSIX_ACL_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.list = posix_acl_xattr_list,
-	.get = posix_acl_xattr_get,
-	.set = posix_acl_xattr_set,
 };
 EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
 
@@ -1055,15 +975,14 @@ const struct xattr_handler posix_acl_default_xattr_handler = {
 	.name = XATTR_NAME_POSIX_ACL_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.list = posix_acl_xattr_list,
-	.get = posix_acl_xattr_get,
-	.set = posix_acl_xattr_set,
 };
 EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
 
-int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
 		   struct posix_acl *acl, int type)
 {
 	int error;
+	struct inode *inode = d_inode(dentry);
 
 	if (type == ACL_TYPE_ACCESS) {
 		error = posix_acl_update_mode(mnt_userns, inode,
@@ -1073,6 +992,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 	}
 
 	inode->i_ctime = current_time(inode);
+	if (IS_I_VERSION(inode))
+		inode_inc_iversion(inode);
 	set_cached_acl(inode, type, acl);
 	return 0;
 }
@@ -1095,3 +1016,252 @@ int simple_acl_create(struct inode *dir, struct inode *inode)
 	posix_acl_release(acl);
 	return 0;
 }
+
+static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns,
+				    struct user_namespace *fs_userns,
+				    struct posix_acl *acl)
+{
+	for (int n = 0; n < acl->a_count; n++) {
+		struct posix_acl_entry *acl_e = &acl->a_entries[n];
+
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns,
+						   VFSUIDT_INIT(acl_e->e_uid));
+			break;
+		case ACL_GROUP:
+			acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns,
+						   VFSGIDT_INIT(acl_e->e_gid));
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * vfs_set_acl - set posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to set the posix acls
+ * @acl_name: the name of the posix acl
+ * @kacl: the posix acls in the appropriate VFS format
+ *
+ * This function sets @kacl. The caller must call posix_acl_release() on @kacl
+ * afterwards.
+ *
+ * Return: On success 0, on error negative errno.
+ */
+int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+		const char *acl_name, struct posix_acl *kacl)
+{
+	int acl_type;
+	int error;
+	struct inode *inode = d_inode(dentry);
+	struct inode *delegated_inode = NULL;
+
+	acl_type = posix_acl_type(acl_name);
+	if (acl_type < 0)
+		return -EINVAL;
+
+	if (kacl) {
+		/*
+		 * If we're on an idmapped mount translate from mount specific
+		 * vfs{g,u}id_t into global filesystem k{g,u}id_t.
+		 * Afterwards we can cache the POSIX ACLs filesystem wide and -
+		 * if this is a filesystem with a backing store - ultimately
+		 * translate them to backing store values.
+		 */
+		error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl);
+		if (error)
+			return error;
+	}
+
+retry_deleg:
+	inode_lock(inode);
+
+	/*
+	 * We only care about restrictions the inode struct itself places upon
+	 * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
+	 */
+	error = may_write_xattr(mnt_userns, inode);
+	if (error)
+		goto out_inode_unlock;
+
+	error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl);
+	if (error)
+		goto out_inode_unlock;
+
+	error = try_break_deleg(inode, &delegated_inode);
+	if (error)
+		goto out_inode_unlock;
+
+	if (inode->i_opflags & IOP_XATTR)
+		error = set_posix_acl(mnt_userns, dentry, acl_type, kacl);
+	else if (unlikely(is_bad_inode(inode)))
+		error = -EIO;
+	else
+		error = -EOPNOTSUPP;
+	if (!error) {
+		fsnotify_xattr(dentry);
+		evm_inode_post_set_acl(dentry, acl_name, kacl);
+	}
+
+out_inode_unlock:
+	inode_unlock(inode);
+
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_set_acl);
+
+/**
+ * vfs_get_acl - get posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to retrieve the posix acls
+ * @acl_name: the name of the posix acl
+ *
+ * This function retrieves the posix acls from the filesystem. The caller must
+ * call posix_acl_release() on the returned acls.
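+ *
+ * As a usage sketch, overlayfs probes a lower file's ACLs with the
+ * mounter's credentials and releases them right away:
+ *
+ *	acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, acl_name);
+ *	if (IS_ERR(acl))
+ *		return PTR_ERR(acl);
+ *	posix_acl_release(acl);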
+ *
+ * Return: On success POSIX ACLs in VFS format, on error negative errno.
+ */
+struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns,
+			      struct dentry *dentry, const char *acl_name)
+{
+	struct inode *inode = d_inode(dentry);
+	struct posix_acl *acl;
+	int acl_type, error;
+
+	acl_type = posix_acl_type(acl_name);
+	if (acl_type < 0)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * The VFS has no restrictions on reading POSIX ACLs so calling
+	 * something like xattr_permission() isn't needed. Only LSMs get a say.
+	 */
+	error = security_inode_get_acl(mnt_userns, dentry, acl_name);
+	if (error)
+		return ERR_PTR(error);
+
+	if (!IS_POSIXACL(inode))
+		return ERR_PTR(-EOPNOTSUPP);
+	if (S_ISLNK(inode->i_mode))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	acl = __get_acl(mnt_userns, dentry, inode, acl_type);
+	if (IS_ERR(acl))
+		return acl;
+	if (!acl)
+		return ERR_PTR(-ENODATA);
+
+	return acl;
+}
+EXPORT_SYMBOL_GPL(vfs_get_acl);
+
+/**
+ * vfs_remove_acl - remove posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to remove the posix acls
+ * @acl_name: the name of the posix acl
+ *
+ * This function removes posix acls.
+ *
+ * Return: On success 0, on error negative errno.
+ */
+int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   const char *acl_name)
+{
+	int acl_type;
+	int error;
+	struct inode *inode = d_inode(dentry);
+	struct inode *delegated_inode = NULL;
+
+	acl_type = posix_acl_type(acl_name);
+	if (acl_type < 0)
+		return -EINVAL;
+
+retry_deleg:
+	inode_lock(inode);
+
+	/*
+	 * We only care about restrictions the inode struct itself places upon
+	 * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
+	 */
+	error = may_write_xattr(mnt_userns, inode);
+	if (error)
+		goto out_inode_unlock;
+
+	error = security_inode_remove_acl(mnt_userns, dentry, acl_name);
+	if (error)
+		goto out_inode_unlock;
+
+	error = try_break_deleg(inode, &delegated_inode);
+	if (error)
+		goto out_inode_unlock;
+
+	if (inode->i_opflags & IOP_XATTR)
+		error = set_posix_acl(mnt_userns, dentry, acl_type, NULL);
+	else if (unlikely(is_bad_inode(inode)))
+		error = -EIO;
+	else
+		error = -EOPNOTSUPP;
+	if (!error) {
+		fsnotify_xattr(dentry);
+		evm_inode_post_remove_acl(mnt_userns, dentry, acl_name);
+	}
+
+out_inode_unlock:
+	inode_unlock(inode);
+
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_remove_acl);
+
+int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+	       const char *acl_name, const void *kvalue, size_t size)
+{
+	int error;
+	struct posix_acl *acl = NULL;
+
+	if (size) {
+		/*
+		 * Note that posix_acl_from_xattr() uses GFP_NOFS when it
+		 * probably doesn't need to here.
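+		 * The conversion happens once, here at the syscall boundary;
+		 * everything below this point (vfs_set_acl() and the
+		 * filesystem's ->set_acl()) only ever sees the resulting
+		 * struct posix_acl, never the raw xattr blob.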
+ */ + acl = posix_acl_from_xattr(current_user_ns(), kvalue, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl); + posix_acl_release(acl); + return error; +} + +ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, + const char *acl_name, void *kvalue, size_t size) +{ + ssize_t error; + struct posix_acl *acl; + + acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry), + acl, kvalue, size); + posix_acl_release(acl); + return error; +} diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index c930001056f9..32b1116ae137 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR config PROC_CHILDREN bool "Include /proc/<pid>/task/<tid>/children file" + depends on PROC_FS default n help Provides a fast way to retrieve first level children pids of a task. See diff --git a/fs/proc/array.c b/fs/proc/array.c index 99fcbfda8e25..49283b8103c7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -279,7 +279,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ - qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); + qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f7e3d971e4..9e479d7d202b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1761,7 +1761,7 @@ out: return ERR_PTR(error); } -static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) +static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen) { char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; @@ -2350,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + struct vma_iterator vmi; genradix_init(&fa); @@ -2388,7 +2389,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) * routine might require mmap_lock taken in might_fault(). 
*/ - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + pos = 2; + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) @@ -2728,7 +2731,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, return -ESRCH; length = security_getprocattr(task, PROC_I(inode)->op.lsm, - (char*)file->f_path.dentry->d_name.name, + file->f_path.dentry->d_name.name, &p); put_task_struct(task); if (length > 0) @@ -3196,6 +3199,19 @@ static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace * return 0; } +static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + mmput(mm); + } + + return 0; +} #endif /* CONFIG_KSM */ #ifdef CONFIG_STACKLEAK_METRICS @@ -3331,6 +3347,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; @@ -3668,6 +3685,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index fa762c5fbcb2..91fe1597af7b 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" static int cmdline_proc_show(struct seq_file *m, void *v) { @@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v) static int __init proc_cmdline_init(void) { - proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde->size = saved_command_line_len + 1; return 0; } fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index dfe6ce3505ce..e0758fe7936d 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -33,7 +33,16 @@ static int show_console_dev(struct seq_file *m, void *v) if (con->device) { const struct tty_driver *driver; int index; + + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + */ + console_lock(); driver = con->device(con, &index); + console_unlock(); + if (driver) { dev = MKDEV(driver->major, driver->minor_start); dev += index; @@ -63,7 +72,12 @@ static void *c_start(struct seq_file *m, loff_t *pos) struct console *con; loff_t off = 0; - console_lock(); + /* + * Hold the console_list_lock to guarantee safe traversal of the + * console list. SRCU cannot be used because there is no + * place to store the SRCU cookie. 
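+	 * The lock is dropped again in c_stop() when the sequence ends,
+	 * so the for_each_console() walk below sees a stable list.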
+ */ + console_list_lock(); for_each_console(con) if (off++ == *pos) break; @@ -74,13 +88,14 @@ static void *c_start(struct seq_file *m, loff_t *pos) static void *c_next(struct seq_file *m, void *v, loff_t *pos) { struct console *con = v; + ++*pos; - return con->next; + return hlist_entry_safe(con->node.next, struct console, node); } static void c_stop(struct seq_file *m, void *v) { - console_unlock(); + console_list_unlock(); } static const struct seq_operations consoles_op = { diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 837971e74109..fe7bfcb7d049 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -4,6 +4,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/blkdev.h> +#include "internal.h" static int devinfo_show(struct seq_file *f, void *v) { @@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = { static int __init proc_devices_init(void) { - proc_create_seq("devices", 0, NULL, &devinfo_ops); + struct proc_dir_entry *pde; + + pde = proc_create_seq("devices", 0, NULL, &devinfo_ops); + pde_make_permanent(pde); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 913bef0d2a36..fc46d6fe080c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -7,6 +7,7 @@ #include <linux/namei.h> #include <linux/pid.h> #include <linux/ptrace.h> +#include <linux/bitmap.h> #include <linux/security.h> #include <linux/file.h> #include <linux/seq_file.h> @@ -279,6 +280,30 @@ out: return 0; } +static int proc_readfd_count(struct inode *inode, loff_t *count) +{ + struct task_struct *p = get_proc_task(inode); + struct fdtable *fdt; + + if (!p) + return -ENOENT; + + task_lock(p); + if (p->files) { + rcu_read_lock(); + + fdt = files_fdtable(p->files); + *count = bitmap_weight(fdt->open_fds, fdt->max_fds); + + rcu_read_unlock(); + } + task_unlock(p); + + put_task_struct(p); + + return 0; +} + static int proc_readfd(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); @@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } +static int proc_fd_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + int rv = 0; + + generic_fillattr(&init_user_ns, inode, stat); + + /* If it's a directory, put the number of open fds there */ + if (S_ISDIR(inode->i_mode)) { + rv = proc_readfd_count(inode, &stat->size); + if (rv < 0) + return rv; + } + + return rv; +} + const struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, + .getattr = proc_fd_getattr, .setattr = proc_setattr, }; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d..b701d0207edf 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +static inline void pde_make_permanent(struct proc_dir_entry *pde) +{ + pde->flags |= PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -285,7 +290,7 @@ struct proc_maps_private { struct task_struct *task; struct mm_struct *mm; #ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; + struct vma_iterator iter; #endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index dff921f7ca33..71157ee35c1a 100644 --- 
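With proc_fd_getattr() above, stat() on a /proc/<pid>/fd directory reports the count of open descriptors in st_size instead of a meaningless directory size. A quick userspace check:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	if (stat("/proc/self/fd", &st) != 0)
		return 1;
	printf("open fds: %lld\n", (long long)st.st_size);
	return 0;
}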
a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -18,7 +18,6 @@ #include <linux/capability.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <linux/notifier.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/printk.h> @@ -541,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) fallthrough; case KCORE_VMEMMAP: case KCORE_TEXT: - if (kern_addr_valid(start)) { - /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. - */ - if (copy_from_kernel_nofault(buf, (void *)start, - tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else { - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; - } + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; } } else { - if (clear_user(buffer, tsz)) { + if (copy_to_user(buffer, buf, tsz)) { ret = -EFAULT; goto out; } @@ -638,10 +629,6 @@ static int __meminit kcore_callback(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block kcore_callback_nb __meminitdata = { - .notifier_call = kcore_callback, - .priority = 0, -}; static struct kcore_list kcore_vmalloc; @@ -694,7 +681,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - register_hotmemory_notifier(&kcore_callback_nb); + hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 592e6dc7c110..2fc92a13f9f8 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -17,8 +17,6 @@ #include <asm/io.h> -extern wait_queue_head_t log_wait; - static int kmsg_open(struct inode * inode, struct file * file) { return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index f32878d9a39f..817981e57223 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/seqlock.h> #include <linux/time.h> +#include "internal.h" static int loadavg_proc_show(struct seq_file *m, void *v) { @@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v) static int __init proc_loadavg_init(void) { - proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6e89f0e2fd20..440960110a42 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -115,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); + show_val_kb(m, "SecPageTables: ", + global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", @@ -162,7 +164,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) static int __init proc_meminfo_init(void) { - proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index a2873a617ae8..6249c347809a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -91,6 +91,7 @@ 
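The read_kcore() hunk above drops the kern_addr_valid() check and leans entirely on the bounce buffer: copy_from_kernel_nofault() probes the address, and a fault surfaces to userspace as zeroes rather than an oops. The pattern in isolation (kernel context; read_ktext() is a hypothetical helper, and bounce must hold at least len bytes):

#include <linux/uaccess.h>

static int read_ktext(void __user *ubuf, const void *kaddr,
		      size_t len, char *bounce)
{
	/* A faulting kernel address reads back as zeroes, not an oops. */
	if (copy_from_kernel_nofault(bounce, kaddr, len))
		return clear_user(ubuf, len) ? -EFAULT : 0;
	return copy_to_user(ubuf, bounce, len) ? -EFAULT : 0;
}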
static ssize_t kpagecount_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecount_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; @@ -218,8 +219,9 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); + u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif return u; @@ -268,6 +270,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, } static const struct proc_ops kpageflags_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpageflags_read, }; @@ -322,6 +325,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecgroup_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecgroup_read, }; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 021e83fe831f..48f2d60bd78a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -28,13 +28,6 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; -/* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; -EXPORT_SYMBOL(sysctl_vals); - -const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; -EXPORT_SYMBOL_GPL(sysctl_long_vals); - /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { @@ -1246,7 +1239,7 @@ static bool get_links(struct ctl_dir *dir, static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; - struct ctl_dir *core_parent = NULL; + struct ctl_dir *core_parent; struct ctl_table_header *links; int err; diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 12901dcf57e2..f4616083faef 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -3,6 +3,7 @@ #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" /* * /proc/softirqs ... 
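The .proc_flags = PROC_ENTRY_PERMANENT markings above, and the pde_make_permanent() conversions of cmdline, devices, loadavg, meminfo, softirqs, uptime and version elsewhere in the series, all tag never-removed boot-time entries so lookups can skip use-count tracking. The shape of such a conversion, sketched for a hypothetical entry:

#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"	/* fs/proc-local header providing pde_make_permanent() */

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello\n");
	return 0;
}

static int __init proc_demo_init(void)
{
	struct proc_dir_entry *pde;

	pde = proc_create_single("demo", 0, NULL, demo_show);
	if (pde)
		pde_make_permanent(pde);
	return 0;
}
fs_initcall(proc_demo_init);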
display the number of softirqs @@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v) static int __init proc_softirqs_init(void) { - proc_create_single("softirqs", 0, NULL, show_softirqs); + struct proc_dir_entry *pde; + + pde = proc_create_single("softirqs", 0, NULL, show_softirqs); + pde_make_permanent(pde); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4e0023643f8b..e35a0398db63 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> -#include <linux/vmacache.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> @@ -124,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -2UL; + vma = get_gate_vma(priv->mm); + } + + return vma; +} + static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) @@ -153,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-EINTR); } + vma_iter_init(&priv->iter, mm, last_addr); hold_task_mempolicy(priv); - priv->tail_vma = get_gate_vma(mm); - - vma = find_vma(mm, last_addr); - if (vma) - return vma; + if (last_addr == -2UL) + return get_gate_vma(mm); - return priv->tail_vma; + return proc_get_vma(priv, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *next, *vma = v; - - if (vma == priv->tail_vma) - next = NULL; - else if (vma->vm_next) - next = vma->vm_next; - else - next = priv->tail_vma; - - *ppos = next ? next->vm_start : -1UL; - - return next; + if (*ppos == -2UL) { + *ppos = -1UL; + return NULL; + } + return proc_get_vma(m->private, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -271,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { + struct anon_vma_name *anon_name = NULL; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; vm_flags_t flags = vma->vm_flags; @@ -290,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); + if (mm) + anon_name = anon_vma_name(vma); /* * Print the dentry name for named mappings, and a @@ -297,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) */ if (file) { seq_pad(m, ' '); - seq_file_path(m, file, "\n"); + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. 
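The show_map_vma() change here prints user-supplied names for shared anonymous memory as [anon_shmem:<name>] in /proc/<pid>/maps. The name is attached from userspace with prctl(); a minimal demo (the call fails with EINVAL on kernels built without CONFIG_ANON_VMA_NAME):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* PR_SET_VMA, PR_SET_VMA_ANON_NAME */

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, (unsigned long)"demo") != 0)
		perror("prctl");
	getchar();	/* inspect /proc/self/maps while we wait */
	return 0;
}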
+ */ + if (anon_name) + seq_printf(m, "[anon_shmem:%s]", anon_name->name); + else + seq_file_path(m, file, "\n"); goto done; } @@ -309,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - struct anon_vma_name *anon_name; - if (!mm) { name = "[vdso]"; goto done; @@ -327,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); seq_printf(m, "[anon:%s]", anon_name->name); @@ -664,6 +674,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", @@ -864,7 +875,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true, false)); + hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -877,16 +888,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss; - struct mm_struct *mm; + struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; - unsigned long last_vma_end = 0; + unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; - mm = priv->mm; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; @@ -899,8 +910,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); + vma = mas_find(&mas, ULONG_MAX); + + if (unlikely(!vma)) + goto empty_set; - for (vma = priv->mm->mmap; vma;) { + vma_start = vma->vm_start; + do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; @@ -909,6 +925,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { + mas_pause(&mas); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -952,7 +969,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * contains last_vma_end. * Iterate VMA' from last_vma_end. 
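show_smaps_rollup() above now drives the VMA walk with a maple-tree state directly. The subtle part is mas_pause() before dropping mmap_lock: without it the iterator would resume from stale state after relocking. The locking idiom on its own (kernel context; walk_with_backoff() is a hypothetical name, mmap_read_lock() assumed held on entry):

#include <linux/mm.h>
#include <linux/maple_tree.h>

static int walk_with_backoff(struct mm_struct *mm)
{
	MA_STATE(mas, &mm->mm_mt, 0, 0);
	struct vm_area_struct *vma;

	mas_for_each(&mas, vma, ULONG_MAX) {
		if (mmap_lock_is_contended(mm)) {
			mas_pause(&mas);	/* invalidate state before unlock */
			mmap_read_unlock(mm);
			if (mmap_read_lock_killable(mm))
				return -EINTR;
		}
		/* ... gather per-VMA stats ... */
	}
	return 0;
}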
*/ - vma = find_vma(mm, last_vma_end - 1); + vma = mas_find(&mas, ULONG_MAX); /* Case 3 above */ if (!vma) break; @@ -966,11 +983,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) smap_gather_stats(vma, &mss, last_vma_end); } /* Case 2 above */ - vma = vma->vm_next; - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); - show_vma_header_prefix(m, priv->mm->mmap->vm_start, - last_vma_end, 0, 0, 0, 0); +empty_set: + show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); @@ -1263,6 +1279,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + MA_STATE(mas, &mm->mm_mt, 0, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1282,7 +1299,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; @@ -1294,8 +1311,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, - &cp); + walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); @@ -1418,9 +1434,19 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); - if (pm->show_pfn) + if (pm->show_pfn) { + pgoff_t offset; + /* + * For PFN swap offsets, keeping the offset field + * to be PFN only to be compatible with old smaps. 
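The pte_to_pagemap_entry() change above only alters what the swap offset field encodes for PFN-style entries; the 64-bit entry layout userspace reads is unchanged. A minimal reader of one entry from /proc/self/pagemap:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/self/pagemap", O_RDONLY);
	uint64_t ent = 0;
	uintptr_t addr = (uintptr_t)&psz;	/* any mapped address */

	if (fd < 0 || pread(fd, &ent, sizeof(ent),
			    (addr / psz) * sizeof(ent)) != sizeof(ent))
		return 1;
	printf("present=%d swapped=%d\n",
	       (int)((ent >> 63) & 1), (int)((ent >> 62) & 1));
	close(fd);
	return 0;
}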
+ */ + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry); + else + offset = swp_offset(entry); frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + (offset << MAX_SWAPFILES_SHIFT); + } flags |= PM_SWAP; migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) @@ -1477,7 +1503,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned long offset; if (pm->show_pfn) { - offset = swp_offset(entry) + + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry); + else + offset = swp_offset(entry); + offset = offset + ((addr & ~PMD_MASK) >> PAGE_SHIFT); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a6d21fc0033c..2fd06f52b6a4 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -20,15 +20,13 @@ */ void task_mem(struct seq_file *m, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0, size; - - mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + mmap_read_lock(mm); + for_each_vma(vmi, vma) { bytes += kobjsize(vma); region = vma->vm_region; @@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - struct rb_node *p; unsigned long vsize = 0; mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - } mmap_read_unlock(mm); return vsize; } @@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long size = kobjsize(mm); mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + for_each_vma(vmi, vma) { size += kobjsize(vma); region = vma->vm_region; if (region) { @@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) */ static int show_map(struct seq_file *m, void *_p) { - struct rb_node *p = _p; - - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); + return nommu_vma_show(m, _p); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; struct mm_struct *mm; - struct rb_node *p; - loff_t n = *pos; + struct vm_area_struct *vma; + unsigned long addr = *pos; + + /* See m_next(). Zero at the start or after lseek. */ + if (addr == -1UL) + return NULL; /* pin the task and mm whilst we play with them */ priv->task = get_proc_task(priv->inode); @@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EINTR); } - /* start from the Nth VMA */ - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) - if (n-- == 0) - return p; + /* start the next element from addr */ + vma = find_vma(mm, addr); + if (vma) + return vma; mmap_read_unlock(mm); mmput(mm); @@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml) static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct rb_node *p = _p; + struct vm_area_struct *vma = _p; - (*pos)++; - return p ? 
rb_next(p) : NULL; + *pos = vma->vm_end; + return find_vma(vma->vm_mm, vma->vm_end); } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index deb99bc9b7e6..b5343d209381 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/kernel_stat.h> +#include "internal.h" static int uptime_proc_show(struct seq_file *m, void *v) { @@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v) static int __init proc_uptime_init(void) { - proc_create_single("uptime", 0, NULL, uptime_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("uptime", 0, NULL, uptime_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index b449f186577f..02e3c3cd4a9a 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -5,6 +5,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include "internal.h" static int version_proc_show(struct seq_file *m, void *v) { @@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v) static int __init proc_version_init(void) { - proc_create_single("version", 0, NULL, version_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("version", 0, NULL, version_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f2aa86c421f2..09a81e4b1273 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -199,7 +199,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, false); } @@ -212,7 +212,7 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); @@ -437,7 +437,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) offset = (loff_t) index << PAGE_SHIFT; kvec.iov_base = page_address(page); kvec.iov_len = PAGE_SIZE; - iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE); rc = __read_vmcore(&iter, &offset); if (rc < 0) { @@ -1567,6 +1567,7 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { + elfcorehdr_free(elfcorehdr_addr); pr_warn("Kdump: vmcore not initialized\n"); return rc; } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b2fd3c20e7c2..cbc0b468c1ab 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,14 +28,11 @@ #include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> -#include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/jiffies.h> #include <linux/workqueue.h> -#include <crypto/acompress.h> - #include "internal.h" /* @@ -92,9 +89,13 @@ static char *compress = module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); +/* How much of the kernel log to snapshot */ +unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, ulong, 
0444); +MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); + /* Compression parameters */ -static struct crypto_acomp *tfm; -static struct acomp_req *creq; +static struct crypto_comp *tfm; struct pstore_zbackend { int (*zbufsize)(size_t size); @@ -104,9 +105,6 @@ struct pstore_zbackend { static char *big_oops_buf; static size_t big_oops_buf_sz; -/* How much of the console log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; - void pstore_set_kmsg_bytes(int bytes) { kmsg_bytes = bytes; @@ -272,21 +270,12 @@ static const struct pstore_zbackend zbackends[] = { static int pstore_compress(const void *in, void *out, unsigned int inlen, unsigned int outlen) { - struct scatterlist src, dst; int ret; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; - sg_init_table(&src, 1); - sg_set_buf(&src, in, inlen); - - sg_init_table(&dst, 1); - sg_set_buf(&dst, out, outlen); - - acomp_request_set_params(creq, &src, &dst, inlen, outlen); - - ret = crypto_acomp_compress(creq); + ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); if (ret) { pr_err("crypto_comp_compress failed, ret = %d!\n", ret); return ret; @@ -297,7 +286,7 @@ static int pstore_compress(const void *in, void *out, static void allocate_buf_for_compression(void) { - struct crypto_acomp *acomp; + struct crypto_comp *ctx; int size; char *buf; @@ -309,7 +298,7 @@ static void allocate_buf_for_compression(void) if (!psinfo || tfm) return; - if (!crypto_has_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC)) { + if (!crypto_has_comp(zbackend->name, 0, 0)) { pr_err("Unknown compression: %s\n", zbackend->name); return; } @@ -328,24 +317,16 @@ static void allocate_buf_for_compression(void) return; } - acomp = crypto_alloc_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(acomp)) { + ctx = crypto_alloc_comp(zbackend->name, 0, 0); + if (IS_ERR_OR_NULL(ctx)) { kfree(buf); pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, - PTR_ERR(acomp)); - return; - } - - creq = acomp_request_alloc(acomp); - if (!creq) { - crypto_free_acomp(acomp); - kfree(buf); - pr_err("acomp_request_alloc('%s') failed\n", zbackend->name); + PTR_ERR(ctx)); return; } /* A non-NULL big_oops_buf indicates compression is available. */ - tfm = acomp; + tfm = ctx; big_oops_buf_sz = size; big_oops_buf = buf; @@ -355,8 +336,7 @@ static void allocate_buf_for_compression(void) static void free_buf_for_compression(void) { if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { - acomp_request_free(creq); - crypto_free_acomp(tfm); + crypto_free_comp(tfm); tfm = NULL; } kfree(big_oops_buf); @@ -413,6 +393,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, const char *why; unsigned int part = 1; unsigned long flags = 0; + int saved_ret = 0; int ret; why = kmsg_dump_reason_str(reason); @@ -483,12 +464,21 @@ static void pstore_dump(struct kmsg_dumper *dumper, if (ret == 0 && reason == KMSG_DUMP_OOPS) { pstore_new_entry = 1; pstore_timer_kick(); + } else { + /* Preserve only the first non-zero returned value. 
*/ + if (!saved_ret) + saved_ret = ret; } total += record.size; part++; } spin_unlock_irqrestore(&psinfo->buf_lock, flags); + + if (saved_ret) { + pr_err_once("backend (%s) writing error (%d)\n", psinfo->name, + saved_ret); + } } static struct kmsg_dumper pstore_dumper = { @@ -584,8 +574,9 @@ out: int pstore_register(struct pstore_info *psi) { if (backend && strcmp(backend, psi->name)) { - pr_warn("ignoring unexpected backend '%s'\n", psi->name); - return -EPERM; + pr_warn("backend '%s' already in use: ignoring '%s'\n", + backend, psi->name); + return -EBUSY; } /* Sanity check flags. */ @@ -684,6 +675,8 @@ void pstore_unregister(struct pstore_info *psi) psinfo = NULL; kfree(backend); backend = NULL; + + pr_info("Unregistered %s as persistent store backend\n", psi->name); mutex_unlock(&psinfo_lock); } EXPORT_SYMBOL_GPL(pstore_unregister); @@ -693,8 +686,6 @@ static void decompress_record(struct pstore_record *record) int ret; int unzipped_len; char *unzipped, *workspace; - struct acomp_req *dreq; - struct scatterlist src, dst; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed) return; @@ -718,30 +709,16 @@ static void decompress_record(struct pstore_record *record) if (!workspace) return; - dreq = acomp_request_alloc(tfm); - if (!dreq) { - kfree(workspace); - return; - } - - sg_init_table(&src, 1); - sg_set_buf(&src, record->buf, record->size); - - sg_init_table(&dst, 1); - sg_set_buf(&dst, workspace, unzipped_len); - - acomp_request_set_params(dreq, &src, &dst, record->size, unzipped_len); - /* After decompression "unzipped_len" is almost certainly smaller. */ - ret = crypto_acomp_decompress(dreq); + ret = crypto_comp_decompress(tfm, record->buf, record->size, + workspace, &unzipped_len); if (ret) { - pr_err("crypto_acomp_decompress failed, ret = %d!\n", ret); + pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); kfree(workspace); return; } /* Append ECC notice to decompressed buffer. 
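The pstore hunks above step back from the scatterlist-based acomp API to the synchronous crypto_comp API, whose whole surface is alloc, (de)compress, free. A sketch of the compress side (kernel context; assumes the "deflate" algorithm is built in):

#include <linux/crypto.h>

static int demo_compress(const u8 *in, unsigned int inlen,
			 u8 *out, unsigned int *outlen)
{
	struct crypto_comp *cc = crypto_alloc_comp("deflate", 0, 0);
	int ret;

	if (IS_ERR_OR_NULL(cc))
		return cc ? PTR_ERR(cc) : -ENOMEM;
	/* *outlen is in/out: buffer capacity on entry, bytes produced on exit */
	ret = crypto_comp_compress(cc, in, inlen, out, outlen);
	crypto_free_comp(cc);
	return ret;
}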
*/ - unzipped_len = dreq->dlen; memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); @@ -749,7 +726,6 @@ static void decompress_record(struct pstore_record *record) unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, GFP_KERNEL); kfree(workspace); - acomp_request_free(dreq); if (!unzipped) return; diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index d8542ec2f38c..b31c9c72d90b 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -46,7 +46,7 @@ static int pmsg_major; #undef pr_fmt #define pr_fmt(fmt) PMSG_NAME ": " fmt -static char *pmsg_devnode(struct device *dev, umode_t *mode) +static char *pmsg_devnode(const struct device *dev, umode_t *mode) { if (mode) *mode = 0220; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index fefe3d391d3a..9a5052431fd3 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -18,10 +18,11 @@ #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/compiler.h> -#include <linux/pstore_ram.h> #include <linux/of.h> #include <linux/of_address.h> + #include "internal.h" +#include "ram_internal.h" #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL @@ -451,20 +452,28 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; + /* Free pmsg PRZ */ + persistent_ram_free(&cxt->mprz); + + /* Free console PRZ */ + persistent_ram_free(&cxt->cprz); + /* Free dump PRZs */ if (cxt->dprzs) { for (i = 0; i < cxt->max_dump_cnt; i++) - persistent_ram_free(cxt->dprzs[i]); + persistent_ram_free(&cxt->dprzs[i]); kfree(cxt->dprzs); + cxt->dprzs = NULL; cxt->max_dump_cnt = 0; } /* Free ftrace PRZs */ if (cxt->fprzs) { for (i = 0; i < cxt->max_ftrace_cnt; i++) - persistent_ram_free(cxt->fprzs[i]); + persistent_ram_free(&cxt->fprzs[i]); kfree(cxt->fprzs); + cxt->fprzs = NULL; cxt->max_ftrace_cnt = 0; } } @@ -548,9 +557,10 @@ static int ramoops_init_przs(const char *name, while (i > 0) { i--; - persistent_ram_free(prz_ar[i]); + persistent_ram_free(&prz_ar[i]); } kfree(prz_ar); + prz_ar = NULL; goto fail; } *paddr += zone_sz; @@ -735,6 +745,7 @@ static int ramoops_probe(struct platform_device *pdev) /* Make sure we didn't get bogus platform data pointer. */ if (!pdata) { pr_err("NULL platform data\n"); + err = -EINVAL; goto fail_out; } @@ -742,6 +753,7 @@ static int ramoops_probe(struct platform_device *pdev) !pdata->ftrace_size && !pdata->pmsg_size)) { pr_err("The memory size and the record/console size must be " "non-zero\n"); + err = -EINVAL; goto fail_out; } @@ -772,12 +784,17 @@ static int ramoops_probe(struct platform_device *pdev) dump_mem_sz, cxt->record_size, &cxt->max_dump_cnt, 0, 0); if (err) - goto fail_out; + goto fail_init; err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr, cxt->console_size, 0); if (err) - goto fail_init_cprz; + goto fail_init; + + err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, + cxt->pmsg_size, 0); + if (err) + goto fail_init; cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? nr_cpu_ids @@ -788,12 +805,7 @@ static int ramoops_probe(struct platform_device *pdev) (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? 
PRZ_FLAG_NO_LOCK : 0); if (err) - goto fail_init_fprz; - - err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, - cxt->pmsg_size, 0); - if (err) - goto fail_init_mprz; + goto fail_init; cxt->pstore.data = cxt; /* @@ -857,11 +869,7 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - persistent_ram_free(cxt->mprz); -fail_init_mprz: -fail_init_fprz: - persistent_ram_free(cxt->cprz); -fail_init_cprz: +fail_init: ramoops_free_przs(cxt); fail_out: return err; @@ -876,8 +884,6 @@ static int ramoops_remove(struct platform_device *pdev) kfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; - persistent_ram_free(cxt->mprz); - persistent_ram_free(cxt->cprz); ramoops_free_przs(cxt); return 0; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index a89e33719fcf..966191d3a5ba 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -13,13 +13,14 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/memblock.h> -#include <linux/pstore_ram.h> #include <linux/rslib.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <asm/page.h> +#include "ram_internal.h" + /** * struct persistent_ram_buffer - persistent circular RAM buffer * @@ -439,7 +440,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size, phys_addr_t addr = page_start + i * PAGE_SIZE; pages[i] = pfn_to_page(addr >> PAGE_SHIFT); } - vaddr = vmap(pages, page_count, VM_MAP, prot); + /* + * VM_IOREMAP used here to bypass this region during vread() + * and kmap_atomic() (i.e. kcore) to avoid __va() failures. + */ + vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot); kfree(pages); /* @@ -543,8 +548,14 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, return 0; } -void persistent_ram_free(struct persistent_ram_zone *prz) +void persistent_ram_free(struct persistent_ram_zone **_prz) { + struct persistent_ram_zone *prz; + + if (!_prz) + return; + + prz = *_prz; if (!prz) return; @@ -568,6 +579,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) persistent_ram_free_old(prz); kfree(prz->label); kfree(prz); + *_prz = NULL; } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, @@ -604,6 +616,6 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, return prz; err: - persistent_ram_free(prz); + persistent_ram_free(&prz); return ERR_PTR(ret); } diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h new file mode 100644 index 000000000000..5f694698351f --- /dev/null +++ b/fs/pstore/ram_internal.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com> + * Copyright (C) 2011 Kees Cook <keescook@chromium.org> + * Copyright (C) 2011 Google, Inc. + */ + +#include <linux/pstore_ram.h> + +/* + * Choose whether access to the RAM zone requires locking or not. If a zone + * can be written to from different CPUs like with ftrace for example, then + * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required. + */ +#define PRZ_FLAG_NO_LOCK BIT(0) +/* + * If a PRZ should only have a single-boot lifetime, this marks it as + * getting wiped after its contents get copied out after boot. 
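persistent_ram_free() above now takes a pointer to the caller's pointer so it can clear it after freeing: a repeated call becomes a harmless no-op rather than a double free, which is what lets ramoops collapse its error paths into a single fail_init label. The idiom in miniature (plain C, hypothetical struct):

#include <stdlib.h>

struct obj { int payload; };

static void obj_free(struct obj **objp)
{
	if (!objp || !*objp)
		return;
	free(*objp);
	*objp = NULL;	/* the caller's pointer cannot dangle afterwards */
}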
+ */ +#define PRZ_FLAG_ZAP_OLD BIT(1) + +/** + * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ) + * used as a pstore backend + * + * @paddr: physical address of the mapped RAM area + * @size: size of mapping + * @label: unique name of this PRZ + * @type: frontend type for this PRZ + * @flags: holds PRZ_FLAGS_* bits + * + * @buffer_lock: + * locks access to @buffer "size" bytes and "start" offset + * @buffer: + * pointer to actual RAM area managed by this PRZ + * @buffer_size: + * bytes in @buffer->data (not including any trailing ECC bytes) + * + * @par_buffer: + * pointer into @buffer->data containing ECC bytes for @buffer->data + * @par_header: + * pointer into @buffer->data containing ECC bytes for @buffer header + * (i.e. all fields up to @data) + * @rs_decoder: + * RSLIB instance for doing ECC calculations + * @corrected_bytes: + * ECC corrected bytes accounting since boot + * @bad_blocks: + * ECC uncorrectable bytes accounting since boot + * @ecc_info: + * ECC configuration details + * + * @old_log: + * saved copy of @buffer->data prior to most recent wipe + * @old_log_size: + * bytes contained in @old_log + * + */ +struct persistent_ram_zone { + phys_addr_t paddr; + size_t size; + void *vaddr; + char *label; + enum pstore_type_id type; + u32 flags; + + raw_spinlock_t buffer_lock; + struct persistent_ram_buffer *buffer; + size_t buffer_size; + + char *par_buffer; + char *par_header; + struct rs_control *rs_decoder; + int corrected_bytes; + int bad_blocks; + struct persistent_ram_ecc_info ecc_info; + + char *old_log; + size_t old_log_size; +}; + +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, struct persistent_ram_ecc_info *ecc_info, + unsigned int memtype, u32 flags, char *label); +void persistent_ram_free(struct persistent_ram_zone **_prz); +void persistent_ram_zap(struct persistent_ram_zone *prz); + +int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); + +void persistent_ram_save_old(struct persistent_ram_zone *prz); +size_t persistent_ram_old_size(struct persistent_ram_zone *prz); +void *persistent_ram_old(struct persistent_ram_zone *prz); +void persistent_ram_free_old(struct persistent_ram_zone *prz); +ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, + char *str, size_t len); diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 017d0d4ad329..2770746bb7aa 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -761,7 +761,7 @@ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, /* avoid destroying old data, allocate a new one */ len = zone->buffer_size + sizeof(*zone->buffer); zone->oldbuf = zone->buffer; - zone->buffer = kzalloc(len, GFP_KERNEL); + zone->buffer = kzalloc(len, GFP_ATOMIC); if (!zone->buffer) { zone->buffer = zone->oldbuf; return -ENOMEM; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b9895afca9d1..85b2fa3b211c 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -470,10 +470,8 @@ out2: out1: iput(sbi->inodes); out: - if (bh1) - brelse(bh1); - if (bh2) - brelse(bh2); + brelse(bh1); + brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0427b44bfee5..f27faf5db554 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2324,6 +2324,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type) struct super_block *sb = inode->i_sb; struct 
quota_info *dqopt = sb_dqopt(sb); + if (is_bad_inode(inode)) + return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode)) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 5f2405994280..0f1493e0f6d0 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -71,6 +71,40 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) return ret; } +static inline int do_check_range(struct super_block *sb, const char *val_name, + uint val, uint min_val, uint max_val) +{ + if (val < min_val || val > max_val) { + quota_error(sb, "Getting %s %u out of range %u-%u", + val_name, val, min_val, max_val); + return -EUCLEAN; + } + + return 0; +} + +static int check_dquot_block_header(struct qtree_mem_dqinfo *info, + struct qt_disk_dqdbheader *dh) +{ + int err = 0; + + err = do_check_range(info->dqi_sb, "dqdh_next_free", + le32_to_cpu(dh->dqdh_next_free), 0, + info->dqi_blocks - 1); + if (err) + return err; + err = do_check_range(info->dqi_sb, "dqdh_prev_free", + le32_to_cpu(dh->dqdh_prev_free), 0, + info->dqi_blocks - 1); + if (err) + return err; + err = do_check_range(info->dqi_sb, "dqdh_entries", + le16_to_cpu(dh->dqdh_entries), 0, + qtree_dqstr_in_blk(info)); + + return err; +} + /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { @@ -85,6 +119,9 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info) ret = read_blk(info, blk, buf); if (ret < 0) goto out_buf; + ret = check_dquot_block_header(info, dh); + if (ret) + goto out_buf; info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); } else { @@ -232,6 +269,9 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, *err = read_blk(info, blk, buf); if (*err < 0) goto out_buf; + *err = check_dquot_block_header(info, dh); + if (*err) + goto out_buf; } else { blk = get_free_dqblk(info); if ((int)blk < 0) { @@ -313,6 +353,10 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } ref = (__le32 *)buf; newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + ret = do_check_range(dquot->dq_sb, "block", newblk, 0, + info->dqi_blocks - 1); + if (ret) + goto out_buf; if (!newblk) newson = 1; if (depth == info->dqi_qtree_depth - 1) { @@ -424,6 +468,9 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; + ret = check_dquot_block_header(info, dh); + if (ret) + goto out_buf; le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ ret = remove_free_dqentry(info, buf, blk); @@ -480,12 +527,10 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); - if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { - quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", - newblk, info->dqi_blocks); - ret = -EUCLEAN; + ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF, + info->dqi_blocks - 1); + if (ret) goto out_buf; - } if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); @@ -586,12 +631,10 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? 
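check_dquot_block_header() and the do_check_range() conversions above enforce one invariant: no on-disk block reference is followed without first bounding it against dqi_blocks, and corruption is reported as -EUCLEAN ("structure needs cleaning"). The core check, minus the quota_error() logging (standalone sketch):

#include <errno.h>

static int check_range(unsigned int val, unsigned int lo, unsigned int hi)
{
	return (val < lo || val > hi) ? -EUCLEAN : 0;
}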
*/ goto out_buf; - if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { - quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", - blk, info->dqi_blocks); - ret = -EUCLEAN; + ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF, + info->dqi_blocks - 1); + if (ret) goto out_buf; - } if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); @@ -705,15 +748,21 @@ static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, goto out_buf; } for (i = __get_index(info, *id, depth); i < epb; i++) { - if (ref[i] == cpu_to_le32(0)) { + uint blk_no = le32_to_cpu(ref[i]); + + if (blk_no == 0) { *id += level_inc; continue; } + ret = do_check_range(info->dqi_sb, "block", blk_no, 0, + info->dqi_blocks - 1); + if (ret) + goto out_buf; if (depth == info->dqi_qtree_depth - 1) { ret = 0; goto out_buf; } - ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1); + ret = find_next_id(info, id, blk_no, depth + 1); if (ret != -ENOENT) break; } diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ba3525ccc27e..cb240eac5036 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -203,9 +203,9 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long maxpages, lpages, nr, loop, ret; + unsigned long maxpages, lpages, nr_folios, loop, ret, nr_pages, pfn; struct inode *inode = file_inode(file); - struct page **pages = NULL, **ptr, *page; + struct folio_batch fbatch; loff_t isize; /* the mapping mustn't extend beyond the EOF */ @@ -221,31 +221,39 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, goto out; /* gang-find the pages */ - pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto out_free; - - nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); - if (nr != lpages) - goto out_free_pages; /* leave if some pages were missing */ + folio_batch_init(&fbatch); + nr_pages = 0; +repeat: + nr_folios = filemap_get_folios_contig(inode->i_mapping, &pgoff, + ULONG_MAX, &fbatch); + if (!nr_folios) { + ret = -ENOSYS; + return ret; + } + if (ret == -ENOSYS) { + ret = (unsigned long) folio_address(fbatch.folios[0]); + pfn = folio_pfn(fbatch.folios[0]); + } /* check the pages for physical adjacency */ - ptr = pages; - page = *ptr++; - page++; - for (loop = lpages; loop > 1; loop--) - if (*ptr++ != page++) - goto out_free_pages; + for (loop = 0; loop < nr_folios; loop++) { + if (pfn + nr_pages != folio_pfn(fbatch.folios[loop])) { + ret = -ENOSYS; + goto out_free; /* leave if not physical adjacent */ + } + nr_pages += folio_nr_pages(fbatch.folios[loop]); + if (nr_pages >= lpages) + goto out_free; /* successfully found desired pages*/ + } + if (nr_pages < lpages) { + folio_batch_release(&fbatch); + goto repeat; /* loop if pages are missing */ + } /* okay - all conditions fulfilled */ - ret = (unsigned long) page_address(pages[0]); -out_free_pages: - ptr = pages; - for (loop = nr; loop > 0; loop--) - put_page(*ptr++); out_free: - kfree(pages); + folio_batch_release(&fbatch); out: return ret; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index bc66d0173e33..b3257e852820 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -146,15 +146,15 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, } static int ramfs_tmpfile(struct user_namespace *mnt_userns, - struct inode *dir, struct dentry *dentry, umode_t mode) + struct inode *dir, struct file *file, 
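ramfs_nommu_get_unmapped_area() above moves from find_get_pages_contig() to the folio_batch API. The general shape of that walk, as a kernel-context sketch (walk_contig() is a hypothetical helper):

#include <linux/pagemap.h>
#include <linux/pagevec.h>

static void walk_contig(struct address_space *mapping, pgoff_t index)
{
	struct folio_batch fbatch;
	unsigned int i, nr;

	folio_batch_init(&fbatch);
	while ((nr = filemap_get_folios_contig(mapping, &index,
					       ULONG_MAX, &fbatch))) {
		for (i = 0; i < nr; i++) {
			/* ... inspect fbatch.folios[i], e.g. folio_pfn() ... */
		}
		folio_batch_release(&fbatch);	/* drops the references */
	}
}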
umode_t mode) { struct inode *inode; inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); if (!inode) return -ENOSPC; - d_tmpfile(dentry, inode); - return 0; + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } static const struct inode_operations ramfs_dir_inode_operations = { diff --git a/fs/read_write.c b/fs/read_write.c index 1a261dcf1778..7a2ff6157eda 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -384,7 +384,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); - iov_iter_ubuf(&iter, READ, buf, len); + iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -424,7 +424,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; - iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len); + iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) @@ -486,7 +486,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); - iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len); + iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t } /* caller is responsible for file_start_write/file_end_write */ -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { - struct kvec iov = { - .iov_base = (void *)buf, - .iov_len = min_t(size_t, count, MAX_RW_COUNT), - }; struct kiocb kiocb; - struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? 
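The iov_iter constructors above swap the old READ/WRITE direction arguments for ITER_DEST and ITER_SOURCE, naming the buffer's role rather than the I/O direction. The write-side setup, sketched (kernel context; init_write_iter() is a hypothetical helper):

#include <linux/uio.h>

static void init_write_iter(struct iov_iter *iter, struct kvec *iov,
			    const void *buf, size_t len)
{
	iov->iov_base = (void *)buf;
	iov->iov_len = len;
	/* data flows out of this buffer into the file: ITER_SOURCE */
	iov_iter_kvec(iter, ITER_SOURCE, iov, 1, len);
}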
*pos : 0; - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); - ret = file->f_op->write_iter(&kiocb, &iter); + ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } + +/* caller is responsible for file_start_write/file_end_write */ +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +{ + struct kvec iov = { + .iov_base = (void *)buf, + .iov_len = min_t(size_t, count, MAX_RW_COUNT), + }; + struct iov_iter iter; + iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); + return __kernel_write_iter(file, &iter, pos); +} /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually @@ -905,7 +911,7 @@ static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, struct iov_iter iter; ssize_t ret; - ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { ret = do_iter_read(file, &iter, pos, flags); kfree(iov); @@ -922,7 +928,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, struct iov_iter iter; ssize_t ret; - ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { file_start_write(file); ret = do_iter_write(file, &iter, pos, flags); @@ -1382,6 +1388,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { + lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); + return do_splice_direct(file_in, &pos_in, file_out, &pos_out, len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); } @@ -1418,7 +1426,9 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ - if (file_out->f_op->copy_file_range) { + if (flags & COPY_FILE_SPLICE) { + /* cross sb splice is allowed */ + } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; @@ -1468,8 +1478,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, size_t len, unsigned int flags) { ssize_t ret; + bool splice = flags & COPY_FILE_SPLICE; - if (flags != 0) + if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, @@ -1495,14 +1506,14 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. 
*/ - if (file_out->f_op->copy_file_range) { + if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); goto done; } - if (file_in->f_op->remap_file_range && + if (!splice && file_in->f_op->remap_file_range && file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, @@ -1522,6 +1533,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). + * + * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. */ ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); @@ -1576,6 +1589,10 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, pos_out = f_out.file->f_pos; } + ret = -EINVAL; + if (flags != 0) + goto out; + ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, flags); if (ret > 0) { diff --git a/fs/readdir.c b/fs/readdir.c index 09e8ed7d4161..9c53edb60c03 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -140,7 +140,7 @@ struct readdir_callback { int result; }; -static int fillonedir(struct dir_context *ctx, const char *name, int namlen, +static bool fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_callback *buf = @@ -149,14 +149,14 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, unsigned long d_ino; if (buf->result) - return -EINVAL; + return false; buf->result = verify_dirent_name(name, namlen); - if (buf->result < 0) - return buf->result; + if (buf->result) + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; - return -EOVERFLOW; + return false; } buf->result++; dirent = buf->dirent; @@ -169,12 +169,12 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); - return 0; + return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(old_readdir, unsigned int, fd, @@ -219,7 +219,7 @@ struct getdents_callback { int error; }; -static int filldir(struct dir_context *ctx, const char *name, int namlen, +static bool filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent __user *dirent, *prev; @@ -232,18 +232,18 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. 
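The copy_file_range() syscall entry above now rejects any nonzero flags before reaching vfs_copy_file_range(); COPY_FILE_SPLICE stays internal to kernel callers such as nfsd. A minimal userspace copy loop over the syscall (glibc 2.27 or later):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int in, out;
	ssize_t n;

	if (argc < 3)
		return 1;
	in = open(argv[1], O_RDONLY);
	out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (in < 0 || out < 0)
		return 1;
	/* the flags argument (last) must be 0 from userspace */
	while ((n = copy_file_range(in, NULL, out, NULL, 1 << 20, 0)) > 0)
		;
	if (n < 0)
		perror("copy_file_range");
	return n < 0;
}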
*/ if (reclen > buf->count) - return -EINVAL; + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; - return -EOVERFLOW; + return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -260,12 +260,12 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(getdents, unsigned int, fd, @@ -307,7 +307,7 @@ struct getdents_callback64 { int error; }; -static int filldir64(struct dir_context *ctx, const char *name, int namlen, +static bool filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent, *prev; @@ -319,13 +319,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) - return -EINVAL; + return false; prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -342,13 +342,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(getdents64, unsigned int, fd, @@ -397,7 +397,7 @@ struct compat_readdir_callback { int result; }; -static int compat_fillonedir(struct dir_context *ctx, const char *name, +static bool compat_fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -407,14 +407,14 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, compat_ulong_t d_ino; if (buf->result) - return -EINVAL; + return false; buf->result = verify_dirent_name(name, namlen); - if (buf->result < 0) - return buf->result; + if (buf->result) + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; - return -EOVERFLOW; + return false; } buf->result++; dirent = buf->dirent; @@ -427,12 +427,12 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); - return 0; + return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; - return -EFAULT; + return false; } COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, @@ -471,7 +471,7 @@ struct compat_getdents_callback { int error; }; -static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, +static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_linux_dirent __user *dirent, *prev; @@ -484,18 +484,18 @@ 
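The filldir-family callbacks above change from returning 0/-errno to returning bool (keep going or stop), with the real error latched in the callback's private buffer and collected after iterate_dir() returns. The syscall ABI is untouched; a basic getdents64 consumer for reference:

#define _GNU_SOURCE
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open(".", O_RDONLY | O_DIRECTORY);
	long n = syscall(SYS_getdents64, fd, buf, sizeof(buf));
	long off;

	for (off = 0; off < n; ) {
		struct dirent64 *d = (struct dirent64 *)(buf + off);

		printf("%s\n", d->d_name);
		off += d->d_reclen;	/* entries are variable-length */
	}
	return n < 0;
}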
static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) - return -EINVAL; + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; - return -EOVERFLOW; + return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -511,12 +511,12 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index d9052b8ce6dd..29c503a06db4 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,9 +49,9 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); -int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); -int reiserfs_acl_chmod(struct inode *inode); +int reiserfs_acl_chmod(struct dentry *dentry); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode); @@ -63,7 +63,7 @@ int reiserfs_cache_default_acl(struct inode *dir); #define reiserfs_get_acl NULL #define reiserfs_set_acl NULL -static inline int reiserfs_acl_chmod(struct inode *inode) +static inline int reiserfs_acl_chmod(struct dentry *dentry) { return 0; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 6e228bfbe7ef..467d13da198f 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -256,7 +256,7 @@ const struct inode_operations reiserfs_file_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index b9580a6515ee..c7d1fa526dea 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3404,7 +3404,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!error && reiserfs_posixacl(inode->i_sb)) { if (attr->ia_valid & ATTR_MODE) - error = reiserfs_acl_chmod(inode); + error = reiserfs_acl_chmod(dentry); } out: diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 94addfcefede..9f62da7471c9 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -868,7 +868,7 @@ loop_next: */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); - ll_rw_block(REQ_OP_WRITE, 1, &bh); + write_dirty_buffer(bh, 0); spin_lock(lock); } put_bh(bh); @@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s, if (tbh) { if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_WRITE, 1, &tbh); + write_dirty_buffer(tbh, 
0); reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; @@ -2240,7 +2240,7 @@ abort_replay: } } /* read in the log blocks, memcpy to the corresponding real block */ - ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks); + bh_read_batch(get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); @@ -2342,10 +2342,11 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } else bhlist[j++] = bh; } - ll_rw_block(REQ_OP_READ, j, bhlist); + bh = bhlist[0]; + bh_read_nowait(bh, 0); + bh_readahead_batch(j - 1, &bhlist[1], 0); for (i = 1; i < j; i++) brelse(bhlist[i]); - bh = bhlist[0]; wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 3d7a35d6a18b..0b8aa99749f1 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -696,6 +696,7 @@ static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir, out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -779,6 +780,7 @@ static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -878,6 +880,7 @@ static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -1194,6 +1197,7 @@ static int reiserfs_symlink(struct user_namespace *mnt_userns, retval = journal_end(&th); out_failed: reiserfs_write_unlock(parent_dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -1659,7 +1663,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, @@ -1683,6 +1687,6 @@ const struct inode_operations reiserfs_special_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 30319dc33c18..84a194b77f19 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -456,7 +456,7 @@ static int print_internal(struct buffer_head *bh, int first, int last) to = B_NR_ITEMS(bh); } else { from = first; - to = last < B_NR_ITEMS(bh) ? 
last : B_NR_ITEMS(bh); + to = min_t(int, last, B_NR_ITEMS(bh)); } reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 4a7cb16e9345..3dba8acf4e83 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -411,7 +411,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -441,7 +441,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 8096c74c38ac..7b498a0d060b 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -97,7 +97,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) * using the copy_size var below allows this code to work for * both shrinking and expanding the FS. */ - copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr; + copy_size = min(bmap_nr_new, bmap_nr); copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *); for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 9a293609a022..84c12a1947b2 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s, if (!buffer_uptodate(bh[j])) { if (depth == -1) depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j); + bh_readahead(bh[j], REQ_RAHEAD); } brelse(bh[j]); } @@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, if (!buffer_uptodate(bh) && depth == -1) depth = reiserfs_write_unlock_nested(sb); - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); wait_on_buffer(bh); if (depth != -1) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c88cd2ce0665..929acce6e731 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1702,9 +1702,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s)); - wait_on_buffer(SB_BUFFER_WITH_SB(s)); - if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; } @@ -2504,9 +2502,7 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, len = i_size - off; toread = len; while (toread > 0) { - tocopy = - sb->s_blocksize - offset < - toread ? sb->s_blocksize - offset : toread; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; /* * Quota files are without tails so we can safely @@ -2554,8 +2550,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, return -EIO; } while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? 
- sb->s_blocksize - offset : towrite; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; reiserfs_write_lock(sb); err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 436641369283..8b2d52443f41 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -189,7 +189,7 @@ struct reiserfs_dentry_buf { struct dentry *dentries[8]; }; -static int +static bool fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -200,16 +200,16 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) - return -ENOSPC; + return false; if (name[0] == '.' && (namelen < 2 || (namelen == 2 && name[1] == '.'))) - return 0; + return true; dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { dbuf->err = PTR_ERR(dentry); - return PTR_ERR(dentry); + return false; } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? */ reiserfs_error(dentry->d_sb, "xattr-20003", @@ -218,11 +218,11 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry, dbuf->xadir); dput(dentry); dbuf->err = -EIO; - return -EIO; + return false; } dbuf->dentries[dbuf->count++] = dentry; - return 0; + return true; } static void @@ -797,7 +797,7 @@ struct listxattr_buf { struct dentry *dentry; }; -static int listxattr_filler(struct dir_context *ctx, const char *name, +static bool listxattr_filler(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -813,19 +813,19 @@ static int listxattr_filler(struct dir_context *ctx, const char *name, name); if (!handler /* Unsupported xattr name */ || (handler->list && !handler->list(b->dentry))) - return 0; + return true; size = namelen + 1; if (b->buf) { if (b->pos + size > b->size) { b->pos = -ERANGE; - return -ERANGE; + return false; } memcpy(b->buf + b->pos, name, namelen); b->buf[b->pos + namelen] = 0; } b->pos += size; } - return 0; + return true; } /* diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d6fcddc46f5b..93fe414fed18 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error, error2; @@ -26,6 +26,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, size_t jcreate_blocks; int size = acl ? 
posix_acl_xattr_size(acl->a_count) : 0; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; /* @@ -371,7 +372,7 @@ int reiserfs_cache_default_acl(struct inode *inode) if (IS_PRIVATE(inode)) return 0; - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (acl && !IS_ERR(acl)) { int size = reiserfs_acl_size(acl->a_count); @@ -396,13 +397,15 @@ int reiserfs_cache_default_acl(struct inode *inode) /* * Called under i_mutex */ -int reiserfs_acl_chmod(struct inode *inode) +int reiserfs_acl_chmod(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); + if (IS_PRIVATE(inode)) return 0; if (get_inode_sd_version(inode) == STAT_DATA_V1 || !reiserfs_posixacl(inode->i_sb)) return 0; - return posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 8965c8e5e172..857a65b05726 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -50,6 +50,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, int error; sec->name = NULL; + sec->value = NULL; /* Don't add selinux attributes on xattrs - they'll never get used */ if (IS_PRIVATE(dir)) @@ -95,7 +96,6 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th, void reiserfs_security_free(struct reiserfs_security_handle *sec) { - kfree(sec->name); kfree(sec->value); sec->name = NULL; sec->value = NULL; diff --git a/fs/remap_range.c b/fs/remap_range.c index 654912d06862..41f60477bb41 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -304,7 +304,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret) + if (ret || *len == 0) return ret; /* Wait for the completion of any pending IOs on both files */ @@ -328,9 +328,6 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - if (*len == 0) - return 0; - if (!IS_DAX(inode_in)) ret = vfs_dedupe_file_range_compare(file_in, pos_in, file_out, pos_out, *len, &is_same); @@ -348,7 +345,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, remap_flags); - if (ret) + if (ret || *len == 0) return ret; /* If can't alter the file contents, we're done. 
*/ @@ -429,7 +426,7 @@ static bool allow_file_dedupe(struct file *file) return true; if (file->f_mode & FMODE_WRITE) return true; - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) return true; if (!inode_permission(mnt_userns, inode, MAY_WRITE)) return true; diff --git a/fs/seq_file.c b/fs/seq_file.c index 9456a2032224..f5fdaf3b1572 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -156,7 +156,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) ssize_t ret; init_sync_kiocb(&kiocb, file); - iov_iter_init(&iter, READ, &iov, 1, size); + iov_iter_init(&iter, ITER_DEST, &iov, 1, size); kiocb.ki_pos = *ppos; ret = seq_read_iter(&kiocb, &iter); diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 2cab413fffee..7d605db3bb3b 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -1101,7 +1101,11 @@ struct smb2_change_notify_rsp { #define SMB2_CREATE_REQUEST_LEASE "RqLs" #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" -#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" +#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" +#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74" +#define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10" +#define SVHDX_OPEN_DEVICE_CONTEXT "\x9C\xCB\xCF\x9E\x04\xC1\xE6\x43\x98\x0E\x15\x8D\xA1\xF6\xEC\x83" +#define SMB2_CREATE_TAG_AAPL "AAPL" /* Flag (SMB3 open response) values */ #define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 diff --git a/fs/splice.c b/fs/splice.c index 0878b852b355..5969b7a1d353 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -303,7 +303,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; int ret; - iov_iter_pipe(&to, READ, pipe, len); + iov_iter_pipe(&to, ITER_DEST, pipe, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); @@ -682,7 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, n++; } - iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); + iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; @@ -1263,9 +1263,9 @@ static int vmsplice_type(struct fd f, int *type) if (!f.file) return -EBADF; if (f.file->f_mode & FMODE_WRITE) { - *type = WRITE; + *type = ITER_SOURCE; } else if (f.file->f_mode & FMODE_READ) { - *type = READ; + *type = ITER_DEST; } else { fdput(f); return -EBADF; @@ -1314,7 +1314,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, if (!iov_iter_count(&iter)) error = 0; - else if (iov_iter_rw(&iter) == WRITE) + else if (type == ITER_SOURCE) error = vmsplice_to_pipe(f.file, &iter, flags); else error = vmsplice_to_user(f.file, &iter, flags); diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 916e78fabcaa..60fc98bdf421 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -54,9 +54,35 @@ config SQUASHFS_FILE_DIRECT endchoice +config SQUASHFS_DECOMP_SINGLE + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI_PERCPU + depends on SQUASHFS + def_bool n + +config SQUASHFS_CHOICE_DECOMP_BY_MOUNT + bool "Select the parallel 
decompression mode during mount" + depends on SQUASHFS + default n + select SQUASHFS_DECOMP_SINGLE + select SQUASHFS_DECOMP_MULTI + select SQUASHFS_DECOMP_MULTI_PERCPU + select SQUASHFS_MOUNT_DECOMP_THREADS + help + Compile all parallel decompression modes and specify the + decompression mode by setting "threads=" during mount. + The default decompressor parallelisation mode is SQUASHFS_DECOMP_SINGLE. + choice - prompt "Decompressor parallelisation options" + prompt "Select decompression parallel mode at compile time" depends on SQUASHFS + depends on !SQUASHFS_CHOICE_DECOMP_BY_MOUNT help Squashfs now supports three parallelisation options for decompression. Each one exhibits various trade-offs between @@ -64,15 +90,17 @@ choice If in doubt, select "Single threaded compression" -config SQUASHFS_DECOMP_SINGLE +config SQUASHFS_COMPILE_DECOMP_SINGLE bool "Single threaded compression" + select SQUASHFS_DECOMP_SINGLE help Traditionally Squashfs has used single-threaded decompression. Only one block (data or metadata) can be decompressed at any one time. This limits CPU and memory usage to a minimum. -config SQUASHFS_DECOMP_MULTI +config SQUASHFS_COMPILE_DECOMP_MULTI bool "Use multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -85,8 +113,9 @@ config SQUASHFS_DECOMP_MULTI decompressors per core. It dynamically allocates decompressors on a demand basis. -config SQUASHFS_DECOMP_MULTI_PERCPU +config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU bool "Use percpu multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI_PERCPU help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -95,9 +124,21 @@ config SQUASHFS_DECOMP_MULTI_PERCPU This decompressor implementation uses a maximum of one decompressor per core. It uses percpu variables to ensure decompression is load-balanced across the cores. - endchoice +config SQUASHFS_MOUNT_DECOMP_THREADS + bool "Add the mount parameter 'threads=' for squashfs" + depends on SQUASHFS + depends on SQUASHFS_DECOMP_MULTI + default n + help + Use the "threads=" mount option to select the decompression parallel + mode and the number of threads. + If SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y: + threads=<single|multi|percpu|1|2|3|...> + otherwise: + threads=<2|3|...> + The upper limit is num_online_cpus() * 2.
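The hunks that follow back these options by replacing squashfs's compile-time decompressor entry points with a per-superblock operations table (struct squashfs_decompressor_thread_ops, reached via msblk->thread_ops), so the parallelisation mode can be chosen per mount. As a rough, self-contained illustration of that dispatch pattern — all names below (thread_ops, pick_ops, the stub functions and the hard-coded CPU count) are hypothetical stand-ins, not kernel symbols:

	/*
	 * Minimal user-space sketch of an operations-table dispatch:
	 * a mount-time option string picks one vtable, and every call
	 * site goes through the table instead of a fixed function.
	 */
	#include <stdio.h>
	#include <string.h>

	struct thread_ops {
		const char *name;
		int (*max_decompressors)(void);
	};

	static int single_max(void) { return 1; }
	static int multi_max(void) { return 2 * 4; /* stand-in for 2 * num_online_cpus() */ }

	static const struct thread_ops single_ops = { "single", single_max };
	static const struct thread_ops multi_ops = { "multi", multi_max };

	/* Select the table from an option string; call sites never change. */
	static const struct thread_ops *pick_ops(const char *opt)
	{
		return (opt && strcmp(opt, "multi") == 0) ? &multi_ops : &single_ops;
	}

	int main(void)
	{
		const struct thread_ops *ops = pick_ops("multi");

		printf("threads=%s allows up to %d decompressors\n",
		       ops->name, ops->max_decompressors());
		return 0;
	}

In the kernel hunks the table is chosen either from the threads= mount option or at compile time in squashfs_init_fs_context(), and call sites such as squashfs_read_data() dispatch through msblk->thread_ops->decompress().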
+ config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 833aca92301f..bed3bb8b27fa 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -216,7 +216,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - res = squashfs_decompress(msblk, bio, offset, length, output); + res = msblk->thread_ops->decompress(msblk, bio, offset, length, output); } else { res = copy_bio_to_actor(bio, output, offset, length); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index d57bef91ab08..8893cb9b4198 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -134,7 +134,7 @@ void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) if (IS_ERR(comp_opts)) return comp_opts; - stream = squashfs_decompressor_create(msblk, comp_opts); + stream = msblk->thread_ops->create(msblk, comp_opts); if (IS_ERR(stream)) kfree(comp_opts); diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index db9f12a3ea05..416c53eedbd1 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -29,12 +29,11 @@ #define MAX_DECOMPRESSOR (num_online_cpus() * 2) -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return MAX_DECOMPRESSOR; } - struct squashfs_stream { void *comp_opts; struct list_head strm_list; @@ -59,7 +58,7 @@ static void put_decomp_stream(struct decomp_stream *decomp_strm, wake_up(&stream->wait); } -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -103,7 +102,7 @@ out: } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; if (stream) { @@ -145,7 +144,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, * If there is no available decomp and already full, * let's wait for releasing decomp from other users. 
*/ - if (stream->avail_decomp >= MAX_DECOMPRESSOR) + if (stream->avail_decomp >= msblk->max_thread_num) goto wait; /* Let's allocate new decomp */ @@ -161,7 +160,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, } stream->avail_decomp++; - WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR); + WARN_ON(stream->avail_decomp > msblk->max_thread_num); mutex_unlock(&stream->mutex); break; @@ -180,7 +179,7 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -195,3 +194,10 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, msblk->decompressor->name); return res; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index b881b9283b7f..1dfadf76ed9a 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -25,7 +25,7 @@ struct squashfs_stream { local_lock_t lock; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -59,7 +59,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream __percpu *percpu = (struct squashfs_stream __percpu *) msblk->stream; @@ -75,19 +75,21 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; int res; - local_lock(&msblk->stream->lock); - stream = this_cpu_ptr(msblk->stream); + local_lock(&percpu->lock); + stream = this_cpu_ptr(percpu); res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); - local_unlock(&msblk->stream->lock); + local_unlock(&percpu->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", @@ -96,7 +98,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return num_possible_cpus(); } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 4eb3d083d45e..6f161887710b 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -24,7 +24,7 @@ struct squashfs_stream { struct mutex mutex; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { 
struct squashfs_stream *stream; @@ -49,7 +49,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; @@ -59,7 +59,7 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -78,7 +78,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return 1; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_single = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e56510964b22..8ba8c4c50770 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -506,8 +506,9 @@ static int squashfs_readahead_fragment(struct page **page, squashfs_i(inode)->fragment_size); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + int error = buffer->error; - if (buffer->error) + if (error) goto out; expected += squashfs_i(inode)->fragment_offset; @@ -529,7 +530,7 @@ static int squashfs_readahead_fragment(struct page **page, out: squashfs_cache_put(buffer); - return buffer->error; + return error; } static void squashfs_readahead(struct readahead_control *ractl) @@ -557,6 +558,13 @@ static void squashfs_readahead(struct readahead_control *ractl) int res, bsize; u64 block = 0; unsigned int expected; + struct page *last_page; + + expected = start >> msblk->block_log == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + max_pages = (expected + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = __readahead_batch(ractl, pages, max_pages); if (!nr_pages) @@ -566,13 +574,10 @@ static void squashfs_readahead(struct readahead_control *ractl) goto skip_pages; index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) goto skip_pages; - expected = index == file_end ? 
- (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - if (index == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, @@ -593,15 +598,15 @@ static void squashfs_readahead(struct readahead_control *ractl) res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - squashfs_page_actor_free(actor); + last_page = squashfs_page_actor_free(actor); if (res == expected) { int bytes; /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (pages[nr_pages - 1]->index == file_end && bytes) - memzero_page(pages[nr_pages - 1], bytes, + if (index == file_end && bytes && last_page) + memzero_page(last_page, bytes, PAGE_SIZE - bytes); for (i = 0; i < nr_pages; i++) { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 54b93bf4a25c..81af6c4ca115 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -71,11 +71,13 @@ static void *handle_next_page(struct squashfs_page_actor *actor) (actor->next_index != actor->page[actor->next_page]->index)) { actor->next_index++; actor->returned_pages++; + actor->last_page = NULL; return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM); } actor->next_index++; actor->returned_pages++; + actor->last_page = actor->page[actor->next_page]; return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); } @@ -125,6 +127,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->returned_pages = 0; actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 95ffbb543d91..97d4983559b1 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -16,6 +16,7 @@ struct squashfs_page_actor { void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); + struct page *last_page; int pages; int length; int next_page; @@ -29,10 +30,13 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, struct page **page, int pages, int length); -static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor) +static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { + struct page *last_page = actor->last_page; + kfree(actor->tmp_buffer); kfree(actor); + return last_page; } static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9783e01c8100..a6164fdf9435 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -38,11 +38,24 @@ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ -extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); -extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, - int, int, struct squashfs_page_actor *); -extern int 
squashfs_max_decompressors(void); + +struct squashfs_decompressor_thread_ops { + void * (*create)(struct squashfs_sb_info *msblk, void *comp_opts); + void (*destroy)(struct squashfs_sb_info *msblk); + int (*decompress)(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor *output); + int (*max_decompressors)(void); +}; + +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_single; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu; +#endif /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1e90c2575f9b..659082e9e51d 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -53,7 +53,7 @@ struct squashfs_sb_info { __le64 *xattr_id_table; struct mutex meta_index_mutex; struct meta_index *meta_index; - struct squashfs_stream *stream; + void *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; @@ -66,5 +66,7 @@ struct squashfs_sb_info { int xattr_ids; unsigned int ids; bool panic_on_errors; + const struct squashfs_decompressor_thread_ops *thread_ops; + int max_thread_num; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 32565dafa7f3..e090fae48e68 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -47,10 +47,13 @@ enum Opt_errors { enum squashfs_param { Opt_errors, + Opt_threads, }; struct squashfs_mount_opts { enum Opt_errors errors; + const struct squashfs_decompressor_thread_ops *thread_ops; + int thread_num; }; static const struct constant_table squashfs_param_errors[] = { @@ -61,9 +64,66 @@ static const struct constant_table squashfs_param_errors[] = { static const struct fs_parameter_spec squashfs_fs_parameters[] = { fsparam_enum("errors", Opt_errors, squashfs_param_errors), + fsparam_string("threads", Opt_threads), {} }; + +static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (strcmp(str, "single") == 0) { + opts->thread_ops = &squashfs_decompressor_single; + return 0; + } + if (strcmp(str, "multi") == 0) { + opts->thread_ops = &squashfs_decompressor_multi; + return 0; + } + if (strcmp(str, "percpu") == 0) { + opts->thread_ops = &squashfs_decompressor_percpu; + return 0; + } +#endif + return -EINVAL; +} + +static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + int ret; + unsigned long num; + + ret = kstrtoul(str, 0, &num); + if (ret != 0) + return -EINVAL; + if (num > 1) { + opts->thread_ops = &squashfs_decompressor_multi; + if (num > opts->thread_ops->max_decompressors()) + return -EINVAL; + opts->thread_num = (int)num; + return 0; + } +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + if (num == 1) { + opts->thread_ops = &squashfs_decompressor_single; + opts->thread_num = 1; + return 0; + } +#endif +#endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */ + return -EINVAL; +} + +static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) +{ + int ret = squashfs_parse_param_threads_str(str, opts); + + if (ret == 0) + return ret; + return squashfs_parse_param_threads_num(str, 
opts); +} + static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; @@ -78,6 +138,10 @@ static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *para case Opt_errors: opts->errors = result.uint_32; break; + case Opt_threads: + if (squashfs_parse_param_threads(param->string, opts) != 0) + return -EINVAL; + break; default: return -EINVAL; } @@ -133,6 +197,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; } msblk = sb->s_fs_info; + msblk->thread_ops = opts->thread_ops; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); @@ -168,6 +233,12 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) goto failed_mount; } + if (opts->thread_num == 0) { + msblk->max_thread_num = msblk->thread_ops->max_decompressors(); + } else { + msblk->max_thread_num = opts->thread_num; + } + /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( fc, @@ -252,7 +323,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - squashfs_max_decompressors(), msblk->block_size); + msblk->max_thread_num, msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; @@ -383,7 +454,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); - squashfs_decompressor_destroy(msblk); + msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -435,6 +506,19 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root) else seq_puts(s, ",errors=continue"); +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (msblk->thread_ops == &squashfs_decompressor_single) { + seq_puts(s, ",threads=single"); + return 0; + } + if (msblk->thread_ops == &squashfs_decompressor_percpu) { + seq_puts(s, ",threads=percpu"); + return 0; + } +#endif +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + seq_printf(s, ",threads=%d", msblk->max_thread_num); +#endif return 0; } @@ -446,6 +530,16 @@ static int squashfs_init_fs_context(struct fs_context *fc) if (!opts) return -ENOMEM; +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + opts->thread_ops = &squashfs_decompressor_single; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI) + opts->thread_ops = &squashfs_decompressor_multi; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) + opts->thread_ops = &squashfs_decompressor_percpu; +#else +#error "fail: unknown squashfs decompression thread mode?" 
+#endif + opts->thread_num = 0; fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; @@ -478,7 +572,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_decompressor_destroy(sbi); + sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); @@ -568,7 +662,7 @@ static struct file_system_type squashfs_fs_type = { .init_fs_context = squashfs_init_fs_context, .parameters = squashfs_fs_parameters, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("squashfs"); diff --git a/fs/stat.c b/fs/stat.c index 9ced8860e0f3..d6cc74ca8486 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -5,6 +5,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include <linux/blkdev.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/errno.h> @@ -43,12 +44,15 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, struct kstat *stat) { + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; @@ -230,11 +234,22 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; + + /* Handle STATX_DIOALIGN for block devices. */ + if (request_mask & STATX_DIOALIGN) { + struct inode *inode = d_backing_inode(path.dentry); + + if (S_ISBLK(inode->i_mode)) + bdev_statx_dioalign(inode, stat); + } + path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -611,6 +626,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); tmp.stx_mnt_id = stat->mnt_id; + tmp.stx_dio_mem_align = stat->dio_mem_align; + tmp.stx_dio_offset_align = stat->dio_offset_align; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/fs/super.c b/fs/super.c index 734ed584a946..12c08cb20405 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,7 +291,7 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); - fscrypt_sb_free(s); + fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -480,6 +480,7 @@ void generic_shutdown_super(struct super_block *sb) evict_inodes(sb); /* only nonzero refcount inodes can have marks */ fsnotify_sb_delete(sb); + fscrypt_destroy_keyring(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { @@ -1111,55 +1112,14 @@ static int test_single_super(struct super_block *s, struct fs_context *fc) return 1; } -/** - * vfs_get_super - Get a superblock with a search key set in s_fs_info. 
- * @fc: The filesystem context holding the parameters - * @keying: How to distinguish superblocks - * @fill_super: Helper to initialise a new superblock - * - * Search for a superblock and create a new one if not found. The search - * criterion is controlled by @keying. If the search fails, a new superblock - * is created and @fill_super() is called to initialise it. - * - * @keying can take one of a number of values: - * - * (1) vfs_get_single_super - Only one superblock of this type may exist on the - * system. This is typically used for special system filesystems. - * - * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have - * distinct keys (where the key is in s_fs_info). Searching for the same - * key again will turn up the superblock for that key. - * - * (3) vfs_get_independent_super - Multiple superblocks may exist and are - * unkeyed. Each call will get a new superblock. - * - * A permissions check is made by sget_fc() unless we're getting a superblock - * for a kernel-internal mount or a submount. - */ -int vfs_get_super(struct fs_context *fc, - enum vfs_get_super_keying keying, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)) +static int vfs_get_super(struct fs_context *fc, bool reconf, + int (*test)(struct super_block *, struct fs_context *), + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) { - int (*test)(struct super_block *, struct fs_context *); struct super_block *sb; int err; - switch (keying) { - case vfs_get_single_super: - case vfs_get_single_reconf_super: - test = test_single_super; - break; - case vfs_get_keyed_super: - test = test_keyed_super; - break; - case vfs_get_independent_super: - test = NULL; - break; - default: - BUG(); - } - sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); @@ -1173,7 +1133,7 @@ int vfs_get_super(struct fs_context *fc, fc->root = dget(sb->s_root); } else { fc->root = dget(sb->s_root); - if (keying == vfs_get_single_reconf_super) { + if (reconf) { err = reconfigure_super(fc); if (err < 0) { dput(fc->root); @@ -1189,13 +1149,12 @@ error: deactivate_locked_super(sb); return err; } -EXPORT_SYMBOL(vfs_get_super); int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, vfs_get_independent_super, fill_super); + return vfs_get_super(fc, false, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); @@ -1203,7 +1162,7 @@ int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, vfs_get_single_super, fill_super); + return vfs_get_super(fc, false, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); @@ -1211,7 +1170,7 @@ int get_tree_single_reconf(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super); + return vfs_get_super(fc, true, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single_reconf); @@ -1221,7 +1180,7 @@ int get_tree_keyed(struct fs_context *fc, void *key) { fc->s_fs_info = key; - return vfs_get_super(fc, vfs_get_keyed_super, fill_super); + return vfs_get_super(fc, false, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index d4ec9bb97de9..3b8567564e7e 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -438,7 +438,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) res += blocks; 
direct = 1; } - return blocks; + return res; } int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index c57b46a352d8..3125e76376ee 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } +/** + * ubifs_encrypt - Encrypt data. + * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of the (possibly compressed) data to be encrypted + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypts the possibly-compressed data in the data node. + * The encrypted data length is stored in @out_len. + */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index fc718f6178f2..9c9d3f0e36a4 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2467,7 +2467,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!((prandom_u32() % out_of) + 1 <= n); + return !!(get_random_u32_below(out_of) + 1 <= n); } @@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail within 1 minute */ - delay = prandom_u32() % 60000; + delay = get_random_u32_below(60000); d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn(c, "failing after %lums", delay); } else { d->pc_delay = 2; - delay = prandom_u32() % 10000; + delay = get_random_u32_below(10000); /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn(c, "failing after %lu calls", delay); @@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32() % len; + from = get_random_u32_below(len); /* Corruption span max to end of write unit */ to = min(len, ALIGN(from + 1, c->max_write_size)); @@ -2581,7 +2581,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, if (ffs) memset(p + from, 0xFF, to - from); else - prandom_bytes(p + from, to - from); + get_random_bytes(p + from, to - from); return to; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 86151889548e..0f29cf201136 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is an xattr inode * * This function finds an unused inode number, allocates new inode and * initializes it. Returns new inode in case of success and an error code in * case of failure.
*/ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; - err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } } switch (mode & S_IFMT) { @@ -309,7 +312,7 @@ static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -370,7 +373,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -424,8 +427,9 @@ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) } static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, @@ -462,7 +466,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; } - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -475,7 +479,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); ubifs_assert(c, ui->dirty); instantiated = 1; @@ -489,7 +493,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, ubifs_release_budget(c, &req); - return 0; + return finish_open_simple(file, 0); out_cancel: unlock_2_inodes(dir, inode); @@ -872,7 +876,7 @@ out_fname: } /** - * check_dir_empty - check if a directory is empty or not. + * ubifs_check_dir_empty - check if a directory is empty or not. * @dir: VFS inode object of the directory to check * * This function checks if directory @dir is empty. 
Returns zero if the @@ -1004,7 +1008,7 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1091,7 +1095,7 @@ static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1173,7 +1177,7 @@ static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 75dab0ae3939..d02509920baf 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -503,7 +503,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) { if (c->double_hash) - dent->cookie = (__force __le32) prandom_u32(); + dent->cookie = (__force __le32) get_random_u32(); else dent->cookie = 0; } @@ -1472,23 +1472,25 @@ out_free: * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. */ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in } if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out; - out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1550,6 +1552,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, struct ubifs_trun_node *trun; struct ubifs_data_node *dn; int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int dn_size; struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1565,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); - sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; - sz += ubifs_auth_node_sz(c); + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE; + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c); ino = kmalloc(sz, 
GFP_NOFS); if (!ino) @@ -1596,15 +1602,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; } if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index d76a19e460cd..c4d079328b92 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) return 0; - if (prandom_u32() & 3) + if (get_random_u32_below(4)) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 58c92c96ecef..a55e04822d16 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !(prandom_u32() & 7)) + if (dbg_is_chk_index(c) && !get_random_u32_below(8)) return -ENOSPC; return 0; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 7d6d2f152e03..478bbbb5382f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2026,7 +2026,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e4c4761aff7f..3db8486e3725 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 
cad3772f9dbe..be640f4b2f2c 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); + bh_readahead_batch(num, bha, REQ_RAHEAD); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index a2adf6293093..16bcf2c6b8b3 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); + bh_readahead_batch(num, bha, REQ_RAHEAD); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/file.c b/fs/udf/file.c index 09aef77269fe..5c659e23e578 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -252,6 +252,7 @@ const struct file_operations udf_file_operations = { .release = udf_release_file, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8d06daed549f..1d7c2a812fc1 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -182,11 +182,6 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } -static int udf_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, udf_get_block, wbc); -} - static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -239,12 +234,12 @@ const struct address_space_operations udf_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = udf_read_folio, .readahead = udf_readahead, - .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, .write_end = generic_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, + .migrate_folio = buffer_migrate_folio, }; /* @@ -439,6 +434,12 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } + /* + * Block beyond EOF and prealloc extents? Just discard preallocation + * as it is not useful and complicates things. + */ + if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) + udf_discard_prealloc(inode); udf_clear_extent_cache(inode); phys = inode_getblk(inode, block, &err, &new); if (!phys) @@ -488,8 +489,6 @@ static int udf_do_extend_file(struct inode *inode, uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; - struct kernel_lb_addr prealloc_loc = {}; - uint32_t prealloc_len = 0; struct udf_inode_info *iinfo; int err; @@ -510,19 +509,6 @@ static int udf_do_extend_file(struct inode *inode, ~(sb->s_blocksize - 1); } - /* Last extent are just preallocated blocks? */ - if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == - EXT_NOT_RECORDED_ALLOCATED) { - /* Save the extent so that we can reattach it to the end */ - prealloc_loc = last_ext->extLocation; - prealloc_len = last_ext->extLength; - /* Mark the extent as a hole */ - last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); - last_ext->extLocation.logicalBlockNum = 0; - last_ext->extLocation.partitionReferenceNum = 0; - } - /* Can we merge with the previous extent? 
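 *
 * (Illustrative note, assuming the standard UDF extent encoding: a
 * not-recorded/not-allocated extent can only absorb new bytes while its
 * total length still fits in UDF_EXTENT_LENGTH_MASK; anything beyond
 * that spills into the additional extents created below.)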
*/ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { @@ -550,7 +536,7 @@ static int udf_do_extend_file(struct inode *inode, * more extents, we may need to enter possible following * empty indirect extent. */ - if (new_block_bytes || prealloc_len) + if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } @@ -584,17 +570,6 @@ static int udf_do_extend_file(struct inode *inode, } out: - /* Do we have some preallocated blocks saved? */ - if (prealloc_len) { - err = udf_add_aext(inode, last_pos, &prealloc_loc, - prealloc_len, 1); - if (err) - return err; - last_ext->extLocation = prealloc_loc; - last_ext->extLength = prealloc_len; - count++; - } - /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); @@ -610,13 +585,17 @@ out: static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, - uint32_t final_block_len) + uint32_t new_elen) { - struct super_block *sb = inode->i_sb; uint32_t added_bytes; - added_bytes = final_block_len - - (last_ext->extLength & (sb->s_blocksize - 1)); + /* + * Extent already large enough? It may be already rounded up to block + * size... + */ + if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) + return; + added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen; last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; @@ -633,12 +612,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; - unsigned long partial_final_block; + loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; - int within_final_block; + bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -647,8 +626,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) else BUG(); + /* + * When creating hole in file, just don't bother with preserving + * preallocation. It likely won't be very useful anyway. + */ + udf_discard_prealloc(inode); + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); - within_final_block = (etype != -1); + within_last_ext = (etype != -1); + /* We don't expect extents past EOF... */ + WARN_ON_ONCE(within_last_ext && + elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { @@ -664,19 +652,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) extent.extLength |= etype << 30; } - partial_final_block = newsize & (sb->s_blocksize - 1); + new_elen = ((loff_t)offset << inode->i_blkbits) | + (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? 
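 *
 * (That is, inode_bmap() found a last extent whose range already reaches
 * the first block touched by the extension, so only that extent's
 * recorded length needs to grow; no new extents are appended.)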
*/ - if (within_final_block) { + if (within_last_ext) { /* Extending file within the last file block */ - udf_do_extend_final_block(inode, &epos, &extent, - partial_final_block); + udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { - loff_t add = ((loff_t)offset << sb->s_blocksize_bits) | - partial_final_block; - err = udf_do_extend_file(inode, &epos, &extent, add); + err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) @@ -777,10 +763,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, goto out_free; } - /* Are we beyond EOF? */ + /* Are we beyond EOF and preallocated extent? */ if (etype == -1) { int ret; loff_t hole_len; + isBeyondEOF = true; if (count) { if (c) @@ -1211,13 +1198,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, if (!bh) return NULL; - if (buffer_uptodate(bh)) - return bh; - - ll_rw_block(REQ_OP_READ, 1, &bh); - - wait_on_buffer(bh); - if (buffer_uptodate(bh)) + if (bh_read(bh, 0) >= 0) return bh; brelse(bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index b3d5f97f16cd..7c95c549dd64 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -240,7 +240,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, poffset - lfi); else { if (!copy_name) { - copy_name = kmalloc(UDF_NAME_LEN, + copy_name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS); if (!copy_name) { fi = ERR_PTR(-ENOMEM); @@ -626,7 +626,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, } static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -640,9 +640,9 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); - return 0; + return finish_open_simple(file, 0); } static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, @@ -1091,8 +1091,9 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, return -EINVAL; ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (IS_ERR(ofi)) { - retval = PTR_ERR(ofi); + if (!ofi || IS_ERR(ofi)) { + if (IS_ERR(ofi)) + retval = PTR_ERR(ofi); goto end_rename; } @@ -1101,8 +1102,7 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, brelse(ofibh.sbh); tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) - != old_inode->i_ino) + if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) goto end_rename; nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); diff --git a/fs/udf/super.c b/fs/udf/super.c index 4042d9739fb7..06eda8177b5f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -162,7 +162,7 @@ static void udf_free_in_core_inode(struct inode *inode) static void init_once(void *foo) { - struct udf_inode_info *ei = (struct udf_inode_info *)foo; + struct udf_inode_info *ei = foo; ei->i_data = NULL; inode_init_once(&ei->vfs_inode); @@ -820,7 +820,7 @@ static int udf_find_fileset(struct super_block *sb, struct kernel_lb_addr *fileset, struct kernel_lb_addr *root) { - struct buffer_head *bh = NULL; + struct buffer_head *bh; uint16_t ident; int ret; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 532cda99644e..036ebd892b85 100644 --- a/fs/udf/truncate.c +++ 
b/fs/udf/truncate.c @@ -120,60 +120,42 @@ void udf_truncate_tail_extent(struct inode *inode) void udf_discard_prealloc(struct inode *inode) { - struct extent_position epos = { NULL, 0, {0, 0} }; + struct extent_position epos = {}; + struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1, netype; - int adsize; struct udf_inode_info *iinfo = UDF_I(inode); + int bsize = 1 << inode->i_blkbits; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || - inode->i_size == iinfo->i_lenExtents) + ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(struct short_ad); - else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(struct long_ad); - else - adsize = 0; - epos.block = iinfo->i_location; /* Find the last extent in the file */ - while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { - etype = netype; + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { + brelse(prev_epos.bh); + prev_epos = epos; + if (prev_epos.bh) + get_bh(prev_epos.bh); + + etype = udf_next_aext(inode, &epos, &eloc, &elen, 1); lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { - epos.offset -= adsize; lbcount -= elen; - extent_trunc(inode, &epos, &eloc, etype, elen, 0); - if (!epos.bh) { - iinfo->i_lenAlloc = - epos.offset - - udf_file_entry_alloc_offset(inode); - mark_inode_dirty(inode); - } else { - struct allocExtDesc *aed = - (struct allocExtDesc *)(epos.bh->b_data); - aed->lengthAllocDescs = - cpu_to_le32(epos.offset - - sizeof(struct allocExtDesc)); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos.bh->b_data, epos.offset); - else - udf_update_tag(epos.bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos.bh, inode); - } + udf_delete_aext(inode, prev_epos); + udf_free_blocks(inode->i_sb, inode, &eloc, 0, + DIV_ROUND_UP(elen, 1 << inode->i_blkbits)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; brelse(epos.bh); + brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4fa620543d30..291b56dd011e 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -6,7 +6,11 @@ #include <linux/bitops.h> #include <linux/magic.h> -#define UDF_MAX_READ_VERSION 0x0250 +/* + * Even UDF 2.6 media should have version <= 0x250 but apparently there are + * some broken filesystems with version set to 0x260. Accommodate those. 
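+ *
+ * Hedged sketch of the check this constant feeds (the caller is an
+ * assumption, not shown in this hunk):
+ *
+ *	if (minor_version > UDF_MAX_READ_VERSION)
+ *		return -EINVAL;
+ *
+ * so raising the cap to 0x0260 lets these otherwise-valid filesystems
+ * mount.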
+ */ +#define UDF_MAX_READ_VERSION 0x0260 #define UDF_MAX_WRITE_VERSION 0x0201 #define UDF_FLAG_USE_EXTENDED_FE 0 diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index bd810d8239f2..2436e3f82147 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -295,14 +295,10 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, oldb + pos); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - ufs_error(inode->i_sb, __func__, - "read of block failed\n"); - break; - } + if (bh_read(bh, 0) < 0) { + ufs_error(inode->i_sb, __func__, + "read of block failed\n"); + break; } UFSD(" change from %llu to %llu, pos %u\n", diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 175de70e3adf..98ac37e34e3d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/swapops.h> +#include <linux/miscdevice.h> int sysctl_unprivileged_userfaultfd __read_mostly; @@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; - if ((vmf->flags & FAULT_FLAG_USER) == 0 && - ctx->flags & UFFD_USER_MODE_ONLY) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) goto out; - } /* * If it's already released don't get it. This avoids to loop @@ -615,14 +611,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, if (release_new_ctx) { struct vm_area_struct *vma; struct mm_struct *mm = release_new_ctx->mm; + VMA_ITERATOR(vmi, mm, 0); /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; vma->vm_flags &= ~__VM_UFFD_FLAGS; } + } mmap_write_unlock(mm); userfaultfd_ctx_put(release_new_ctx); @@ -803,11 +801,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *unmaps) +int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *unmaps) { - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + VMA_ITERATOR(vmi, mm, start); + struct vm_area_struct *vma; + + for_each_vma_range(vmi, vma, end) { struct userfaultfd_unmap_ctx *unmap_ctx; struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; @@ -857,6 +857,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + MA_STATE(mas, &mm->mm_mt, 0, 0); WRITE_ONCE(ctx->released, true); @@ -873,7 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -887,10 +888,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, 
anon_vma_name(vma)); - if (prev) + if (prev) { + mas_pause(&mas); vma = prev; - else + } else { prev = vma; + } + vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -991,7 +995,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new, int fd; fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); + O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -1272,6 +1276,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; + MA_STATE(mas, &mm->mm_mt, 0, 0); user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1314,7 +1319,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1339,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1399,8 +1405,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1430,6 +1438,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto next; } @@ -1437,11 +1447,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = split_vma(mm, vma, start, 1); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } next: /* @@ -1458,8 +1472,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1503,6 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; + MA_STATE(mas, &mm->mm_mt, 0, 0); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1521,7 +1536,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1546,7 +1562,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, */ found = false; ret = -EINVAL; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1566,8 +1582,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1612,17 +1630,20 @@ static int 
userfaultfd_unregister(struct userfaultfd_ctx *ctx, NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; + mas_pause(&mas); goto next; } if (vma->vm_start < start) { ret = split_vma(mm, vma, start, 1); if (ret) break; + mas_pause(&mas); } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; + mas_pause(&mas); } next: /* @@ -1636,8 +1657,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -2056,20 +2077,11 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); } -SYSCALL_DEFINE1(userfaultfd, int, flags) +static int new_userfaultfd(int flags) { struct userfaultfd_ctx *ctx; int fd; - if (!sysctl_unprivileged_userfaultfd && - (flags & UFFD_USER_MODE_ONLY) == 0 && - !capable(CAP_SYS_PTRACE)) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); - return -EPERM; - } - BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ @@ -2094,7 +2106,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) mmgrab(ctx->mm); fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); @@ -2102,8 +2114,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) return fd; } +static inline bool userfaultfd_syscall_allowed(int flags) +{ + /* Userspace-only page faults are always allowed */ + if (flags & UFFD_USER_MODE_ONLY) + return true; + + /* + * The user is requesting a userfaultfd which can handle kernel faults. + * Privileged users are always allowed to do this. + */ + if (capable(CAP_SYS_PTRACE)) + return true; + + /* Otherwise, access to kernel fault handling is sysctl controlled. */ + return sysctl_unprivileged_userfaultfd; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + if (!userfaultfd_syscall_allowed(flags)) + return -EPERM; + + return new_userfaultfd(flags); +} + +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) +{ + if (cmd != USERFAULTFD_IOC_NEW) + return -EINVAL; + + return new_userfaultfd(flags); +} + +static const struct file_operations userfaultfd_dev_fops = { + .unlocked_ioctl = userfaultfd_dev_ioctl, + .compat_ioctl = userfaultfd_dev_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice userfaultfd_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "userfaultfd", + .fops = &userfaultfd_dev_fops +}; + static int __init userfaultfd_init(void) { + int ret; + + ret = misc_register(&userfaultfd_misc); + if (ret) + return ret; + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", sizeof(struct userfaultfd_ctx), 0, diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 629785c95007..c7fcb855e068 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -32,6 +32,11 @@ struct fsverity_hash_alg { unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 
64 for SHA-256 */ mempool_t req_pool; /* mempool with a preallocated hash request */ + /* + * The HASH_ALGO_* constant for this algorithm. This is different from + * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. + */ + enum hash_algo algo_id; }; /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ @@ -70,8 +75,6 @@ struct fsverity_info { const struct inode *inode; }; -/* Arbitrary limit to bound the kmalloc() size. Can be changed. */ -#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 #define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ sizeof(struct fsverity_descriptor)) diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 71d0fccb6d4c..6f8170cf4ae7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { .name = "sha256", .digest_size = SHA256_DIGEST_SIZE, .block_size = SHA256_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA256, }, [FS_VERITY_HASH_ALG_SHA512] = { .name = "sha512", .digest_size = SHA512_DIGEST_SIZE, .block_size = SHA512_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA512, }, }; @@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void) */ BUG_ON(!is_power_of_2(alg->digest_size)); BUG_ON(!is_power_of_2(alg->block_size)); + + /* Verify that there is a valid mapping to HASH_ALGO_*. */ + BUG_ON(alg->algo_id == 0); + BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]); } } diff --git a/fs/verity/measure.c b/fs/verity/measure.c index e99c00350c28..5c79ea1b2468 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); * @alg: (out) pointer to the hash algorithm enumeration * * Return the file hash algorithm and digest of an fsverity protected file. - * Assumption: before calling fsverity_get_digest(), the file must have been - * opened. + * Assumption: before calling this, the file must have been opened. 
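+ *
+ * A minimal usage sketch (hypothetical caller, for illustration only):
+ *
+ *	u8 digest[FS_VERITY_MAX_DIGEST_SIZE];
+ *	enum hash_algo alg;
+ *	int err = fsverity_get_digest(inode, digest, &alg);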
* * Return: 0 on success, -errno on failure */ @@ -76,27 +75,13 @@ int fsverity_get_digest(struct inode *inode, { const struct fsverity_info *vi; const struct fsverity_hash_alg *hash_alg; - int i; vi = fsverity_get_info(inode); if (!vi) return -ENODATA; /* not a verity file */ hash_alg = vi->tree_params.hash_alg; - memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE); - - /* convert the verity hash algorithm name to a hash_algo_name enum */ - i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name); - if (i < 0) - return -EINVAL; - *alg = i; - - if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg])) - return -EINVAL; memcpy(digest, vi->file_digest, hash_alg->digest_size); - - pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg], - hash_digest_size[*alg], digest); - + *alg = hash_alg->algo_id; return 0; } diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 6ee849dc7bc1..2aefc5565152 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -53,14 +53,14 @@ static int fsverity_read_merkle_tree(struct inode *inode, break; } - virt = kmap(page); + virt = kmap_local_page(page); if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { - kunmap(page); + kunmap_local(virt); put_page(page); err = -EFAULT; break; } - kunmap(page); + kunmap_local(virt); put_page(page); retval += bytes_to_copy; diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 14e2fb49cff5..961ba248021f 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -39,16 +39,6 @@ static void hash_at_level(const struct merkle_tree_params *params, (params->log_blocksize - params->log_arity); } -/* Extract a hash from a hash page */ -static void extract_hash(struct page *hpage, unsigned int hoffset, - unsigned int hsize, u8 *out) -{ - void *virt = kmap_atomic(hpage); - - memcpy(out, virt + hoffset, hsize); - kunmap_atomic(virt); -} - static inline int cmp_hashes(const struct fsverity_info *vi, const u8 *want_hash, const u8 *real_hash, pgoff_t index, int level) @@ -129,7 +119,7 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi, } if (PageChecked(hpage)) { - extract_hash(hpage, hoffset, hsize, _want_hash); + memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", @@ -158,7 +148,7 @@ descend: if (err) goto out; SetPageChecked(hpage); - extract_hash(hpage, hoffset, hsize, _want_hash); + memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); pr_debug("Verified hash page at level %d, now want %s:%*phN\n", @@ -210,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. Pages - * that fail verification are set to the Error state. Verification is skipped - * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * must be pagecache pages that are still locked and not yet uptodate. If a + * page fails verification, then bio->bi_status is set to an error status. * * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. 
Filesystems that @@ -254,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio) unsigned long level0_ra_pages = min(max_ra_pages, params->level0_blocks - level0_index); - if (!PageError(page) && - !verify_page(inode, vi, req, page, level0_ra_pages)) - SetPageError(page); + if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } fsverity_free_hash_request(params->hash_alg, req); diff --git a/fs/xattr.c b/fs/xattr.c index a1f4998bc6be..adab9a70b536 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -80,6 +80,31 @@ xattr_resolve_name(struct inode *inode, const char **name) return ERR_PTR(-EOPNOTSUPP); } +/** + * may_write_xattr - check whether inode allows writing xattr + * @mnt_userns: User namespace of the mount the inode was found from + * @inode: the inode on which to set an xattr + * + * Check whether the inode allows writing xattrs. Specifically, we can never + * set or remove an extended attribute on a read-only filesystem or on an + * immutable / append-only inode. + * + * We also need to ensure that the inode has a mapping in the mount to + * not risk writing back invalid i_{g,u}id values. + * + * Return: On success zero is returned. On error a negative errno is returned. + */ +int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) +{ + if (IS_IMMUTABLE(inode)) + return -EPERM; + if (IS_APPEND(inode)) + return -EPERM; + if (HAS_UNMAPPED_ID(mnt_userns, inode)) + return -EPERM; + return 0; +} + /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. @@ -88,20 +113,12 @@ static int xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, const char *name, int mask) { - /* - * We can never set or remove an extended attribute on a read-only - * filesystem or on an immutable / append-only inode. - */ if (mask & MAY_WRITE) { - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - /* - * Updating an xattr will likely cause i_uid and i_gid - * to be writen back improperly if their true value is - * unknown to the vfs. 
- */ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) - return -EPERM; + int ret; + + ret = may_write_xattr(mnt_userns, inode); + if (ret) + return ret; } /* @@ -172,6 +189,9 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, { const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -282,15 +302,9 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); -static inline bool is_posix_acl_xattr(const char *name) -{ - return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); -} - int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, void *value, size_t size, int flags) + const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -298,16 +312,12 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, - (const void **)&value, size); + error = cap_convert_nscap(mnt_userns, dentry, &value, size); if (error < 0) return error; size = error; } - if (size && is_posix_acl_xattr(name)) - posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); - retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -358,11 +368,12 @@ out_noalloc: * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, - * before retrieving the extended attribute. + * before retrieving the extended attribute. The xattr value buffer should + * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. 
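+ *
+ * A hedged usage sketch (illustrative only; error handling elided):
+ *
+ *	char *value = NULL;
+ *	int len = vfs_getxattr_alloc(mnt_userns, dentry, name, &value,
+ *				     0, GFP_NOFS);
+ *	...
+ *	kfree(value);
+ *
+ * where the kfree() runs unconditionally, even when len is negative.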
*/ -ssize_t +int vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) @@ -403,6 +414,9 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, { const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -441,10 +455,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - error = __vfs_getxattr(dentry, inode, name, value, size); - if (error > 0 && is_posix_acl_xattr(name)) - posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); - return error; + return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); @@ -475,6 +486,9 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -584,25 +598,19 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) return error; } -static void setxattr_convert(struct user_namespace *mnt_userns, - struct dentry *d, struct xattr_ctx *ctx) -{ - if (ctx->size && - ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) - posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); -} - -int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx) { - setxattr_convert(mnt_userns, dentry, ctx); - return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, + if (is_posix_acl_xattr(ctx->kname->name)) + return do_set_acl(idmap, dentry, ctx->kname->name, + ctx->kvalue, ctx->size); + + return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } static long -setxattr(struct user_namespace *mnt_userns, struct dentry *d, +setxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { @@ -620,7 +628,7 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error) return error; - error = do_setxattr(mnt_userns, d, &ctx); + error = do_setxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; @@ -639,7 +647,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(mnt_user_ns(path.mnt), path.dentry, name, + error = setxattr(mnt_idmap(path.mnt), path.dentry, name, value, size, flags); mnt_drop_write(path.mnt); } @@ -676,7 +684,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = setxattr(file_mnt_user_ns(f.file), + error = setxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size, flags); mnt_drop_write_file(f.file); @@ -689,7 +697,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, * Extended attribute GET operations */ ssize_t -do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, +do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx) { ssize_t error; @@ -703,11 +711,12 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, return -ENOMEM; } - error = vfs_getxattr(mnt_userns, d, kname, 
ctx->kvalue, ctx->size); + if (is_posix_acl_xattr(ctx->kname->name)) + error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); + else + error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname, + ctx->kvalue, ctx->size); if (error > 0) { - if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { @@ -720,7 +729,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, } static ssize_t -getxattr(struct user_namespace *mnt_userns, struct dentry *d, +getxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, void __user *value, size_t size) { ssize_t error; @@ -739,7 +748,7 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error < 0) return error; - error = do_getxattr(mnt_userns, d, &ctx); + error = do_getxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; @@ -755,7 +764,7 @@ retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; - error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); + error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -785,7 +794,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, + error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size); fdput(f); return error; @@ -870,7 +879,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) * Extended attribute REMOVE operations */ static long -removexattr(struct user_namespace *mnt_userns, struct dentry *d, +removexattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name) { int error; @@ -882,7 +891,10 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d, if (error < 0) return error; - return vfs_removexattr(mnt_userns, d, kname); + if (is_posix_acl_xattr(kname)) + return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname); + + return vfs_removexattr(mnt_idmap_owner(idmap), d, kname); } static int path_removexattr(const char __user *pathname, @@ -896,7 +908,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(mnt_user_ns(path.mnt), path.dentry, name); + error = removexattr(mnt_idmap(path.mnt), path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); @@ -929,7 +941,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = removexattr(file_mnt_user_ns(f.file), + error = removexattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } @@ -999,8 +1011,29 @@ const char *xattr_full_name(const struct xattr_handler *handler, } EXPORT_SYMBOL(xattr_full_name); -/* - * Allocate new xattr and copy in the value; but leave the name to callers. +/** + * free_simple_xattr - free an xattr object + * @xattr: the xattr object + * + * Free the xattr object. Can handle @xattr being NULL. 
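+ *
+ * Because NULL is tolerated, error paths can free unconditionally, e.g.
+ * (illustrative):
+ *
+ *	if (err)
+ *		free_simple_xattr(new_xattr);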
+ */ +static inline void free_simple_xattr(struct simple_xattr *xattr) +{ + if (xattr) + kfree(xattr->name); + kvfree(xattr); +} + +/** + * simple_xattr_alloc - allocate new xattr object + * @value: value of the xattr object + * @size: size of @value + * + * Allocate a new xattr object and initialize respective members. The caller is + * responsible for handling the name of the xattr. + * + * Return: On success a new xattr object is returned. On failure NULL is + * returned. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { @@ -1021,20 +1054,69 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) return new_xattr; } -/* - * xattr GET operation for in-memory/pseudo filesystems +/** + * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry + * @key: xattr name + * @node: current node + * + * Compare the xattr name with the xattr name attached to @node in the rbtree. + * + * Return: Negative value if continuing left, positive if continuing right, 0 + * if the xattr attached to @node matches @key. + */ +static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) +{ + const char *xattr_name = key; + const struct simple_xattr *xattr; + + xattr = rb_entry(node, struct simple_xattr, rb_node); + return strcmp(xattr->name, xattr_name); +} + +/** + * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes + * @new_node: new node + * @node: current node + * + * Compare the xattr attached to @new_node with the xattr attached to @node. + * + * Return: Negative value if continuing left, positive if continuing right, 0 + * if the xattr attached to @new_node matches the xattr attached to @node. + */ +static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, + const struct rb_node *node) +{ + struct simple_xattr *xattr; + xattr = rb_entry(new_node, struct simple_xattr, rb_node); + return rbtree_simple_xattr_cmp(xattr->name, node); +} + +/** + * simple_xattr_get - get an xattr object + * @xattrs: the header of the xattr object + * @name: the name of the xattr to retrieve + * @buffer: the buffer to store the value into + * @size: the size of @buffer + * + * Try to find and retrieve the xattr object associated with @name. + * If @buffer is provided store the value of @xattr in @buffer + * otherwise just return the length. The size of @buffer is limited + * to XATTR_SIZE_MAX which currently is 65536. + * + * Return: On success the length of the xattr value is returned. On error a + * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { - struct simple_xattr *xattr; + struct simple_xattr *xattr = NULL; + struct rb_node *rbp; int ret = -ENODATA; - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { - if (strcmp(name, xattr->name)) - continue; - + read_lock(&xattrs->lock); + rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); + if (rbp) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); ret = xattr->size; if (buffer) { if (size < xattr->size) @@ -1042,34 +1124,44 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, else memcpy(buffer, xattr->value, xattr->size); } - break; } - spin_unlock(&xattrs->lock); + read_unlock(&xattrs->lock); return ret; } /** - * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems - * @xattrs: target simple_xattr list - * @name: name of the extended attribute - * @value: value of the xattr. 
If %NULL, will remove the attribute. - * @size: size of the new xattr - * @flags: %XATTR_{CREATE|REPLACE} - * @removed_size: returns size of the removed xattr, -1 if none removed + * simple_xattr_set - set an xattr object + * @xattrs: the header of the xattr object + * @name: the name of the xattr to retrieve + * @value: the value to store along the xattr + * @size: the size of @value + * @flags: the flags determining how to set the xattr + * @removed_size: the size of the removed xattr + * + * Set a new xattr object. + * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE + * is specified in @flags a matching xattr object for @name must already exist. + * If it does it will be replaced with the new xattr object. If it doesn't we + * fail. If XATTR_CREATE is specified and a matching xattr does already exist + * we fail. If it doesn't we create a new xattr. If @flags is zero we simply + * insert the new xattr replacing any existing one. * - * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails - * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; - * otherwise, fails with -ENODATA. + * If @value is empty and a matching xattr object is found we delete it if + * XATTR_REPLACE is specified in @flags or @flags is zero. * - * Returns 0 on success, -errno on failure. + * If @value is empty and no matching xattr object for @name is found we do + * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For + * XATTR_REPLACE we fail as mentioned above. + * + * Return: On success zero and on error a negative error code is returned. */ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags, ssize_t *removed_size) { - struct simple_xattr *xattr; - struct simple_xattr *new_xattr = NULL; - int err = 0; + struct simple_xattr *xattr = NULL, *new_xattr = NULL; + struct rb_node *parent = NULL, **rbp; + int err = 0, ret; if (removed_size) *removed_size = -1; @@ -1082,42 +1174,68 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { - kvfree(new_xattr); + free_simple_xattr(new_xattr); return -ENOMEM; } } - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { - if (!strcmp(name, xattr->name)) { - if (flags & XATTR_CREATE) { - xattr = new_xattr; - err = -EEXIST; - } else if (new_xattr) { - list_replace(&xattr->list, &new_xattr->list); - if (removed_size) - *removed_size = xattr->size; - } else { - list_del(&xattr->list); - if (removed_size) - *removed_size = xattr->size; - } - goto out; - } - } - if (flags & XATTR_REPLACE) { - xattr = new_xattr; - err = -ENODATA; - } else { - list_add(&new_xattr->list, &xattrs->head); - xattr = NULL; + write_lock(&xattrs->lock); + rbp = &xattrs->rb_root.rb_node; + while (*rbp) { + parent = *rbp; + ret = rbtree_simple_xattr_cmp(name, *rbp); + if (ret < 0) + rbp = &(*rbp)->rb_left; + else if (ret > 0) + rbp = &(*rbp)->rb_right; + else + xattr = rb_entry(*rbp, struct simple_xattr, rb_node); + if (xattr) + break; } -out: - spin_unlock(&xattrs->lock); + if (xattr) { - kfree(xattr->name); - kvfree(xattr); + /* Fail if XATTR_CREATE is requested and the xattr exists. 
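+ * (Note that a zero @flags value also reaches this branch and simply
+ * replaces, or with a NULL value removes, the existing xattr, per the
+ * semantics documented above.)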
*/ + if (flags & XATTR_CREATE) { + err = -EEXIST; + goto out_unlock; + } + + if (new_xattr) + rb_replace_node(&xattr->rb_node, &new_xattr->rb_node, + &xattrs->rb_root); + else + rb_erase(&xattr->rb_node, &xattrs->rb_root); + if (!err && removed_size) + *removed_size = xattr->size; + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) { + err = -ENODATA; + goto out_unlock; + } + + /* + * If XATTR_CREATE or no flags are specified together with a + * new value simply insert it. + */ + if (new_xattr) { + rb_link_node(&new_xattr->rb_node, parent, rbp); + rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); + } + + /* + * If XATTR_CREATE or no flags are specified and neither an + * old or new xattr exist then we don't need to do anything. + */ } + +out_unlock: + write_unlock(&xattrs->lock); + if (err) + free_simple_xattr(new_xattr); + else + free_simple_xattr(xattr); return err; } @@ -1141,14 +1259,31 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size, return 0; } -/* - * xattr LIST operation for in-memory/pseudo filesystems +/** + * simple_xattr_list - list all xattr objects + * @inode: inode from which to get the xattrs + * @xattrs: the header of the xattr object + * @buffer: the buffer to store all xattrs into + * @size: the size of @buffer + * + * List all xattrs associated with @inode. If @buffer is NULL we returned + * the required size of the buffer. If @buffer is provided we store the + * xattrs value into it provided it is big enough. + * + * Note, the number of xattr names that can be listed with listxattr(2) is + * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed + * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names + * are found it will return -E2BIG. + * + * Return: On success the required size or the size of the copied xattrs is + * returned. On error a negative error code is returned. */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { - bool trusted = capable(CAP_SYS_ADMIN); + bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; + struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; @@ -1169,8 +1304,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, } #endif - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { + read_lock(&xattrs->lock); + for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; @@ -1179,18 +1316,76 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (err) break; } - spin_unlock(&xattrs->lock); + read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } -/* - * Adds an extended attribute to the list +/** + * rbtree_simple_xattr_less - compare two xattr rbtree nodes + * @new_node: new node + * @node: current node + * + * Compare the xattr attached to @new_node with the xattr attached to @node. + * Note that this function technically tolerates duplicate entries. + * + * Return: True if insertion point in the rbtree is found. 
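+ *
+ * (Usage sketch: rb_add() descends left wherever this returns true and
+ * right otherwise, linking the new node at the leaf it reaches:
+ *
+ *	rb_add(&new->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less);
+ *
+ * which is how simple_xattr_add() below uses it.)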
+ */ +static bool rbtree_simple_xattr_less(struct rb_node *new_node, + const struct rb_node *node) +{ + return rbtree_simple_xattr_node_cmp(new_node, node) < 0; +} + +/** + * simple_xattr_add - add xattr objects + * @xattrs: the header of the xattr object + * @new_xattr: the xattr object to add + * + * Add an xattr object to @xattrs. This assumes no replacement or removal + * of matching xattrs is wanted. Should only be called during inode + * initialization when a few distinct initial xattrs are supposed to be set. + */ +void simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr) +{ + write_lock(&xattrs->lock); + rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); + write_unlock(&xattrs->lock); +} + +/** + * simple_xattrs_init - initialize new xattr header + * @xattrs: header to initialize + * + * Initialize relevant fields of an xattr header. */ -void simple_xattr_list_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr) +void simple_xattrs_init(struct simple_xattrs *xattrs) { - spin_lock(&xattrs->lock); - list_add(&new_xattr->list, &xattrs->head); - spin_unlock(&xattrs->lock); + xattrs->rb_root = RB_ROOT; + rwlock_init(&xattrs->lock); +} + +/** + * simple_xattrs_free - free xattrs + * @xattrs: xattr header whose xattrs to destroy + * + * Destroy all xattrs in @xattrs. When this is called no one can hold a + * reference to any of the xattrs anymore. + */ +void simple_xattrs_free(struct simple_xattrs *xattrs) +{ + struct rb_node *rbp; + + rbp = rb_first(&xattrs->rb_root); + while (rbp) { + struct simple_xattr *xattr; + struct rb_node *rbp_next; + + rbp_next = rb_next(rbp); + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + rb_erase(&xattr->rb_node, &xattrs->rb_root); + free_simple_xattr(xattr); + rbp = rbp_next; + } } diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 517a138faa66..191b22b9a35b 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -133,6 +133,21 @@ xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) return true; } +static inline bool +xfs_verify_agbext( + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_agblock_t len) +{ + if (agbno + len <= agbno) + return false; + + if (!xfs_verify_agbno(pag, agbno)) + return false; + + return xfs_verify_agbno(pag, agbno + len - 1); +} + /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index e2bdf089c0a3..989cf341779b 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -263,11 +263,7 @@ xfs_alloc_get_rec( goto out_bad_rec; /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, *bno)) - goto out_bad_rec; - if (*bno > *bno + *len) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, *bno + *len - 1)) + if (!xfs_verify_agbext(pag, *bno, *len)) goto out_bad_rec; return 0; @@ -1520,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm.
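 *
 * (For illustration: get_random_u32_below(n) returns a value uniformly
 * distributed in [0, n), so get_random_u32_below(2) is an unbiased coin
 * flip replacing prandom_u32() & 1, and get_random_u32_below(n) avoids
 * the modulo bias of the old prandom_u32() % n pattern.)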
*/ - if (prandom_u32() & 1) + if (get_random_u32_below(2)) return 0; #endif diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e56723dc9cd5..0d56a8d862e8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -294,7 +294,7 @@ xfs_check_block( else thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); xfs_err(mp, "%s: ptrs are equal in node\n", @@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata( * the busy list. */ bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { bma->datatype |= XFS_ALLOC_USERDATA; if (bma->offset == 0) bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; @@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index eef27858a013..29c4b4ccb909 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -556,7 +556,6 @@ xfs_btree_islastblock( struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); - ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e7201dc68f43..e576560b46e9 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2192,8 +2192,8 @@ xfs_da_grow_inode_int( */ mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { - nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); + nmap = min(XFS_BMAP_MAX_NMAP, c); error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->total, &mapp[mapi], &nmap); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 76eedc2756b3..92bac3373f1f 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -261,7 +261,7 @@ xfs_dir_createname( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -357,7 +357,7 @@ xfs_dir_lookup( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -435,7 +435,7 @@ xfs_dir_removename( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -493,7 +493,7 @@ xfs_dir_replace( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -610,19 +610,23 @@ xfs_dir2_grow_inode( int xfs_dir2_isblock( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool 
*isblock) { - xfs_fileoff_t last; /* last file offset */ - int rval; + struct xfs_mount *mp = args->dp->i_mount; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (XFS_IS_CORRUPT(args->dp->i_mount, - rval != 0 && - args->dp->i_disk_size != args->geo->blksize)) + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isblock = false; + if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) + return 0; + + *isblock = true; + if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) return -EFSCORRUPTED; - *vp = rval; return 0; } @@ -632,14 +636,20 @@ xfs_dir2_isblock( int xfs_dir2_isleaf( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isleaf) { - xfs_fileoff_t last; /* last file offset */ - int rval; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - *vp = last == args->geo->leafblk + args->geo->fsbcount; + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isleaf = false; + if (eof != args->geo->leafblk + args->geo->fsbcount) + return 0; + + *isleaf = true; return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index b6df3c34b26a..dd39f17dd9a9 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -61,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index d9b66306a9a7..cb9e950a911d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -146,6 +146,8 @@ xfs_dir3_leaf_check_int( xfs_dir2_leaf_tail_t *ltp; int stale; int i; + bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC); ltp = xfs_dir2_leaf_tail_p(geo, leaf); @@ -158,8 +160,7 @@ xfs_dir3_leaf_check_int( return __this_address; /* Leaves and bests don't overlap in leaf format. 
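 *
 * (Simplified layout sketch: in a LEAF1 block the entry array grows up
 * from the header while the bests table and tail sit at the block end,
 *
 *	[ hdr | ents[0..count-1] -> free <- bests[] | ltp ]
 *
 * so &ents[count] must never cross xfs_dir2_leaf_bests_p(ltp).)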
*/ - if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || - hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + if (isleaf1 && (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address; @@ -175,6 +176,10 @@ xfs_dir3_leaf_check_int( } if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; + if (isleaf1 && xfs_dir2_dataptr_to_db(geo, + be32_to_cpu(hdr->ents[i].address)) >= + be32_to_cpu(ltp->bestcount)) + return __this_address; } if (hdr->stale != stale) return __this_address; diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 003812fd7d35..8cd37e6e9d38 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -865,7 +865,6 @@ xfs_dir2_sf_lookup( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ - int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ @@ -929,8 +928,7 @@ xfs_dir2_sf_lookup( if (!ci_sfep) return -ENOENT; /* otherwise process the CI match as required by the caller */ - error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); - return error; + return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); } /* diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 5362908164b0..01a9e86b3037 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -40,13 +40,12 @@ #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 + /* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. + * Drop-writes support removed because write error handling cannot trash + * pre-existing delalloc extents in any useful way anymore. We retain the + * definition so that we can reject it as an invalid value in + * xfs_errortag_valid(). */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 @@ -62,7 +61,9 @@ #define XFS_ERRTAG_LARP 39 #define XFS_ERRTAG_DA_LEAF_SPLIT 40 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 -#define XFS_ERRTAG_MAX 42 +#define XFS_ERRTAG_WB_DELAY_MS 42 +#define XFS_ERRTAG_WRITE_DELAY_MS 43 +#define XFS_ERRTAG_MAX 44 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
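 *
 * (Aside on XFS_ERRTAG_DROP_WRITES above, not part of this diff: the
 * comment says the tag is retained only so it can be rejected. A
 * plausible shape for that check in xfs_errortag_valid() — assumed,
 * not quoted from the patch — is:
 *
 *	static bool xfs_errortag_valid(unsigned int error_tag)
 *	{
 *		if (error_tag >= XFS_ERRTAG_MAX)
 *			return false;
 *		if (error_tag == XFS_ERRTAG_DROP_WRITES)
 *			return false;	// retired tag, see above
 *		return true;
 *	}
 *
 * so userspace can no longer arm it.)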
@@ -95,7 +96,6 @@ #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 #define XFS_RANDOM_BMAP_FINISH_ONE 1 #define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 @@ -109,5 +109,7 @@ #define XFS_RANDOM_LARP 1 #define XFS_RANDOM_DA_LEAF_SPLIT 1 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 +#define XFS_RANDOM_WB_DELAY_MS 3000 +#define XFS_RANDOM_WRITE_DELAY_MS 3000 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b55bdfa9c8a8..371dc07233e0 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1564,20 +1564,6 @@ struct xfs_rmap_rec { #define RMAPBT_UNUSED_OFFSET_BITLEN 7 #define RMAPBT_OFFSET_BITLEN 54 -#define XFS_RMAP_ATTR_FORK (1 << 0) -#define XFS_RMAP_BMBT_BLOCK (1 << 1) -#define XFS_RMAP_UNWRITTEN (1 << 2) -#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ - XFS_RMAP_BMBT_BLOCK) -#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) -struct xfs_rmap_irec { - xfs_agblock_t rm_startblock; /* extent start block */ - xfs_extlen_t rm_blockcount; /* extent length */ - uint64_t rm_owner; /* extent owner */ - uint64_t rm_offset; /* offset within the owner */ - unsigned int rm_flags; /* state flags */ -}; - /* * Key structure * @@ -1626,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp); * on the startblock. This speeds up mount time deletion of stale * staging extents because they're all at the right side of the tree. */ -#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31)) +#define XFS_REFC_COWFLAG (1U << 31) #define REFCNTBT_COWFLAG_BITLEN 1 #define REFCNTBT_AGBLOCK_BITLEN 31 @@ -1640,12 +1626,6 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -struct xfs_refcount_irec { - xfs_agblock_t rc_startblock; /* starting block number */ - xfs_extlen_t rc_blockcount; /* count of free blocks */ - xfs_nlink_t rc_refcount; /* number of inodes linked here */ -}; - #define MAXREFCOUNT ((xfs_nlink_t)~0U) #define MAXREFCEXTLEN ((xfs_extlen_t)~0U) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 6cdfd64bc56b..5118dedf9267 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc( /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32() & 1; + do_sparse = get_random_u32_below(2); #endif /* @@ -805,7 +805,7 @@ sparse_alloc: * number from being easily guessable. 
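 *
 * (Aside, not part of this diff: the prandom_u32() conversions in this
 * series are drop-in replacements that draw from the kernel CRNG:
 *
 *	do_sparse = get_random_u32_below(2);	// uniform 0 or 1; was
 *						// prandom_u32() & 1
 *	gen = get_random_u32();			// full 32-bit value; was
 *						// prandom_u32()
 *
 * "gen" here is illustrative; the real call feeds the value straight
 * into xfs_ialloc_inode_init() below.)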
*/ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, - args.agbno, args.len, prandom_u32()); + args.agbno, args.len, get_random_u32()); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9327a4f39206..6b21760184d9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -78,7 +78,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %zd).", + "corrupt inode %llu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -192,7 +192,7 @@ xfs_iformat_btree( XFS_DFORK_SIZE(dip, mp, whichfork) || ifp->if_nextents > ip->i_nblocks) || level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { - xfs_warn(mp, "corrupt inode %Lu (btree).", + xfs_warn(mp, "corrupt inode %llu (btree).", (unsigned long long) ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b351b9dc6561..f13e0809dc63 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -613,25 +613,49 @@ typedef struct xfs_efi_log_format { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ + xfs_extent_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_t; +static inline size_t +xfs_efi_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ + xfs_extent_32_t efi_extents[]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; +static inline size_t +xfs_efi_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ + xfs_extent_64_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_64_t; +static inline size_t +xfs_efi_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * This is the structure used to lay out an efd log item in the * log. 
The efd_extents array is a variable size array whose @@ -642,25 +666,49 @@ typedef struct xfs_efd_log_format { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ + xfs_extent_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_t; +static inline size_t +xfs_efd_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ + xfs_extent_32_t efd_extents[]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; +static inline size_t +xfs_efd_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ + xfs_extent_64_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_64_t; +static inline size_t +xfs_efd_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * RUI/RUD (reverse mapping) log format definitions */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 64b910caafaa..6f7ed9288fe4 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -46,13 +46,16 @@ STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, int xfs_refcount_lookup_le( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); } @@ -63,13 +66,16 @@ xfs_refcount_lookup_le( int xfs_refcount_lookup_ge( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } @@ -80,13 +86,16 @@ xfs_refcount_lookup_ge( int xfs_refcount_lookup_eq( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); } @@ -96,7 +105,17 @@ xfs_refcount_btrec_to_irec( const 
union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) { - irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + uint32_t start; + + start = be32_to_cpu(rec->refc.rc_startblock); + if (start & XFS_REFC_COWFLAG) { + start &= ~XFS_REFC_COWFLAG; + irec->rc_domain = XFS_REFC_DOMAIN_COW; + } else { + irec->rc_domain = XFS_REFC_DOMAIN_SHARED; + } + + irec->rc_startblock = start; irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); } @@ -114,7 +133,6 @@ xfs_refcount_get_rec( struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; - xfs_agblock_t realstart; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !*stat) @@ -124,22 +142,11 @@ xfs_refcount_get_rec( if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; - /* handle special COW-staging state */ - realstart = irec->rc_startblock; - if (realstart & XFS_REFC_COW_START) { - if (irec->rc_refcount != 1) - goto out_bad_rec; - realstart &= ~XFS_REFC_COW_START; - } else if (irec->rc_refcount < 2) { + if (!xfs_refcount_check_domain(irec)) goto out_bad_rec; - } /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, realstart)) - goto out_bad_rec; - if (realstart > realstart + irec->rc_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) goto out_bad_rec; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) @@ -169,12 +176,17 @@ xfs_refcount_update( struct xfs_refcount_irec *irec) { union xfs_btree_rec rec; + uint32_t start; int error; trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); - rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec.refc.rc_startblock = cpu_to_be32(start); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, @@ -196,9 +208,12 @@ xfs_refcount_insert( int error; trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + cur->bc_rec.rc.rc_domain = irec->rc_domain; + error = xfs_btree_insert(cur, i); if (error) goto out_error; @@ -244,7 +259,8 @@ xfs_refcount_delete( } if (error) goto out_error; - error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); + error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock, + &found_rec); out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, @@ -343,6 +359,7 @@ xfs_refc_next( STATIC int xfs_refcount_split_extent( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t agbno, bool *shape_changed) { @@ -351,7 +368,7 @@ xfs_refcount_split_extent( int error; *shape_changed = false; - error = xfs_refcount_lookup_le(cur, agbno, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec); if (error) goto out_error; if (!found_rec) @@ -364,6 +381,8 @@ xfs_refcount_split_extent( error = -EFSCORRUPTED; goto out_error; } + if (rcext.rc_domain != domain) + return 0; if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) return 0; @@ -415,6 +434,9 @@ xfs_refcount_merge_center_extents( 
trace_xfs_refcount_merge_center_extents(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, center, right); + ASSERT(left->rc_domain == center->rc_domain); + ASSERT(right->rc_domain == center->rc_domain); + /* * Make sure the center and right extents are not in the btree. * If the center extent was synthesized, the first delete call @@ -423,8 +445,8 @@ xfs_refcount_merge_center_extents( * call removes the center and the second one removes the right * extent. */ - error = xfs_refcount_lookup_ge(cur, center->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_ge(cur, center->rc_domain, + center->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -451,8 +473,8 @@ xfs_refcount_merge_center_extents( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -491,10 +513,12 @@ xfs_refcount_merge_left_extent( trace_xfs_refcount_merge_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft); + ASSERT(left->rc_domain == cleft->rc_domain); + /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cleft->rc_domain, + cleft->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -512,8 +536,8 @@ xfs_refcount_merge_left_extent( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -552,13 +576,15 @@ xfs_refcount_merge_right_extent( trace_xfs_refcount_merge_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right); + ASSERT(right->rc_domain == cright->rc_domain); + /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, * remove it. */ if (cright->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cright->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cright->rc_domain, + cright->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -576,8 +602,8 @@ xfs_refcount_merge_right_extent( } /* Enlarge the right extent. */ - error = xfs_refcount_lookup_le(cur, right->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, right->rc_domain, + right->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -600,8 +626,6 @@ out_error: return error; } -#define XFS_FIND_RCEXT_SHARED 1 -#define XFS_FIND_RCEXT_COW 2 /* * Find the left extent and the one after it (cleft). This function assumes * that we've already split any extent crossing agbno. 
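 *
 * (Illustrative layout for an adjustment range [agbno, agbno + aglen),
 * not part of this diff:
 *
 *	             agbno                agbno + aglen
 *	               |                        |
 *	   [ left ... ][ cleft ... ][ ... ]
 *
 * "left" must end exactly at agbno, and "cleft" covers the start of
 * the range; if no record of the requested domain covers those blocks,
 * cleft is synthesized below with rc_refcount == 1.)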
@@ -611,16 +635,16 @@ xfs_refcount_find_left_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *left, struct xfs_refcount_irec *cleft, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec); if (error) goto out_error; if (!found_rec) @@ -634,11 +658,9 @@ xfs_refcount_find_left_extents( goto out_error; } - if (xfs_refc_next(&tmp) != agbno) - return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (xfs_refc_next(&tmp) != agbno) return 0; /* We have a left extent; retrieve (or invent) the next right one */ *left = tmp; @@ -655,6 +677,9 @@ xfs_refcount_find_left_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp starts at the end of our range, just use that */ if (tmp.rc_startblock == agbno) *cleft = tmp; @@ -671,8 +696,10 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = min(aglen, tmp.rc_startblock - agbno); cleft->rc_refcount = 1; + cleft->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. @@ -680,6 +707,7 @@ xfs_refcount_find_left_extents( cleft->rc_startblock = agbno; cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; + cleft->rc_domain = domain; } trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft, agbno); @@ -700,16 +728,16 @@ xfs_refcount_find_right_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec); if (error) goto out_error; if (!found_rec) @@ -723,11 +751,9 @@ xfs_refcount_find_right_extents( goto out_error; } - if (tmp.rc_startblock != agbno + aglen) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) - return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (tmp.rc_startblock != agbno + aglen) return 0; /* We have a right extent; retrieve (or invent) the next left one */ *right = tmp; @@ -744,6 +770,9 @@ xfs_refcount_find_right_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp ends at the end of our range, just use that */ if (xfs_refc_next(&tmp) == agbno + aglen) *cright = tmp; @@ -760,8 +789,10 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = right->rc_startblock - cright->rc_startblock; cright->rc_refcount = 1; + cright->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. @@ -769,6 +800,7 @@ xfs_refcount_find_right_extents( cright->rc_startblock = agbno; cright->rc_blockcount = aglen; cright->rc_refcount = 1; + cright->rc_domain = domain; } trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right, agbno + aglen); @@ -783,21 +815,146 @@ out_error: /* Is this extent valid? 
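 *
 * ("Valid" means the find_left/find_right helpers above actually
 * filled the record in: they preset rc_startblock to NULLAGBLOCK when
 * no neighbouring extent exists, so the merge predicates below can
 * simply test, e.g.:
 *
 *	if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft))
 *		return false;	// missing shoulder, cannot merge left
 *
 * — an illustrative reading of the checks, not new code.)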
*/ static inline bool xfs_refc_valid( - struct xfs_refcount_irec *rc) + const struct xfs_refcount_irec *rc) { return rc->rc_startblock != NULLAGBLOCK; } +static inline xfs_nlink_t +xfs_refc_merge_refcount( + const struct xfs_refcount_irec *irec, + enum xfs_refc_adjust_op adjust) +{ + /* Once a record hits MAXREFCOUNT, it is pinned there forever */ + if (irec->rc_refcount == MAXREFCOUNT) + return MAXREFCOUNT; + return irec->rc_refcount + adjust; +} + +static inline bool +xfs_refc_want_merge_center( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + bool cleft_is_cright, + enum xfs_refc_adjust_op adjust, + unsigned long long *ulenp) +{ + unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; + + /* + * To merge with a center record, both shoulder records must be + * adjacent to the record we want to adjust. This is only true if + * find_left and find_right made all four records valid. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(right) || + !xfs_refc_valid(cleft) || !xfs_refc_valid(cright)) + return false; + + /* There must only be one record for the entire range. */ + if (!cleft_is_cright) + return false; + + /* The shoulder record refcounts must match the new refcount. */ + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) + return false; + if (right->rc_refcount != new_refcount) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cleft->rc_blockcount + right->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + *ulenp = ulen; + return true; +} + +static inline bool +xfs_refc_want_merge_left( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; + + /* + * For a left merge, the left shoulder record must be adjacent to the + * start of the range. If this is true, find_left made left and cleft + * contain valid contents. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft)) + return false; + + /* Left shoulder record refcount must match the new refcount. */ + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cleft->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + +static inline bool +xfs_refc_want_merge_right( + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = right->rc_blockcount; + xfs_nlink_t new_refcount; + + /* + * For a right merge, the right shoulder record must be adjacent to the + * end of the range. If this is true, find_right made cright and right + * contain valid contents. + */ + if (!xfs_refc_valid(right) || !xfs_refc_valid(cright)) + return false; + + /* Right shoulder record refcount must match the new refcount. 
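+ *
+ * (Worked example, not part of this patch: with an increase adjustment
+ * and a shoulder already pinned at MAXREFCOUNT,
+ * xfs_refc_merge_refcount() above keeps returning MAXREFCOUNT rather
+ * than wrapping —
+ *
+ *	cright->rc_refcount == MAXREFCOUNT
+ *	new_refcount = xfs_refc_merge_refcount(cright, adjust); // pinned
+ *
+ * so the merge proceeds only if the right shoulder is pinned there
+ * too.)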
*/ + new_refcount = xfs_refc_merge_refcount(cright, adjust); + if (right->rc_refcount != new_refcount) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cright->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + /* * Try to merge with any extents on the boundaries of the adjustment range. */ STATIC int xfs_refcount_merge_extents( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t *agbno, xfs_extlen_t *aglen, enum xfs_refc_adjust_op adjust, - int flags, bool *shape_changed) { struct xfs_refcount_irec left = {0}, cleft = {0}; @@ -812,12 +969,12 @@ xfs_refcount_merge_extents( * just below (agbno + aglen) [cright], and just above (agbno + aglen) * [right]. */ - error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, - *aglen, flags); + error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain, + *agbno, *aglen); if (error) return error; - error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, - *aglen, flags); + error = xfs_refcount_find_right_extents(cur, &right, &cright, domain, + *agbno, *aglen); if (error) return error; @@ -829,23 +986,15 @@ xfs_refcount_merge_extents( (cleft.rc_blockcount == cright.rc_blockcount); /* Try to merge left, cleft, and right. cleft must == cright. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + - right.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && - xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && - left.rc_refcount == cleft.rc_refcount + adjust && - right.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal, + adjust, &ulen)) { *shape_changed = true; return xfs_refcount_merge_center_extents(cur, &left, &cleft, &right, ulen, aglen); } /* Try to merge left and cleft. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && - left.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_left(&left, &cleft, adjust)) { *shape_changed = true; error = xfs_refcount_merge_left_extent(cur, &left, &cleft, agbno, aglen); @@ -861,16 +1010,13 @@ xfs_refcount_merge_extents( } /* Try to merge cright and right. 
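 *
 * (Worked overflow example for the length checks in the helpers above,
 * not part of this diff: two records of 0x80000000 blocks each sum to
 * 0x100000000, which wraps to 0 in a u32 but is caught by the ULL
 * accumulation:
 *
 *	unsigned long long ulen = 0x80000000ULL + 0x80000000ULL;
 *	if (ulen >= MAXREFCEXTLEN)	// 0x100000000 >= 0xffffffff
 *		return false;		// refuse the merge
 *
 * — hence the unsigned long long in all three predicates.)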
*/ - ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; - if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && - right.rc_refcount == cright.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_right(&cright, &right, adjust)) { *shape_changed = true; return xfs_refcount_merge_right_extent(cur, &right, &cright, aglen); } - return error; + return 0; } /* @@ -933,7 +1079,8 @@ xfs_refcount_adjust_extents( if (*aglen == 0) return 0; - error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno, + &found_rec); if (error) goto out_error; @@ -941,10 +1088,11 @@ xfs_refcount_adjust_extents( error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; - if (!found_rec) { + if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_SHARED; } /* @@ -957,6 +1105,8 @@ xfs_refcount_adjust_extents( tmp.rc_blockcount = min(*aglen, ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; + tmp.rc_domain = XFS_REFC_DOMAIN_SHARED; + trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &tmp); @@ -986,15 +1136,30 @@ xfs_refcount_adjust_extents( (*agbno) += tmp.rc_blockcount; (*aglen) -= tmp.rc_blockcount; - error = xfs_refcount_lookup_ge(cur, *agbno, + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* Move the cursor to the start of ext. */ + error = xfs_refcount_lookup_ge(cur, + XFS_REFC_DOMAIN_SHARED, *agbno, &found_rec); if (error) goto out_error; } - /* Stop if there's nothing left to modify */ - if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) - break; + /* + * A previous step trimmed agbno/aglen such that the end of the + * range would not be in the middle of the record. If this is + * no longer the case, something is seriously wrong with the + * btree. Make sure we never feed the synthesized record into + * the processing loop below. + */ + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) || + XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) { + error = -EFSCORRUPTED; + goto out_error; + } /* * Adjust the reference count and either update the tree @@ -1070,13 +1235,15 @@ xfs_refcount_adjust( /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno + aglen, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1085,8 +1252,8 @@ xfs_refcount_adjust( /* * Try to merge with the left or right extents of the range. */ - error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, - XFS_FIND_RCEXT_SHARED, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, + new_agbno, new_aglen, adj, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1125,6 +1292,32 @@ xfs_refcount_finish_one_cleanup( } /* + * Set up a continuation a deferred refcount operation by updating the intent. + * Checks to make sure we're not going to run off the end of the AG. 
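+ *
+ * (Caller-side sketch, not quoted from this patch: when
+ * xfs_refcount_finish_one() below returns with *new_len > 0, the
+ * deferred-op code is expected to requeue the remainder, roughly:
+ *
+ *	refc->ri_startblock = new_fsb;	// resume point for the rest
+ *	return -EAGAIN;			// ask the defer code to retry
+ *
+ * with "refc" the pending xfs_refcount_intent.)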
+ */ +static inline int +xfs_refcount_continue_op( + struct xfs_btree_cur *cur, + xfs_fsblock_t startblock, + xfs_agblock_t new_agbno, + xfs_extlen_t new_len, + xfs_fsblock_t *new_fsbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; + + if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) + return -EFSCORRUPTED; + + *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + + ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); + ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); + + return 0; +} + +/* * Process one of the deferred refcount operations. We pass back the * btree cursor to maintain our lock on the btree between calls. * This saves time and eliminates a buffer deadlock between the @@ -1191,12 +1384,20 @@ xfs_refcount_finish_one( case XFS_REFCOUNT_INCREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_INCREASE); - *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_DECREASE); - *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_ALLOC_COW: *new_fsb = startblock + blockcount; @@ -1307,7 +1508,8 @@ xfs_refcount_find_shared( *flen = 0; /* Try to find a refcount extent that crosses the start */ - error = xfs_refcount_lookup_le(cur, agbno, &have); + error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno, + &have); if (error) goto out_error; if (!have) { @@ -1325,6 +1527,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; /* If the extent ends before the start, look at the next one */ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { @@ -1340,6 +1544,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; } /* If the extent starts after the range we want, bail out */ @@ -1371,7 +1577,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } - if (tmp.rc_startblock >= agbno + aglen || + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED || + tmp.rc_startblock >= agbno + aglen || tmp.rc_startblock != *fbno + *flen) break; *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); @@ -1455,17 +1662,23 @@ xfs_refcount_adjust_cow_extents( return 0; /* Find any overlapping refcount records */ - error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno, + &found_rec); if (error) goto out_error; error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec && + ext.rc_domain != XFS_REFC_DOMAIN_COW)) { + error = -EFSCORRUPTED; + goto out_error; + } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + - XFS_REFC_COW_START; + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_COW; } switch (adj) { @@ -1480,6 +1693,8 @@ xfs_refcount_adjust_cow_extents( tmp.rc_startblock = agbno; tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; + tmp.rc_domain = XFS_REFC_DOMAIN_COW; + 
trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &tmp); @@ -1542,24 +1757,24 @@ xfs_refcount_adjust_cow( bool shape_changed; int error; - agbno += XFS_REFC_COW_START; - /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno, &shape_changed); if (error) goto out_error; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno + aglen, &shape_changed); if (error) goto out_error; /* * Try to merge with the left or right extents of the range. */ - error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, - XFS_FIND_RCEXT_COW, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno, + &aglen, adj, &shape_changed); if (error) goto out_error; @@ -1666,10 +1881,18 @@ xfs_refcount_recover_extent( be32_to_cpu(rec->refc.rc_refcount) != 1)) return -EFSCORRUPTED; - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); + rr = kmalloc(sizeof(struct xfs_refcount_recovery), + GFP_KERNEL | __GFP_NOFAIL); + INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - list_add_tail(&rr->rr_list, debris); + if (XFS_IS_CORRUPT(cur->bc_mp, + rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { + kfree(rr); + return -EFSCORRUPTED; + } + + list_add_tail(&rr->rr_list, debris); return 0; } @@ -1687,10 +1910,11 @@ xfs_refcount_recover_cow_leftovers( union xfs_btree_irec low; union xfs_btree_irec high; xfs_fsblock_t fsb; - xfs_agblock_t agbno; int error; - if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); + if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) return -EOPNOTSUPP; INIT_LIST_HEAD(&debris); @@ -1717,7 +1941,7 @@ xfs_refcount_recover_cow_leftovers( /* Find all the leftover CoW staging extents. 
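 *
 * (Aside, not part of this diff: the BUILD_BUG_ON above holds because
 * a V5 AG is capped well below 2^31 blocks, so bit 31 of any valid
 * agbno is clear and XFS_REFC_COWFLAG (1U << 31) is free to carry the
 * domain on disk. For example, agbno 0x100 in the CoW domain is stored
 * as
 *
 *	0x80000100	// XFS_REFC_COWFLAG | 0x100
 *
 * which is why all CoW staging records sort to the right edge of the
 * tree, as the xfs_format.h comment earlier in this diff notes.)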
*/ memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); - low.rc.rc_startblock = XFS_REFC_COW_START; + low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW; high.rc.rc_startblock = -1U; error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); @@ -1738,8 +1962,8 @@ xfs_refcount_recover_cow_leftovers( &rr->rr_rrec); /* Free the orphan record */ - agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; - fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno); + fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, + rr->rr_rrec.rc_startblock); xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); @@ -1751,7 +1975,7 @@ xfs_refcount_recover_cow_leftovers( goto out_free; list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; @@ -1761,7 +1985,7 @@ out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; } @@ -1770,6 +1994,7 @@ out_free: int xfs_refcount_has_record( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, bool *exists) @@ -1781,6 +2006,7 @@ xfs_refcount_has_record( low.rc.rc_startblock = bno; memset(&high, 0xFF, sizeof(high)); high.rc.rc_startblock = bno + len - 1; + low.rc.rc_domain = high.rc.rc_domain = domain; return xfs_btree_has_record(cur, &low, &high, exists); } diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index e8b322de7f3d..452f30556f5a 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -14,14 +14,33 @@ struct xfs_bmbt_irec; struct xfs_refcount_irec; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); +static inline uint32_t +xfs_refcount_encode_startblock( + xfs_agblock_t startblock, + enum xfs_refc_domain domain) +{ + uint32_t start; + + /* + * low level btree operations need to handle the generic btree range + * query functions (which set rc_domain == -1U), so we check that the + * domain is /not/ shared. + */ + start = startblock & ~XFS_REFC_COWFLAG; + if (domain != XFS_REFC_DOMAIN_SHARED) + start |= XFS_REFC_COWFLAG; + + return start; +} + enum xfs_refcount_intent_type { XFS_REFCOUNT_INCREASE = 1, XFS_REFCOUNT_DECREASE, @@ -36,6 +55,18 @@ struct xfs_refcount_intent { xfs_fsblock_t ri_startblock; }; +/* Check that the refcount is appropriate for the record domain. 
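+ *
+ * (Orientation, not new code: the decode counterpart of the encode
+ * helper above lives in xfs_refcount_btrec_to_irec() earlier in this
+ * diff —
+ *
+ *	start = be32_to_cpu(rec->refc.rc_startblock);
+ *	irec->rc_domain = (start & XFS_REFC_COWFLAG) ?
+ *			XFS_REFC_DOMAIN_COW : XFS_REFC_DOMAIN_SHARED;
+ *	irec->rc_startblock = start & ~XFS_REFC_COWFLAG;
+ *
+ * so domain plus 31-bit startblock round-trip through one be32.)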
*/ +static inline bool +xfs_refcount_check_domain( + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1) + return false; + if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2) + return false; + return true; +} + void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, @@ -79,7 +110,8 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, #define XFS_REFCOUNT_ITEM_OVERHEAD 32 extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, - xfs_agblock_t bno, xfs_extlen_t len, bool *exists); + enum xfs_refc_domain domain, xfs_agblock_t bno, + xfs_extlen_t len, bool *exists); union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 316c1ec0c3c2..e1f789866683 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -13,6 +13,7 @@ #include "xfs_btree.h" #include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" +#include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -160,7 +161,12 @@ xfs_refcountbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { - rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); } @@ -182,10 +188,13 @@ xfs_refcountbt_key_diff( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { - struct xfs_refcount_irec *rec = &cur->bc_rec.rc; const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; - return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; } STATIC int64_t diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 094dfc897ebc..b56aca1e7c66 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -235,13 +235,8 @@ xfs_rmap_get_rec( goto out_bad_rec; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, irec->rm_startblock)) - goto out_bad_rec; - if (irec->rm_startblock > - irec->rm_startblock + irec->rm_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, - irec->rm_startblock + irec->rm_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rm_startblock, + irec->rm_blockcount)) goto out_bad_rec; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a20cade590e9..1eeecf2eb2a7 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -972,7 +972,9 @@ xfs_log_sb( */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_ifree = min_t(uint64_t, + percpu_counter_sum(&mp->m_ifree), + mp->m_sb.sb_icount); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); } diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 
2c4ad6e4bb14..5b2f27cbdb80 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -422,7 +422,7 @@ xfs_calc_itruncate_reservation_minlogsize( /* * In renaming a files we can modify: - * the four inodes involved: 4 * inode size + * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets @@ -437,7 +437,7 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 4) + + max((xfs_calc_inode_res(mp, 5) + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index a6b7d98cf68f..5ebdda7e1078 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -166,6 +166,36 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +enum xfs_refc_domain { + XFS_REFC_DOMAIN_SHARED = 0, + XFS_REFC_DOMAIN_COW, +}; + +#define XFS_REFC_DOMAIN_STRINGS \ + { XFS_REFC_DOMAIN_SHARED, "shared" }, \ + { XFS_REFC_DOMAIN_COW, "cow" } + +struct xfs_refcount_irec { + xfs_agblock_t rc_startblock; /* starting block number */ + xfs_extlen_t rc_blockcount; /* count of free blocks */ + xfs_nlink_t rc_refcount; /* number of inodes linked here */ + enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */ +}; + +#define XFS_RMAP_ATTR_FORK (1 << 0) +#define XFS_RMAP_BMBT_BLOCK (1 << 1) +#define XFS_RMAP_UNWRITTEN (1 << 2) +#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ + XFS_RMAP_BMBT_BLOCK) +#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) +struct xfs_rmap_irec { + xfs_agblock_t rm_startblock; /* extent start block */ + xfs_extlen_t rm_blockcount; /* extent length */ + uint64_t rm_owner; /* extent owner */ + uint64_t rm_offset; /* offset within the owner */ + unsigned int rm_flags; /* state flags */ +}; + /* per-AG block reservation types */ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index b7b838bd4ba4..4dd52b15f09c 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -609,9 +609,16 @@ out: /* AGFL */ struct xchk_agfl_info { - unsigned int sz_entries; + /* Number of AGFL entries that the AGF claims are in use. */ + unsigned int agflcount; + + /* Number of AGFL entries that we found. */ unsigned int nr_entries; + + /* Buffer to hold AGFL entries for extent checking. */ xfs_agblock_t *entries; + + struct xfs_buf *agfl_bp; struct xfs_scrub *sc; }; @@ -641,10 +648,10 @@ xchk_agfl_block( struct xfs_scrub *sc = sai->sc; if (xfs_verify_agbno(sc->sa.pag, agbno) && - sai->nr_entries < sai->sz_entries) + sai->nr_entries < sai->agflcount) sai->entries[sai->nr_entries++] = agbno; else - xchk_block_set_corrupt(sc, sc->sa.agfl_bp); + xchk_block_set_corrupt(sc, sai->agfl_bp); xchk_agfl_block_xref(sc, agbno); @@ -696,19 +703,26 @@ int xchk_agfl( struct xfs_scrub *sc) { - struct xchk_agfl_info sai; + struct xchk_agfl_info sai = { + .sc = sc, + }; struct xfs_agf *agf; xfs_agnumber_t agno = sc->sm->sm_agno; - unsigned int agflcount; unsigned int i; int error; + /* Lock the AGF and AGI so that nobody can touch this AG. 
*/ error = xchk_ag_read_headers(sc, agno, &sc->sa); if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) - goto out; + return error; if (!sc->sa.agf_bp) return -EFSCORRUPTED; - xchk_buffer_recheck(sc, sc->sa.agfl_bp); + + /* Try to read the AGFL, and verify its structure if we get it. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp); + if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + return error; + xchk_buffer_recheck(sc, sai.agfl_bp); xchk_agfl_xref(sc); @@ -717,24 +731,21 @@ xchk_agfl( /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = sc->sa.agf_bp->b_addr; - agflcount = be32_to_cpu(agf->agf_flcount); - if (agflcount > xfs_agfl_size(sc->mp)) { + sai.agflcount = be32_to_cpu(agf->agf_flcount); + if (sai.agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } - memset(&sai, 0, sizeof(sai)); - sai.sc = sc; - sai.sz_entries = agflcount; - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, - KM_MAYFAIL); + sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), + XCHK_GFP_FLAGS); if (!sai.entries) { error = -ENOMEM; goto out; } /* Check the blocks in the AGFL. */ - error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, - sc->sa.agfl_bp, xchk_agfl_block, &sai); + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp, + xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; goto out_free; @@ -742,7 +753,7 @@ xchk_agfl( if (error) goto out_free; - if (agflcount != sai.nr_entries) { + if (sai.agflcount != sai.nr_entries) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out_free; } @@ -758,7 +769,7 @@ xchk_agfl( } out_free: - kmem_free(sai.entries); + kvfree(sai.entries); out: return error; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 1b0b4e243f77..d75d82151eeb 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -442,12 +442,18 @@ out_revert: /* AGFL */ struct xrep_agfl { + /* Bitmap of alleged AGFL blocks that we're not going to add. */ + struct xbitmap crossed; + /* Bitmap of other OWN_AG metadata blocks. */ struct xbitmap agmetablocks; /* Bitmap of free space. */ struct xbitmap *freesp; + /* rmapbt cursor for finding crosslinked blocks */ + struct xfs_btree_cur *rmap_cur; + struct xfs_scrub *sc; }; @@ -477,6 +483,41 @@ xrep_agfl_walk_rmap( return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } +/* Strike out the blocks that are cross-linked according to the rmapbt. */ +STATIC int +xrep_agfl_check_extent( + struct xrep_agfl *ra, + uint64_t start, + uint64_t len) +{ + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); + xfs_agblock_t last_agbno = agbno + len - 1; + int error; + + ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); + + while (agbno <= last_agbno) { + bool other_owners; + + error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1, + &XFS_RMAP_OINFO_AG, &other_owners); + if (error) + return error; + + if (other_owners) { + error = xbitmap_set(&ra->crossed, agbno, 1); + if (error) + return error; + } + + if (xchk_should_terminate(ra->sc, &error)) + return error; + agbno++; + } + + return 0; +} + /* * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce * which blocks belong to the AGFL. 
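 *
 * (Summary sketch of the repair pipeline below, not quoted from the
 * patch, in set terms:
 *
 *	candidates  = OWN_AG space per the rmapbt
 *	candidates -= agmetablocks	// bnobt/cntbt/rmapbt blocks
 *	candidates -= crossed		// blocks with other rmap owners
 *	flcount = min(hweight(candidates), xfs_agfl_size(mp))
 *
 * — whatever survives becomes the new AGFL.)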
@@ -496,44 +537,58 @@ xrep_agfl_collect_blocks( struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; + struct xbitmap_range *br, *n; int error; ra.sc = sc; ra.freesp = agfl_extents; xbitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.crossed); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; - xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); - xbitmap_destroy(&ra.agmetablocks); if (error) - return error; + goto out_bmp; + + /* Strike out the blocks that are cross-linked. */ + ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + for_each_xbitmap_extent(br, n, agfl_extents) { + error = xrep_agfl_check_extent(&ra, br->start, br->len); + if (error) + break; + } + xfs_btree_del_cursor(ra.rmap_cur, error); + if (error) + goto out_bmp; + error = xbitmap_disunion(agfl_extents, &ra.crossed); + if (error) + goto out_bmp; /* * Calculate the new AGFL size. If we found more blocks than fit in @@ -541,11 +596,10 @@ xrep_agfl_collect_blocks( */ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), xfs_agfl_size(mp)); - return 0; -err: +out_bmp: + xbitmap_destroy(&ra.crossed); xbitmap_destroy(&ra.agmetablocks); - xfs_btree_del_cursor(cur, error); return error; } @@ -631,7 +685,7 @@ xrep_agfl_init_header( if (br->len) break; list_del(&br->list); - kmem_free(br); + kfree(br); } /* Write new AGFL to disk. */ @@ -697,7 +751,6 @@ xrep_agfl( * freespace overflow to the freespace btrees. */ sc->sa.agf_bp = agf_bp; - sc->sa.agfl_bp = agfl_bp; error = xrep_roll_ag_trans(sc); if (error) goto err; diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index ab427b4d7fe0..3b38f4e2a537 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -100,9 +100,7 @@ xchk_allocbt_rec( bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + if (!xfs_verify_agbext(pag, bno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_allocbt_xref(bs->sc, bno, len); diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index b6f0c9f3f124..31529b9bf389 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -49,7 +49,7 @@ xchk_setup_xattr_buf( if (ab) { if (sz <= ab->sz) return 0; - kmem_free(ab); + kvfree(ab); sc->buf = NULL; } @@ -79,7 +79,8 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. 
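 *
 * (Aside, not part of this diff: XCHK_GFP_FLAGS is the scrub-wide
 * allocation policy this series introduces; presumably along the
 * lines of
 *
 *	#define XCHK_GFP_FLAGS	((__force gfp_t)(GFP_KERNEL | \
 *					 __GFP_NOWARN | \
 *					 __GFP_RETRY_MAYFAIL))
 *
 * i.e. sleepable, quiet on failure, willing to retry — matching the
 * GFP_KERNEL | __GFP_RETRY_MAYFAIL call sites it replaces below.)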
*/ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, + XCHK_GFP_FLAGS); if (error) return error; } @@ -138,8 +139,7 @@ xchk_xattr_listent( * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -324,8 +324,7 @@ xchk_xattr_block( return 0; /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); if (error == -ENOMEM) return -EDEADLOCK; if (error) diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index b89bf9de9b1c..a255f09e9f0a 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "scrub/scrub.h" #include "scrub/bitmap.h" /* @@ -25,7 +26,7 @@ xbitmap_set( { struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); + bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); if (!bmr) return -ENOMEM; @@ -47,7 +48,7 @@ xbitmap_destroy( for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); - kmem_free(bmr); + kfree(bmr); } } @@ -174,15 +175,15 @@ xbitmap_disunion( /* Total overlap, just delete ex. */ lp = lp->next; list_del(&br->list); - kmem_free(br); + kfree(br); break; case 0: /* * Deleting from the middle: add the new right extent * and then shrink the left extent. */ - new_br = kmem_alloc(sizeof(struct xbitmap_range), - KM_MAYFAIL); + new_br = kmalloc(sizeof(struct xbitmap_range), + XCHK_GFP_FLAGS); if (!new_br) { error = -ENOMEM; goto out; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f0b9cb6506fd..d50d0eab196a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -90,6 +90,7 @@ out: struct xchk_bmap_info { struct xfs_scrub *sc; + struct xfs_iext_cursor icur; xfs_fileoff_t lastoff; bool is_rt; bool is_shared; @@ -146,6 +147,48 @@ xchk_bmap_get_rmap( return has_rmap; } +static inline bool +xchk_bmap_has_prev( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) + return false; + if (got.br_startoff + got.br_blockcount != irec->br_startoff) + return false; + if (got.br_startblock + got.br_blockcount != irec->br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + +static inline bool +xchk_bmap_has_next( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) + return false; + if (irec->br_startoff + irec->br_blockcount != got.br_startoff) + return false; + if (irec->br_startblock + irec->br_blockcount != got.br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + /* Make sure that we have rmapbt records for this extent. 
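 *
 * (Worked example for the has_prev/has_next helpers above, not part of
 * this diff: if the rmapbt maps agbno [10, 30) to this fork but the
 * record under scrub only covers [12, 30), there must be an incore
 * mapping immediately before it satisfying
 *
 *	prev.br_startoff   + prev.br_blockcount  == irec->br_startoff
 *	prev.br_startblock + prev.br_blockcount  == irec->br_startblock
 *	prev.br_state == irec->br_state
 *
 * or the fork is cross-referenced as corrupt.)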
*/ STATIC void xchk_bmap_xref_rmap( @@ -214,6 +257,34 @@ xchk_bmap_xref_rmap( if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + + /* + * If the rmap starts before this bmbt record, make sure there's a bmbt + * record for the previous offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && + !xchk_bmap_has_prev(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } + + /* + * If the rmap ends after this bmbt record, make sure there's a bmbt + * record for the next offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (info->whichfork != XFS_COW_FORK && + rmap_end > agbno + irec->br_blockcount && + !xchk_bmap_has_next(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } } /* Cross-reference a single rtdev extent record. */ @@ -264,6 +335,8 @@ xchk_bmap_iextent_xref( case XFS_COW_FORK: xchk_xref_is_cow_staging(info->sc, agbno, irec->br_blockcount); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); break; } @@ -297,14 +370,13 @@ xchk_bmap_dirattr_extent( } /* Scrub a single extent record. */ -STATIC int +STATIC void xchk_bmap_iextent( struct xfs_inode *ip, struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; - int error = 0; /* * Check for out-of-order extents. This record could have come @@ -325,14 +397,6 @@ xchk_bmap_iextent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - /* - * Check for delalloc extents. We never iterate the ones in the - * in-core extent scan, and we should never see these in the bmbt. - */ - if (isnullstartblock(irec->br_startblock)) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); - /* Make sure the extent points to a valid place. */ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, @@ -353,15 +417,12 @@ xchk_bmap_iextent( irec->br_startoff); if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return 0; + return; if (info->is_rt) xchk_bmap_rt_iextent_xref(ip, info, irec); else xchk_bmap_iextent_xref(ip, info, irec); - - info->lastoff = irec->br_startoff + irec->br_blockcount; - return error; } /* Scrub a bmbt record. */ @@ -599,14 +660,41 @@ xchk_bmap_check_rmaps( for_each_perag(sc->mp, agno, pag) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); - if (error) - break; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - break; + if (error || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xfs_perag_put(pag); + return error; + } } - if (pag) - xfs_perag_put(pag); - return error; + + return 0; +} + +/* Scrub a delalloc reservation from the incore extent map tree. */ +STATIC void +xchk_bmap_iextent_delalloc( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. 
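+ *
+ * (Background, not quoted from this patch: a delalloc reservation has
+ * no real startblock — br_startblock is a nullstartblock() cookie
+ * encoding only the worst-case indirect block count — so the caller
+ * routes records here with
+ *
+ *	if (isnullstartblock(irec.br_startblock))
+ *		xchk_bmap_iextent_delalloc(ip, &info, &irec);
+ *	else
+ *		xchk_bmap_iextent(ip, &info, &irec);
+ *
+ * instead of running the on-disk extent checks.)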
+ */ + if (irec->br_startoff < info->lastoff) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Make sure the extent points to a valid place. */ + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); } /* @@ -626,7 +714,6 @@ xchk_bmap( struct xfs_inode *ip = sc->ip; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t endoff; - struct xfs_iext_cursor icur; int error = 0; /* Non-existent forks can be ignored. */ @@ -661,6 +748,8 @@ xchk_bmap( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: /* No mappings to check. */ + if (whichfork == XFS_COW_FORK) + xchk_fblock_set_corrupt(sc, whichfork, 0); goto out; case XFS_DINODE_FMT_EXTENTS: break; @@ -690,20 +779,22 @@ xchk_bmap( /* Scrub extent records. */ info.lastoff = 0; ifp = xfs_ifork_ptr(ip, whichfork); - for_each_xfs_iext(ifp, &icur, &irec) { + for_each_xfs_iext(ifp, &info.icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; - if (isnullstartblock(irec.br_startblock)) - continue; + if (irec.br_startoff >= endoff) { xchk_fblock_set_corrupt(sc, whichfork, irec.br_startoff); goto out; } - error = xchk_bmap_iextent(ip, &info, &irec); - if (error) - goto out; + + if (isnullstartblock(irec.br_startblock)) + xchk_bmap_iextent_delalloc(ip, &info, &irec); + else + xchk_bmap_iextent(ip, &info, &irec); + info.lastoff = irec.br_startoff + irec.br_blockcount; } error = xchk_bmap_check_rmaps(sc, whichfork); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 2f4519590dc1..0fd36d5b4646 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -408,7 +408,6 @@ xchk_btree_check_owner( struct xfs_buf *bp) { struct xfs_btree_cur *cur = bs->cur; - struct check_owner *co; /* * In theory, xfs_btree_get_block should only give us a null buffer @@ -431,10 +430,13 @@ xchk_btree_check_owner( * later scanning. 
*/ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { - co = kmem_alloc(sizeof(struct check_owner), - KM_MAYFAIL); + struct check_owner *co; + + co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); if (!co) return -ENOMEM; + + INIT_LIST_HEAD(&co->list); co->level = level; co->daddr = xfs_buf_daddr(bp); list_add_tail(&co->list, &bs->to_check); @@ -649,7 +651,7 @@ xchk_btree( xchk_btree_set_corrupt(sc, cur, 0); return 0; } - bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL); + bs = kzalloc(cur_sz, XCHK_GFP_FLAGS); if (!bs) return -ENOMEM; bs->cur = cur; @@ -740,9 +742,9 @@ out: error = xchk_btree_check_block_owner(bs, co->level, co->daddr); list_del(&co->list); - kmem_free(co); + kfree(co); } - kmem_free(bs); + kfree(bs); return error; } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 9bbbf20f401b..613260b04a3d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -424,10 +424,6 @@ xchk_ag_read_headers( if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; - error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); - if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) - return error; - return 0; } @@ -515,10 +511,6 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); - if (sa->agfl_bp) { - xfs_trans_brelse(sc->tp, sa->agfl_bp); - sa->agfl_bp = NULL; - } if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; @@ -789,6 +781,33 @@ xchk_buffer_recheck( trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); } +static inline int +xchk_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + int error; + + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + break; + } + + sc->sm->sm_type = smtype; + return error; +} + /* * Scrub the attr/data forks of a metadata inode. The metadata inode must be * pointed to by sc->ip and the ILOCK must be held. @@ -797,13 +816,17 @@ int xchk_metadata_inode_forks( struct xfs_scrub *sc) { - __u32 smtype; bool shared; int error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + /* Check the inode record. */ + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + /* Metadata inodes don't live on the rt device. */ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -823,10 +846,7 @@ xchk_metadata_inode_forks( } /* Invoke the data fork scrubber. 
*/ - smtype = sc->sm->sm_type; - sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; - error = xchk_bmap_data(sc); - sc->sm->sm_type = smtype; + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; @@ -841,7 +861,7 @@ xchk_metadata_inode_forks( xchk_ino_set_corrupt(sc, sc->ip->i_ino); } - return error; + return 0; } /* diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 454145db10e7..b73648d81d23 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -25,7 +25,7 @@ xchk_should_terminate( if (fatal_signal_pending(current)) { if (*error == 0) - *error = -EAGAIN; + *error = -EINTR; return true; } return false; } diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 84fe3d33d699..d17cee177085 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -486,7 +486,7 @@ xchk_da_btree( return 0; /* Set up initial da state. */ - ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL); + ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS); if (!ds) return -ENOMEM; ds->dargs.dp = sc->ip; @@ -591,6 +591,6 @@ out: out_state: xfs_da_state_free(ds->state); - kmem_free(ds); + kfree(ds); return error; } diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 5abb5fdb71d9..d1b0f23c2c59 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -99,7 +99,7 @@ out: * we check the inode number to make sure it's sane, then we check that * we can look up this filename. Finally, we check the ftype. */ -STATIC int +STATIC bool xchk_dir_actor( struct dir_context *dir_iter, const char *name, @@ -124,7 +124,7 @@ xchk_dir_actor( xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); if (xchk_should_terminate(sdc->sc, &error)) - return error; + return !error; /* Does this inode number make sense? */ if (!xfs_verify_dir_ino(mp, ino)) { @@ -191,8 +191,8 @@ out: * and return zero to xchk_directory. */ if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return -EFSCORRUPTED; - return error; + return false; + return !error; } /* Scrub a directory btree record. */ @@ -666,7 +666,12 @@ xchk_directory_blocks( struct xfs_scrub *sc) { struct xfs_bmbt_irec got; - struct xfs_da_args args; + struct xfs_da_args args = { + .dp = sc->ip, + .whichfork = XFS_DATA_FORK, + .geo = sc->mp->m_dir_geo, + .trans = sc->tp, + }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; @@ -676,7 +681,7 @@ xchk_directory_blocks( xfs_dablk_t dabno; xfs_dir2_db_t last_data_db = 0; bool found; - int is_block = 0; + bool is_block = false; int error; /* Ignore local format directories. */ @@ -689,9 +694,6 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - args.dp = sc->ip; - args.geo = mp->m_dir_geo; - args.trans = sc->tp; error = xfs_dir2_isblock(&args, &is_block); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 6a6f8fe7f87c..4777e7b89fdc 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -14,6 +14,8 @@ #include "xfs_health.h" #include "xfs_btree.h" #include "xfs_ag.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -43,6 +45,16 @@ * our tolerance for mismatch between expected and actual counter values.
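
/*
 * The dir.c hunks above track the VFS change that made readdir actors return
 * bool ("keep iterating?") instead of an errno, so failures must be stashed
 * in the caller's context structure.  A standalone sketch of that calling
 * convention; the types and names here are illustrative, not the VFS API.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct dir_ctx {
        int error;              /* actor stashes its errno here */
        int seen;
};

static bool actor(struct dir_ctx *ctx, const char *name)
{
        if (name[0] == '\0') {
                ctx->error = -EINVAL;   /* record why we stopped */
                return false;           /* false means "stop iteration" */
        }
        ctx->seen++;
        return true;                    /* true means "continue" */
}

int main(void)
{
        const char *names[] = { "a", "b", "", "c" };
        struct dir_ctx ctx = { 0 };

        for (unsigned int i = 0; i < sizeof(names) / sizeof(names[0]); i++)
                if (!actor(&ctx, names[i]))
                        break;
        printf("seen %d, error %d\n", ctx.seen, ctx.error);
        return 0;
}
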
*/ +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; +}; + /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. However, @@ -116,10 +128,11 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; fsc = sc->buf; + fsc->sc = sc; xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); @@ -138,6 +151,18 @@ xchk_setup_fscounters( return xchk_trans_alloc(sc, 0); } +/* + * Part 1: Collecting filesystem summary counts. For each AG, we add its + * summary counts (total inodes, free inodes, free data blocks) to an incore + * copy of the overall filesystem summary counts. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. + */ + /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ static int xchk_fscount_btreeblks( @@ -225,8 +250,10 @@ retry: } if (pag) xfs_perag_put(pag); - if (error) + if (error) { + xchk_set_incomplete(sc); return error; + } /* * The global incore space reservation is taken from the incore @@ -267,6 +294,64 @@ retry: return 0; } +#ifdef CONFIG_XFS_RT +STATIC int +xchk_fscount_add_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xchk_fscounters *fsc = priv; + int error = 0; + + fsc->frextents += rec->ar_extcount; + + xchk_should_terminate(fsc->sc, &error); + return error; +} + +/* Calculate the number of free realtime extents from the realtime bitmap. */ +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + int error; + + fsc->frextents = 0; + if (!xfs_has_realtime(mp)) + return 0; + + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, + xchk_fscount_add_frextent, fsc); + if (error) { + xchk_set_incomplete(sc); + goto out_unlock; + } + +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} +#else +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + fsc->frextents = 0; + return 0; +} +#endif /* CONFIG_XFS_RT */ + +/* + * Part 2: Comparing filesystem summary counters. All we have to do here is + * sum the percpu counters and compare them to what we've observed. + */ + /* * Is the @counter reasonably close to the @expected value? * @@ -333,16 +418,17 @@ xchk_fscounters( { struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; - int64_t icount, ifree, fdblocks; + int64_t icount, ifree, fdblocks, frextents; int error; /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); fdblocks = percpu_counter_sum(&mp->m_fdblocks); + frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! 
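
/*
 * xchk_fscount_count_frextents() above walks every free extent record in the
 * realtime bitmap and lets a callback accumulate the running total.  The
 * shape of that query-with-callback pattern, reduced to standalone C; the
 * record layout and names are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

struct rtrec { uint64_t extcount; };

typedef int (*rt_query_fn)(const struct rtrec *rec, void *priv);

static int query_all(const struct rtrec *recs, int nr, rt_query_fn fn, void *priv)
{
        for (int i = 0; i < nr; i++) {
                int error = fn(&recs[i], priv);

                if (error)
                        return error;   /* callback can abort the walk */
        }
        return 0;
}

static int add_frextent(const struct rtrec *rec, void *priv)
{
        *(uint64_t *)priv += rec->extcount;
        return 0;
}

int main(void)
{
        struct rtrec recs[] = { { 3 }, { 5 }, { 7 } };
        uint64_t frextents = 0;

        query_all(recs, 3, add_frextent, &frextents);
        printf("frextents = %llu\n", (unsigned long long)frextents);
        return 0;
}
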
*/ - if (icount < 0 || ifree < 0 || fdblocks < 0) + if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) xchk_set_corrupt(sc); /* See if icount is obviously wrong. */ @@ -353,6 +439,10 @@ xchk_fscounters( if (fdblocks > mp->m_sb.sb_dblocks) xchk_set_corrupt(sc); + /* See if frextents is obviously wrong. */ + if (frextents > mp->m_sb.sb_rextents) + xchk_set_corrupt(sc); + /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. @@ -367,6 +457,13 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; + /* Count the free extents counter for rt volumes. */ + error = xchk_fscount_count_frextents(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + /* Compare the in-core counters with whatever we counted. */ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) xchk_set_corrupt(sc); @@ -378,5 +475,9 @@ xchk_fscounters( fsc->fdblocks)) xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + fsc->frextents)) + xchk_set_corrupt(sc); + return 0; } diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index e1026e07bf94..e312be7cd375 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -108,9 +108,8 @@ xchk_iallocbt_chunk( xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + + if (!xfs_verify_agbext(pag, bno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 51820b40ab1c..7a2f38e5202c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -365,7 +365,7 @@ xchk_dinode( * pagecache can't cache all the blocks in this file due to * overly large offsets, flag the inode for admin review. */ - if (isize >= mp->m_super->s_maxbytes) + if (isize > mp->m_super->s_maxbytes) xchk_ino_set_warning(sc, ino); /* di_nblocks */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index ab182a5cd0c0..d8dff3fd8053 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -38,7 +38,7 @@ struct xchk_parent_ctx { }; /* Look for a single entry in a directory pointing to an inode. */ -STATIC int +STATIC bool xchk_parent_actor( struct dir_context *dc, const char *name, @@ -62,7 +62,7 @@ xchk_parent_actor( if (xchk_should_terminate(spc->sc, &error)) spc->cancelled = true; - return error; + return !error; } /* Count the number of dentries in the parent dir that point to this inode. */ diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 21b4c9006859..9eeac8565394 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -14,6 +14,7 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -84,7 +85,7 @@ xchk_quota_item( int error = 0; if (xchk_should_terminate(sc, &error)) - return -ECANCELED; + return error; /* * Except for the root dquot, the actual dquot we got must either have @@ -189,11 +190,12 @@ xchk_quota_data_fork( for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) break; + /* - * delalloc extents or blocks mapped above the highest + * delalloc/unwritten extents or blocks mapped above the highest * quota id shouldn't happen. 
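
/*
 * The xchk_fscount_within_range() checks above flag corruption only when a
 * summary counter strays outside an allowed variance, because lockless
 * percpu sums are expected to drift slightly.  A simplified sketch of such
 * a tolerance check; the real helper is more involved.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool within_range(uint64_t observed, uint64_t expected, uint64_t variance)
{
        uint64_t delta = observed > expected ? observed - expected
                                             : expected - observed;

        return delta <= variance;
}

int main(void)
{
        printf("ok:  %d\n", within_range(1000, 998, 16));       /* 1: close enough */
        printf("bad: %d\n", within_range(1000, 500, 16));       /* 0: corrupt */
        return 0;
}
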
*/ - if (isnullstartblock(irec.br_startblock) || + if (!xfs_bmap_is_written_extent(&irec) || irec.br_startoff > max_dqid_off || irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index c68b767dc08f..d9c1b3cea4a5 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -127,8 +127,8 @@ xchk_refcountbt_rmap_check( * is healthy each rmap_irec we see will be in agbno order * so we don't need insertion sort here. */ - frag = kmem_alloc(sizeof(struct xchk_refcnt_frag), - KM_MAYFAIL); + frag = kmalloc(sizeof(struct xchk_refcnt_frag), + XCHK_GFP_FLAGS); if (!frag) return -ENOMEM; memcpy(&frag->rm, rec, sizeof(frag->rm)); @@ -215,7 +215,7 @@ xchk_refcountbt_process_rmap_fragments( continue; } list_del(&frag->list); - kmem_free(frag); + kfree(frag); nr++; } @@ -257,11 +257,11 @@ done: /* Delete fragments and work list. */ list_for_each_entry_safe(frag, n, &worklist, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } list_for_each_entry_safe(frag, n, &refchk->fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } @@ -269,15 +269,13 @@ done: STATIC void xchk_refcountbt_xref_rmap( struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_nlink_t refcount) + const struct xfs_refcount_irec *irec) { struct xchk_refcnt_check refchk = { - .sc = sc, - .bno = bno, - .len = len, - .refcount = refcount, + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, .seen = 0, }; struct xfs_rmap_irec low; @@ -291,9 +289,9 @@ xchk_refcountbt_xref_rmap( /* Cross-reference with the rmapbt to confirm the refcount. */ memset(&low, 0, sizeof(low)); - low.rm_startblock = bno; + low.rm_startblock = irec->rc_startblock; memset(&high, 0xFF, sizeof(high)); - high.rm_startblock = bno + len - 1; + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; INIT_LIST_HEAD(&refchk.fragments); error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, @@ -302,30 +300,29 @@ xchk_refcountbt_xref_rmap( goto out_free; xchk_refcountbt_process_rmap_fragments(&refchk); - if (refcount != refchk.seen) + if (irec->rc_refcount != refchk.seen) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } /* Cross-reference with the other btrees. */ STATIC void xchk_refcountbt_xref( - struct xfs_scrub *sc, - xfs_agblock_t agbno, - xfs_extlen_t len, - xfs_nlink_t refcount) + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) { if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; - xchk_xref_is_used_space(sc, agbno, len); - xchk_xref_is_not_inode_chunk(sc, agbno, len); - xchk_refcountbt_xref_rmap(sc, agbno, len, refcount); + xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount); + xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock, + irec->rc_blockcount); + xchk_refcountbt_xref_rmap(sc, irec); } /* Scrub a refcountbt record. 
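
/*
 * The refcountbt record scrubber just below decodes the on-disk record into
 * an irec with an explicit rc_domain, replacing the old convention of
 * tagging CoW staging extents with a high bit in the start block.  A sketch
 * of that old-flag-to-domain decoding; the constants and types are
 * illustrative, not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>

#define REFC_COW_START  (1U << 31)      /* old flag bit in startblock */

enum refc_domain { DOMAIN_SHARED, DOMAIN_COW };

struct refc_irec {
        uint32_t startblock;
        uint32_t blockcount;
        uint32_t refcount;
        enum refc_domain domain;
};

static void btrec_to_irec(uint32_t raw_start, uint32_t len, uint32_t refc,
                          struct refc_irec *irec)
{
        irec->domain = (raw_start & REFC_COW_START) ? DOMAIN_COW : DOMAIN_SHARED;
        irec->startblock = raw_start & ~REFC_COW_START;
        irec->blockcount = len;
        irec->refcount = refc;
}

int main(void)
{
        struct refc_irec irec;

        btrec_to_irec(REFC_COW_START | 1234, 16, 1, &irec);
        printf("agbno %u, domain %s\n", irec.startblock,
               irec.domain == DOMAIN_COW ? "cow" : "shared");
        return 0;
}
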
*/ @@ -334,35 +331,27 @@ xchk_refcountbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { + struct xfs_refcount_irec irec; xfs_agblock_t *cow_blocks = bs->private; struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; - xfs_extlen_t len; - xfs_nlink_t refcount; - bool has_cowflag; - bno = be32_to_cpu(rec->refc.rc_startblock); - len = be32_to_cpu(rec->refc.rc_blockcount); - refcount = be32_to_cpu(rec->refc.rc_refcount); + xfs_refcount_btrec_to_irec(rec, &irec); - /* Only CoW records can have refcount == 1. */ - has_cowflag = (bno & XFS_REFC_COW_START); - if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag)) + /* Check the domain and refcount are not incompatible. */ + if (!xfs_refcount_check_domain(&irec)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (has_cowflag) - (*cow_blocks) += len; + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + (*cow_blocks) += irec.rc_blockcount; /* Check the extent. */ - bno &= ~XFS_REFC_COW_START; - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (refcount == 0) + if (irec.rc_refcount == 0) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - xchk_refcountbt_xref(bs->sc, bno, len, refcount); + xchk_refcountbt_xref(bs->sc, &irec); return 0; } @@ -426,7 +415,6 @@ xchk_xref_is_cow_staging( xfs_extlen_t len) { struct xfs_refcount_irec rc; - bool has_cowflag; int has_refcount; int error; @@ -434,8 +422,8 @@ xchk_xref_is_cow_staging( return; /* Find the CoW staging extent. */ - error = xfs_refcount_lookup_le(sc->sa.refc_cur, - agbno + XFS_REFC_COW_START, &has_refcount); + error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, &has_refcount); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; if (!has_refcount) { @@ -451,9 +439,8 @@ xchk_xref_is_cow_staging( return; } - /* CoW flag must be set, refcount must be 1. */ - has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START); - if (!has_cowflag || rc.rc_refcount != 1) + /* CoW lookup returned a shared extent record? */ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); /* Must be at least as long as what was passed in */ @@ -477,7 +464,8 @@ xchk_xref_is_not_shared( if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); + error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED, + agbno, len, &shared); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; if (shared) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c18bd039fce9..4b92f9253ccd 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -61,7 +61,6 @@ xrep_attempt( sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; case -EDEADLOCK: - case -EAGAIN: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { sc->flags |= XCHK_TRY_HARDER; @@ -70,10 +69,15 @@ xrep_attempt( /* * We tried harder but still couldn't grab all the resources * we needed to fix it. The corruption has not been fixed, - * so report back to userspace. + * so exit to userspace with the scan's output flags unchanged. */ - return -EFSCORRUPTED; + return 0; default: + /* + * EAGAIN tells the caller to re-scrub, so we cannot return + * that here. 
+ */ + ASSERT(error != -EAGAIN); return error; } } @@ -121,32 +125,40 @@ xrep_roll_ag_trans( { int error; - /* Keep the AG header buffers locked so we can keep going. */ - if (sc->sa.agi_bp) + /* + * Keep the AG header buffers locked while we roll the transaction. + * Ensure that both AG buffers are dirty and held when we roll the + * transaction so that they move forward in the log without losing the + * bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agi_bp); - if (sc->sa.agf_bp) + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); + } /* - * Roll the transaction. We still own the buffer and the buffer lock - * regardless of whether or not the roll succeeds. If the roll fails, - * the buffers will be released during teardown on our way out of the - * kernel. If it succeeds, we join them to the new transaction and - * move on. + * Roll the transaction. We still hold the AG header buffers locked + * regardless of whether or not that succeeds. On failure, the buffers + * will be released during teardown on our way out of the kernel. If + * successful, join the buffers to the new transaction and move on. */ error = xfs_trans_roll(&sc->tp); if (error) return error; - /* Join AG headers to the new transaction. */ + /* Join the AG headers to the new transaction. */ if (sc->sa.agi_bp) xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); return 0; } @@ -498,6 +510,7 @@ xrep_put_freelist( struct xfs_scrub *sc, xfs_agblock_t agbno) { + struct xfs_buf *agfl_bp; int error; /* Make sure there's space on the freelist. */ @@ -516,8 +529,12 @@ xrep_put_freelist( return error; /* Put the block on the AGFL. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, - sc->sa.agfl_bp, agbno, 0); + agfl_bp, agbno, 0); if (error) return error; xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 2e8e400f10a9..07a7a75f987f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -174,7 +174,7 @@ xchk_teardown( if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->buf) { - kmem_free(sc->buf); + kvfree(sc->buf); sc->buf = NULL; } return error; @@ -467,7 +467,7 @@ xfs_scrub_metadata( xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, "EXPERIMENTAL online scrub feature in use. 
Use at your own risk!"); - sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); + sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; goto out; @@ -557,7 +557,7 @@ out_nofix: out_teardown: error = xchk_teardown(sc, error); out_sc: - kmem_free(sc); + kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); if (error == -EFSCORRUPTED || error == -EFSBADCRC) { diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 3de5287e98d8..b4d391b4c938 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,15 @@ struct xfs_scrub; +/* + * Standard flags for allocating memory within scrub. NOFS context is + * configured by the process allocation scope. Scrub and repair must be able + * to back out gracefully if there isn't enough memory. Force-cast to avoid + * complaints from static checkers. + */ +#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ + __GFP_RETRY_MAYFAIL)) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ @@ -39,7 +48,6 @@ struct xchk_ag { /* AG btree roots */ struct xfs_buf *agf_bp; - struct xfs_buf *agfl_bp; struct xfs_buf *agi_bp; /* AG btrees */ @@ -161,12 +169,4 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) #endif -struct xchk_fscounters { - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - unsigned long long icount_min; - unsigned long long icount_max; -}; - #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 75311f8daeeb..c1c99ffe7408 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -21,7 +21,7 @@ xchk_setup_symlink( struct xfs_scrub *sc) { /* Allocate the buffer without the inode lock held. 
*/ - sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b744c62052b6..a05f44eb8178 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -242,12 +242,13 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { umode_t mode; bool set_mode = false; int error = 0; + struct inode *inode = d_inode(dentry); if (!acl) goto set_acl; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 263404d0bfda..dcd176149c7a 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); -extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5d1a995b15f8..41734202796f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -17,6 +17,8 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_reflink.h" +#include "xfs_errortag.h" +#include "xfs_error.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -114,9 +116,8 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, - XFS_B_TO_FSBT(mp, offset), - XFS_B_TO_FSB(mp, size)); + xfs_bmap_punch_delalloc_range(ip, offset, + offset + size); } goto done; } @@ -218,11 +219,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. */ - if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) { + trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->data_seq, XFS_DATA_FORK); return false; + } if (xfs_inode_has_cow_data(ip) && - XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) { + trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->cow_seq, XFS_COW_FORK); return false; + } return true; } @@ -286,6 +293,8 @@ xfs_map_blocks( if (xfs_is_shutdown(mp)) return -EIO; + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + /* * COW fork blocks can overlap data fork blocks even if the blocks * aren't shared. 
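
/*
 * The writeback changes above revalidate a cached mapping by comparing the
 * sequence number sampled when the mapping was built (data_seq/cow_seq)
 * against the fork's current if_seq, which every extent-map modification
 * bumps.  A standalone, single-threaded sketch of that stale-cache
 * detection; all names here are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ifork { uint32_t seq; };

struct cached_map {
        uint64_t start, len;
        uint32_t seq;           /* ifork->seq when this map was sampled */
};

static void sample_map(const struct ifork *f, struct cached_map *map)
{
        map->start = 0;
        map->len = 8;
        map->seq = f->seq;
}

static bool map_valid(const struct ifork *f, const struct cached_map *map)
{
        return map->seq == f->seq;
}

int main(void)
{
        struct ifork f = { .seq = 42 };
        struct cached_map map;

        sample_map(&f, &map);
        printf("valid before change: %d\n", map_valid(&f, &map));      /* 1 */
        f.seq++;        /* extent map changed; cached mapping is stale */
        printf("valid after change:  %d\n", map_valid(&f, &map));      /* 0 */
        return 0;
}
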
COW I/O always takes precedent, so we must always @@ -373,7 +382,7 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: @@ -455,12 +464,8 @@ xfs_discard_folio( struct folio *folio, loff_t pos) { - struct inode *inode = folio->mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(folio->mapping->host); struct xfs_mount *mp = ip->i_mount; - size_t offset = offset_in_folio(folio, pos); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) @@ -470,8 +475,9 @@ xfs_discard_folio( "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", folio, ip->i_ino, pos); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_folio(inode, folio) - pageoff_fsb); + error = xfs_bmap_punch_delalloc_range(ip, pos, + round_up(pos, folio_size(folio))); + if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5077a7ad5646..2788a6f2edcd 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -86,8 +86,6 @@ xfs_attri_log_nameval_alloc( */ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + name_len + value_len); - if (!nv) - return nv; nv->name.i_addr = nv + 1; nv->name.i_len = name_len; @@ -247,28 +245,6 @@ xfs_attri_init( return attrip; } -/* - * Copy an attr format buffer from the given buf, and into the destination attr - * format structure. - */ -STATIC int -xfs_attri_copy_format( - struct xfs_log_iovec *buf, - struct xfs_attri_log_format *dst_attr_fmt) -{ - struct xfs_attri_log_format *src_attr_fmt = buf->i_addr; - size_t len; - - len = sizeof(struct xfs_attri_log_format); - if (buf->i_len != len) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; - } - - memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len); - return 0; -} - static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_attrd_log_item, attrd_item); @@ -441,8 +417,6 @@ xfs_attr_create_intent( attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, args->namelen, args->value, args->valuelen); } - if (!attr->xattri_nameval) - return ERR_PTR(-ENOMEM); attrip = xfs_attri_init(mp, attr->xattri_nameval); xfs_trans_add_item(tp, &attrip->attri_item); @@ -735,24 +709,50 @@ xlog_recover_attri_commit_pass2( struct xfs_attri_log_nameval *nv; const void *attr_value = NULL; const void *attr_name; - int error; + size_t len; attri_formatp = item->ri_buf[0].i_addr; attr_name = item->ri_buf[1].i_addr; /* Validate xfs_attri_log_format before the large memory allocation */ + len = sizeof(struct xfs_attri_log_format); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + if (!xfs_attri_validate(mp, attri_formatp)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + /* Validate the attr name */ + if (item->ri_buf[1].i_len != + xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { + XFS_CORRUPTION_ERROR(__func__, 
XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[1].i_addr, item->ri_buf[1].i_len); return -EFSCORRUPTED; } - if (attri_formatp->alfi_value_len) + /* Validate the attr value, if present */ + if (attri_formatp->alfi_value_len != 0) { + if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + attr_value = item->ri_buf[2].i_addr; + } /* * Memory alloc failure will cause replay to abort. We attach the @@ -762,13 +762,9 @@ xlog_recover_attri_commit_pass2( nv = xfs_attri_log_nameval_alloc(attr_name, attri_formatp->alfi_name_len, attr_value, attri_formatp->alfi_value_len); - if (!nv) - return -ENOMEM; attrip = xfs_attri_init(mp, nv); - error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format); - if (error) - goto out; + memcpy(&attrip->attri_format, attri_formatp, len); /* * The ATTRI has two references. One for the ATTRD and one for ATTRI to @@ -780,10 +776,6 @@ xlog_recover_attri_commit_pass2( xfs_attri_release(attrip); xfs_attri_log_nameval_put(nv); return 0; -out: - xfs_attri_item_free(attrip); - xfs_attri_log_nameval_put(nv); - return error; } /* @@ -848,7 +840,8 @@ xlog_recover_attrd_commit_pass2( attrd_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 51f66e982484..41323da523d1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -608,28 +608,18 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_relog = xfs_bui_item_relog, }; -/* - * Copy an BUI format buffer from the given buf, and into the destination - * BUI format structure. The BUI/BUD items were designed not to need any - * special alignment handling. 
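
/*
 * A recurring shape in these recovery fixes: never trust a count field in a
 * recovered buffer until the buffer is proven long enough to hold what that
 * count implies.  A standalone sketch of the validate-then-copy rule; the
 * structures are illustrative and -EUCLEAN stands in for -EFSCORRUPTED.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct log_fmt {
        uint32_t nextents;
        uint32_t extents[];     /* nextents entries follow */
};

static int recover_fmt(const void *buf, size_t buflen, struct log_fmt **out)
{
        const struct log_fmt *src = buf;
        size_t want;

        if (buflen < sizeof(*src))
                return -EUCLEAN;        /* too short for the header */
        want = sizeof(*src) + src->nextents * sizeof(uint32_t);
        if (buflen != want)
                return -EUCLEAN;        /* count disagrees with buffer */

        *out = malloc(want);
        if (!*out)
                return -ENOMEM;
        memcpy(*out, src, want);        /* safe: length checked above */
        return 0;
}

int main(void)
{
        struct log_fmt *fmt = NULL;
        uint32_t raw[3] = { 2, 7, 9 };  /* header says 2 extents */
        int error = recover_fmt(raw, sizeof(raw), &fmt);

        printf("recover: %d, nextents %u\n", error, error ? 0 : fmt->nextents);
        free(fmt);
        return 0;
}
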
- */ -static int +static inline void xfs_bui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_bui_log_format *dst_bui_fmt) + struct xfs_bui_log_format *dst, + const struct xfs_bui_log_format *src) { - struct xfs_bui_log_format *src_bui_fmt; - uint len; + unsigned int i; - src_bui_fmt = buf->i_addr; - len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents)); - if (buf->i_len == len) { - memcpy(dst_bui_fmt, src_bui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->bui_nextents; i++) + memcpy(&dst->bui_extents[i], &src->bui_extents[i], + sizeof(struct xfs_map_extent)); } /* @@ -646,23 +636,34 @@ xlog_recover_bui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_bui_log_item *buip; struct xfs_bui_log_format *bui_formatp; + size_t len; bui_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } - buip = xfs_bui_init(mp); - error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); - if (error) { - xfs_bui_item_free(buip); - return error; + + len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + buip = xfs_bui_init(mp); + xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -696,7 +697,8 @@ xlog_recover_bud_commit_pass2( bud_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 04d0c2bff67c..867645b74d88 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -590,11 +590,13 @@ out_unlock_iolock: int xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length) + xfs_off_t start_byte, + xfs_off_t end_byte) { + struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = &ip->i_df; - xfs_fileoff_t end_fsb = start_fsb + length; + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range( while (got.br_startoff + got.br_blockcount > start_fsb) { del = got; - xfs_trim_extent(&del, start_fsb, length); + xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); /* * A delete can push the cursor forward. 
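
/*
 * xfs_bmap_punch_delalloc_range() above now takes a byte range and converts
 * it to blocks itself: the start rounds down and the end rounds up, so every
 * block partially covered by the byte range is included.  A sketch of that
 * conversion with an assumed 4096-byte block size; the helpers mimic
 * XFS_B_TO_FSBT/XFS_B_TO_FSB but are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define BLOCKLOG 12     /* 4096-byte blocks */

static uint64_t b_to_fsbt(uint64_t bytes)       /* truncate (round down) */
{
        return bytes >> BLOCKLOG;
}

static uint64_t b_to_fsb(uint64_t bytes)        /* round up */
{
        return (bytes + (1ULL << BLOCKLOG) - 1) >> BLOCKLOG;
}

int main(void)
{
        uint64_t start_byte = 5000, end_byte = 13000;

        printf("punch blocks [%llu, %llu)\n",
               (unsigned long long)b_to_fsbt(start_byte),       /* 1 */
               (unsigned long long)b_to_fsb(end_byte));         /* 4 */
        return 0;
}
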
Step back to the diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 24b37d211f1d..6888078f5c31 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t length); + xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index dde346450952..54c774af6e1c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1945,6 +1945,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); + invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); kmem_free(btp); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 522d450a94b1..df7322ed73fa 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1018,6 +1018,8 @@ xfs_buf_item_relse( trace_xfs_buf_item_relse(bp, _RET_IP_); ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + if (atomic_read(&bip->bli_refcount)) + return; bp->b_log_item = NULL; xfs_buf_rele(bp); xfs_buf_item_free(bip); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index e295fc8062d8..9f3ceb461515 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -512,7 +512,7 @@ xfs_readdir( { struct xfs_da_args args = { NULL }; unsigned int lock_mode; - int isblock; + bool isblock; int error; trace_xfs_readdir(dp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 296faa41d81d..ae082808cfed 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REFCOUNT_FINISH_ONE, XFS_RANDOM_BMAP_FINISH_ONE, XFS_RANDOM_AG_RESV_CRITICAL, - XFS_RANDOM_DROP_WRITES, + 0, /* XFS_RANDOM_DROP_WRITES has been removed */ XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, @@ -60,6 +60,8 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_LARP, XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, + XFS_RANDOM_WB_DELAY_MS, + XFS_RANDOM_WRITE_DELAY_MS, }; struct xfs_errortag_attr { @@ -162,7 +164,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); @@ -176,6 +177,8 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); +XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); +XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -206,7 +209,6 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), 
XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), @@ -220,6 +222,8 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(larp), XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), + XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), + XFS_ERRORTAG_ATTR_LIST(write_delay_ms), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); @@ -234,13 +238,18 @@ int xfs_errortag_init( struct xfs_mount *mp) { + int ret; + mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, KM_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; - return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, - &mp->m_kobj, "errortag"); + ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, + &mp->m_kobj, "errortag"); + if (ret) + kmem_free(mp->m_errortag); + return ret; } void @@ -251,6 +260,32 @@ xfs_errortag_del( kmem_free(mp->m_errortag); } +static bool +xfs_errortag_valid( + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return false; + + /* Error out removed injection types */ + if (error_tag == XFS_ERRTAG_DROP_WRITES) + return false; + return true; +} + +bool +xfs_errortag_enabled( + struct xfs_mount *mp, + unsigned int tag) +{ + if (!mp->m_errortag) + return false; + if (!xfs_errortag_valid(tag)) + return false; + + return mp->m_errortag[tag] != 0; +} + bool xfs_errortag_test( struct xfs_mount *mp, @@ -272,9 +307,11 @@ xfs_errortag_test( if (!mp->m_errortag) return false; - ASSERT(error_tag < XFS_ERRTAG_MAX); + if (!xfs_errortag_valid(error_tag)) + return false; + randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32() % randfactor) + if (!randfactor || get_random_u32_below(randfactor)) return false; xfs_warn_ratelimited(mp, @@ -288,7 +325,7 @@ xfs_errortag_get( struct xfs_mount *mp, unsigned int error_tag) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return mp->m_errortag[error_tag]; @@ -300,7 +337,7 @@ xfs_errortag_set( unsigned int error_tag, unsigned int tag_value) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; mp->m_errortag[error_tag] = tag_value; @@ -314,7 +351,7 @@ xfs_errortag_add( { BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return xfs_errortag_set(mp, error_tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5191e9145e55..dbe6c37dc697 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -45,6 +45,18 @@ extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, const char *file, int line, unsigned int error_tag); #define XFS_TEST_ERROR(expr, mp, tag) \ ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); +#define XFS_ERRORTAG_DELAY(mp, tag) \ + do { \ + might_sleep(); \ + if (!xfs_errortag_enabled((mp), (tag))) \ + break; \ + xfs_warn_ratelimited((mp), \ +"Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \ + (mp)->m_errortag[(tag)], __FILE__, __LINE__, \ + (mp)->m_super->s_id); \ + mdelay((mp)->m_errortag[(tag)]); \ + } while (0) extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, @@ -55,6 +67,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) #define XFS_TEST_ERROR(expr, mp, tag) (expr) 
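
/*
 * XFS_ERRORTAG_DELAY above turns a configured tag value into a sleep so that
 * narrow races (here, between writeback and extent-map changes) can be
 * widened deterministically during testing.  A userspace sketch of the same
 * guarded delay macro; the tag table and names are illustrative.
 */
#include <stdio.h>
#include <unistd.h>

static unsigned int errortag[8];        /* delay in ms, 0 = disabled */

#define ERRORTAG_DELAY(tag)                                             \
        do {                                                            \
                if (!errortag[(tag)])                                   \
                        break;                                          \
                fprintf(stderr, "injecting %ums delay at %s:%d\n",      \
                        errortag[(tag)], __FILE__, __LINE__);           \
                usleep(errortag[(tag)] * 1000u);                        \
        } while (0)

int main(void)
{
        errortag[3] = 5;        /* arm tag 3 with a 5ms delay */
        ERRORTAG_DELAY(3);
        puts("done");
        return 0;
}
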
+#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) #define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 27ccfcd82f04..d5130d1fcfae 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -66,27 +66,16 @@ xfs_efi_release( xfs_efi_item_free(efip); } -/* - * This returns the number of iovecs needed to log the given efi item. - * We only need 1 iovec for an efi item. It just logs the efi_log_format - * structure. - */ -static inline int -xfs_efi_item_sizeof( - struct xfs_efi_log_item *efip) -{ - return sizeof(struct xfs_efi_log_format) + - (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); -} - STATIC void xfs_efi_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); + *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } /* @@ -112,7 +101,7 @@ xfs_efi_item_format( xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format, - xfs_efi_item_sizeof(efip)); + xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents)); } @@ -155,13 +144,11 @@ xfs_efi_init( { struct xfs_efi_log_item *efip; - uint size; ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_efi_log_item) + - ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, 0); + efip = kzalloc(xfs_efi_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); } else { efip = kmem_cache_zalloc(xfs_efi_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -188,15 +175,17 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; - uint len = sizeof(xfs_efi_log_format_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); - uint len32 = sizeof(xfs_efi_log_format_32_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); - uint len64 = sizeof(xfs_efi_log_format_64_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents); + uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents); + uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents); if (buf->i_len == len) { - memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + memcpy(dst_efi_fmt, src_efi_fmt, + offsetof(struct xfs_efi_log_format, efi_extents)); + for (i = 0; i < src_efi_fmt->efi_nextents; i++) + memcpy(&dst_efi_fmt->efi_extents[i], + &src_efi_fmt->efi_extents[i], + sizeof(struct xfs_extent)); return 0; } else if (buf->i_len == len32) { xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; @@ -227,7 +216,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr, + buf->i_len); return -EFSCORRUPTED; } @@ -246,27 +236,16 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) kmem_cache_free(xfs_efd_cache, efdp); } -/* - * This returns the number of iovecs needed to log the given efd item. - * We only need 1 iovec for an efd item. It just logs the efd_log_format - * structure. 
- */ -static inline int -xfs_efd_item_sizeof( - struct xfs_efd_log_item *efdp) -{ - return sizeof(xfs_efd_log_format_t) + - (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); -} - STATIC void xfs_efd_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); + *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } /* @@ -291,7 +270,7 @@ xfs_efd_item_format( xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format, - xfs_efd_item_sizeof(efdp)); + xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents)); } /* @@ -340,9 +319,8 @@ xfs_trans_get_efd( ASSERT(nextents > 0); if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + - (nextents - 1) * sizeof(struct xfs_extent), - 0); + efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); } else { efdp = kmem_cache_zalloc(xfs_efd_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -733,6 +711,12 @@ xlog_recover_efi_commit_pass2( efi_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); if (error) { @@ -769,12 +753,24 @@ xlog_recover_efd_commit_pass2( xfs_lsn_t lsn) { struct xfs_efd_log_format *efd_formatp; + int buflen = item->ri_buf[0].i_len; efd_formatp = item->ri_buf[0].i_addr; - ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || - (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + + if (buflen < sizeof(struct xfs_efd_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } + + if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + efd_formatp->efd_nextents) && + item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + efd_formatp->efd_nextents)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id); return 0; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 186d0f2137f1..da6a5afa607c 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -52,6 +52,14 @@ struct xfs_efi_log_item { xfs_efi_log_format_t efi_format; }; +static inline size_t +xfs_efi_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_efi_log_item, efi_format) + + xfs_efi_log_format_sizeof(nr); +} + /* * This is the "extent free done" log item. It is used to log * the fact that some extents earlier mentioned in an efi item @@ -64,6 +72,14 @@ struct xfs_efd_log_item { xfs_efd_log_format_t efd_format; }; +static inline size_t +xfs_efd_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_efd_log_item, efd_format) + + xfs_efd_log_format_sizeof(nr); +} + /* * Max number of extents in fast allocation path. 
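
/*
 * The new xfs_ef{i,d}_log_item_sizeof() helpers above size variable-length
 * structures with offsetof() instead of the old "sizeof() plus (n - 1)
 * trailing records" arithmetic, which miscounts whenever padding changes.
 * The same pattern in standalone C (the embedded flexible array member
 * mirrors the kernel layout and relies on a common GCC/Clang extension);
 * the types are illustrative.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t start, len; };

struct log_format {
        uint16_t type;
        uint16_t size;
        uint32_t nextents;
        struct extent extents[];        /* flexible array member */
};

struct log_item {
        void *li_log;                   /* bookkeeping before the format */
        struct log_format format;
};

static size_t log_format_sizeof(unsigned int nr)
{
        return offsetof(struct log_format, extents) + nr * sizeof(struct extent);
}

static size_t log_item_sizeof(unsigned int nr)
{
        return offsetof(struct log_item, format) + log_format_sizeof(nr);
}

int main(void)
{
        printf("format(4) = %zu, item(4) = %zu\n",
               log_format_sizeof(4), log_item_sizeof(4));
        return 0;
}
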
*/ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c6c80265c0b2..595a5bcf46b9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1261,7 +1261,7 @@ xfs_file_llseek( } #ifdef CONFIG_FS_DAX -static int +static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, enum page_entry_size pe_size, @@ -1274,14 +1274,15 @@ xfs_dax_fault( &xfs_read_iomap_ops); } #else -static int +static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, enum page_entry_size pe_size, bool write_fault, pfn_t *pfn) { - return 0; + ASSERT(0); + return VM_FAULT_SIGBUS; } #endif @@ -1324,7 +1325,7 @@ __xfs_filemap_fault( if (write_fault) { xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, - &xfs_buffered_write_iomap_ops); + &xfs_page_mkwrite_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { ret = filemap_fault(vmf); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index d8337274c74d..88a88506ffff 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -524,7 +524,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_mount *mp = tp->t_mountp; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); /* * Set up query parameters to return free rtextents covering the range @@ -551,7 +551,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( if (error) goto err; err: - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2bbe7916a998..f35e2cee5265 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -342,6 +342,9 @@ xfs_iget_recycle( trace_xfs_iget_recycle(ip); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return -EAGAIN; + /* * We need to make it look like the inode is being reclaimed to prevent * the actual reclaim workers from stomping over us while we recycle @@ -355,6 +358,7 @@ xfs_iget_recycle( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { /* * Re-initializing the inode failed, and we are in deep @@ -518,6 +522,8 @@ xfs_iget_cache_hit( if (ip->i_flags & XFS_IRECLAIMABLE) { /* Drops i_flags_lock and RCU read lock. */ error = xfs_iget_recycle(pag, ip); + if (error == -EAGAIN) + goto out_skip; if (error) return error; } else { @@ -596,7 +602,7 @@ xfs_iget_cache_miss( */ if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { - VFS_I(ip)->i_generation = prandom_u32(); + VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 28493c8e9bb2..d354ea2b74f9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -835,9 +835,8 @@ xfs_init_new_inode( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). */ - if (irix_sgid_inherit && - (inode->i_mode & S_ISGID) && - !in_group_p(i_gid_into_mnt(mnt_userns, inode))) + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) inode->i_mode &= ~S_ISGID; ip->i_disk_size = 0; @@ -2480,7 +2479,7 @@ xfs_remove( error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, tp->t_mountp->m_sb.sb_rootino, 0); if (error) - return error; + goto out_trans_cancel; } } else { /* @@ -2819,7 +2818,7 @@ retry: * Lock all the participating inodes. 
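
/*
 * The icache change above recycles an inode only if the ILOCK can be taken
 * without blocking; otherwise it returns -EAGAIN and the whole lookup is
 * retried from scratch.  That trylock-or-retry shape, sketched with a
 * pthread mutex (build with -pthread); the names are illustrative.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

static int recycle(void)
{
        if (pthread_mutex_trylock(&ilock) != 0)
                return -EAGAIN;         /* contended: caller should retry */
        /* ... reinitialize the object under the lock ... */
        pthread_mutex_unlock(&ilock);
        return 0;
}

int main(void)
{
        int error;

        do {
                error = recycle();      /* loop back on -EAGAIN, as iget does */
        } while (error == -EAGAIN);
        printf("recycle: %d\n", error);
        return 0;
}
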
Depending upon whether * the target_name exists in the target directory, and * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. + * directory, we can lock from 2 to 5 inodes. */ xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); @@ -3119,7 +3118,7 @@ xfs_iflush( if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, + "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto flush_out; } @@ -3129,7 +3128,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad regular inode %Lu, ptr "PTR_FMT, + "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3140,7 +3139,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad directory inode %Lu, ptr "PTR_FMT, + "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3158,7 +3157,7 @@ xfs_iflush( if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, + "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); goto flush_out; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6e19ece916bf..ca2941ab6cbc 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -550,7 +550,7 @@ xfs_inode_item_push( if (!bp || (ip->i_flags & XFS_ISTALE)) { /* - * Inode item/buffer is being being aborted due to cluster + * Inode item/buffer is being aborted due to cluster * buffer deletion. Trigger a log force to have that operation * completed and items removed from the AIL before the next push * attempt. diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index d28ffaebd067..0e5dba2343ea 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -321,7 +321,7 @@ xlog_recover_inode_commit_pass2( */ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, - "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld", __func__, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -329,7 +329,7 @@ xlog_recover_inode_commit_pass2( ldip = item->ri_buf[1].i_addr; if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", __func__, item, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f783e979629..13f1b2add390 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags( if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; - /* Don't allow us to set DAX mode for a reflinked file for now. */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) - return -EINVAL; - /* diflags2 only valid for v3 inodes. 
*/ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); if (i_flags2 && !xfs_has_v3inodes(mp)) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..669c1bc5c3a7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -48,13 +48,53 @@ xfs_alert_fsblock_zero( return -EFSCORRUPTED; } +u64 +xfs_iomap_inode_sequence( + struct xfs_inode *ip, + u16 iomap_flags) +{ + u64 cookie = 0; + + if (iomap_flags & IOMAP_F_XATTR) + return READ_ONCE(ip->i_af.if_seq); + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; + return cookie | READ_ONCE(ip->i_df.if_seq); +} + +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. + */ +static bool +xfs_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (iomap->validity_cookie != + xfs_iomap_inode_sequence(ip, iomap->flags)) { + trace_xfs_iomap_invalid(ip, iomap); + return false; + } + + XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS); + return true; +} + +const struct iomap_page_ops xfs_iomap_page_ops = { + .iomap_valid = xfs_iomap_valid, +}; + int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags) + u16 iomap_flags, + u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -91,6 +131,9 @@ xfs_bmbt_to_iomap( if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; + iomap->page_ops = &xfs_iomap_page_ops; return 0; } @@ -195,7 +238,8 @@ xfs_iomap_write_direct( xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + u64 *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -285,6 +329,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: + *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -743,6 +788,7 @@ xfs_direct_write_iomap_begin( bool shared = false; u16 iomap_flags = 0; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -811,9 +857,10 @@ xfs_direct_write_iomap_begin( goto out_unlock; } + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); allocate_blocks: error = -EAGAIN; @@ -839,24 +886,26 @@ allocate_blocks: xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - flags, &imap); + flags, &imap, &seq); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - iomap_flags | IOMAP_F_NEW); + iomap_flags | IOMAP_F_NEW, seq); out_found_cow: - xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + seq = xfs_iomap_inode_sequence(ip, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; } - return xfs_bmbt_to_iomap(ip, iomap, 
&cmap, flags, IOMAP_F_SHARED); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); out_unlock: if (lockmode) @@ -915,6 +964,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -926,6 +976,10 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); + error = xfs_qm_dqattach(ip); + if (error) + return error; + error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; @@ -1029,10 +1083,6 @@ xfs_buffered_write_iomap_begin( allocfork = XFS_COW_FORK; } - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_unlock; - if (eof && offset + count > XFS_ISIZE(ip)) { /* * Determine the initial size of the preallocation. @@ -1094,26 +1144,31 @@ retry: * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1121,6 +1176,16 @@ out_unlock: } static int +xfs_buffered_write_delalloc_punch( + struct inode *inode, + loff_t offset, + loff_t length) +{ + return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, + offset + length); +} + +static int xfs_buffered_write_iomap_end( struct inode *inode, loff_t offset, @@ -1129,56 +1194,17 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error = 0; - - if (iomap->type != IOMAP_DELALLOC) - return 0; - - /* - * Behave as if the write failed if drop writes is enabled. Set the NEW - * flag to force delalloc cleanup. - */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { - iomap->flags |= IOMAP_F_NEW; - written = 0; - } - /* - * start_fsb refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. 
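The comment deleted above documents the punch-range arithmetic the old xfs_buffered_write_iomap_end() performed by hand; the replacement just below delegates it to iomap_file_buffered_write_punch_delalloc() plus a one-line xfs punch callback, but the rounding rule is unchanged: round down to the block containing offset when nothing was written, round up past the written bytes otherwise. A standalone demo of that rule, assuming a 4096-byte block size:

    #include <stdint.h>
    #include <stdio.h>

    #define BLKSZ 4096u

    /* mirror XFS_B_TO_FSBT (truncate) and XFS_B_TO_FSB (round up) */
    static uint64_t b_to_fsbt(uint64_t b) { return b / BLKSZ; }
    static uint64_t b_to_fsb(uint64_t b)  { return (b + BLKSZ - 1) / BLKSZ; }

    int main(void)
    {
            uint64_t offset = 5000, written = 0, length = 20000;
            uint64_t start_fsb = written ? b_to_fsb(offset + written)
                                         : b_to_fsbt(offset);
            uint64_t end_fsb = b_to_fsb(offset + length);

            /* delalloc blocks in [start_fsb, end_fsb) would be punched */
            printf("punch [%llu, %llu)\n",
                   (unsigned long long)start_fsb,
                   (unsigned long long)end_fsb);
            return 0;
    }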
- */ - if (unlikely(!written)) - start_fsb = XFS_B_TO_FSBT(mp, offset); - else - start_fsb = XFS_B_TO_FSB(mp, offset + written); - end_fsb = XFS_B_TO_FSB(mp, offset + length); + struct xfs_mount *mp = XFS_M(inode->i_sb); + int error; - /* - * Trim delalloc blocks if they were allocated by this write and we - * didn't manage to write the whole range. - * - * We don't need to care about racing delalloc as we hold i_mutex - * across the reserve/allocate/unreserve calls. If there are delalloc - * blocks in the range, they are ours. - */ - if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); - return error; - } + error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, + length, written, &xfs_buffered_write_delalloc_punch); + if (error && !xfs_is_shutdown(mp)) { + xfs_alert(mp, "%s: unable to clean up ino 0x%llx", + __func__, XFS_I(inode)->i_ino); + return error; } - return 0; } @@ -1187,6 +1213,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, }; +/* + * iomap_page_mkwrite() will never fail in a way that requires delalloc extents + * that it allocated to be revoked. Hence we do not need an .iomap_end method + * for this operation. + */ +const struct iomap_ops xfs_page_mkwrite_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, +}; + static int xfs_read_iomap_begin( struct inode *inode, @@ -1204,6 +1239,7 @@ xfs_read_iomap_begin( int nimaps = 1, error = 0; bool shared = false; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1215,15 +1251,16 @@ xfs_read_iomap_begin( return error; error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); - if (!error && (flags & IOMAP_REPORT)) + if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode))) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); xfs_iunlock(ip, lockmode); if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - shared ? IOMAP_F_SHARED : 0); + shared ? 
IOMAP_F_SHARED : 0, seq); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1248,6 +1285,7 @@ xfs_seek_iomap_begin( struct xfs_bmbt_irec imap, cmap; int error = 0; unsigned lockmode; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1282,8 +1320,9 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1304,8 +1343,9 @@ xfs_seek_iomap_begin( imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; done: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1331,6 +1371,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1347,12 +1388,14 @@ xfs_xattr_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: + + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); xfs_iunlock(ip, lockmode); if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); } const struct iomap_ops xfs_xattr_iomap_ops = { @@ -1370,7 +1413,7 @@ xfs_zero_range( if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, &xfs_buffered_write_iomap_ops); } @@ -1385,7 +1428,7 @@ xfs_truncate_page( if (IS_DAX(inode)) return dax_truncate_page(inode, pos, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, &xfs_buffered_write_iomap_ops); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c782e8c0479c..4da13440bae9 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,14 +13,15 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap); + struct xfs_bmbt_irec *imap, u64 *sequence); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); +u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags); + u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, bool *did_zero); @@ -47,6 +48,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 45518b8c613c..515318dfbc38 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -167,7 +167,7 @@ xfs_generic_create( 
struct dentry *dentry, umode_t mode, dev_t rdev, - bool tmpfile) /* unnamed file */ + struct file *tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -234,7 +234,7 @@ xfs_generic_create( * d_tmpfile can immediately set it back to zero. */ set_nlink(inode, 1); - d_tmpfile(dentry, inode); + d_tmpfile(tmpfile, inode); } else d_instantiate(dentry, inode); @@ -261,7 +261,7 @@ xfs_vn_mknod( umode_t mode, dev_t rdev) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL); } STATIC int @@ -272,7 +272,7 @@ xfs_vn_create( umode_t mode, bool flags) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL); } STATIC int @@ -283,7 +283,7 @@ xfs_vn_mkdir( umode_t mode) { return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, - false); + NULL); } STATIC struct dentry * @@ -558,6 +558,8 @@ xfs_vn_getattr( struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); trace_xfs_getattr(ip); @@ -568,8 +570,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; @@ -604,6 +606,16 @@ xfs_vn_getattr( stat->blksize = BLKDEV_IOSIZE; stat->rdev = inode->i_rdev; break; + case S_IFREG: + if (request_mask & STATX_DIOALIGN) { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } + fallthrough; default: stat->blksize = xfs_stat_blksize(ip); stat->rdev = 0; @@ -639,6 +651,7 @@ xfs_vn_change_ok( static int xfs_setattr_nonsize( struct user_namespace *mnt_userns, + struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) { @@ -745,7 +758,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); if (error) return error; } @@ -767,6 +780,7 @@ out_dqrele: STATIC int xfs_setattr_size( struct user_namespace *mnt_userns, + struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) { @@ -798,7 +812,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. 
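The STATX_DIOALIGN hunk above is the xfs side of a new statx extension: for regular files it reports the direct-I/O memory alignment (derived from bdev_dma_alignment()) and offset alignment (the logical block size). Userspace can probe it as below, assuming glibc 2.28+ for the statx() wrapper and 6.1+ uapi headers that define STATX_DIOALIGN; an unset stx_mask bit means the filesystem did not fill the fields in:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
            struct statx stx;

            if (argc < 2 ||
                statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) != 0)
                    return 1;
            if (stx.stx_mask & STATX_DIOALIGN)
                    printf("dio mem align %u, dio offset align %u\n",
                           stx.stx_dio_mem_align, stx.stx_dio_offset_align);
            return 0;
    }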
*/ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(mnt_userns, ip, iattr); + return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); } /* @@ -975,7 +989,7 @@ xfs_vn_setattr_size( error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (error) return error; - return xfs_setattr_size(mnt_userns, ip, iattr); + return xfs_setattr_size(mnt_userns, dentry, ip, iattr); } STATIC int @@ -1007,7 +1021,7 @@ xfs_vn_setattr( error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(mnt_userns, ip, iattr); + error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); } return error; @@ -1080,14 +1094,16 @@ STATIC int xfs_vn_tmpfile( struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, + struct file *file, umode_t mode) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true); + int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file); + + return finish_open_simple(file, err); } static const struct inode_operations xfs_inode_operations = { - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1114,7 +1130,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1141,7 +1157,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1171,10 +1187,6 @@ xfs_inode_supports_dax( if (!S_ISREG(VFS_I(ip)->i_mode)) return false; - /* Only supported on non-reflinked files. */ - if (xfs_is_reflink_inode(ip)) - return false; - /* Block size must match page size */ if (mp->m_sb.sb_blocksize != PAGE_SIZE) return false; diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index cb5fc68c9ea0..e570dcb5df8d 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -13,7 +13,6 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); int xfs_vn_setattr_size(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *vap); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 36312b00b164..a1c2bcf65d37 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,6 +66,8 @@ xfs_bulkstat_one_int( struct xfs_bulkstat *buf = bc->buf; xfs_extnum_t nextents; int error = -EINVAL; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (xfs_internal_inum(mp, ino)) goto out_advance; @@ -81,14 +83,16 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* xfs_iget returns the following without needing * further change. 
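The getattr hunk above and the bulkstat hunk completed just below move from i_uid_into_mnt(), which handed back a bare kuid_t, to the i_uid_into_vfsuid()/vfsuid_into_kuid() pair. The point of the vfsuid_t/vfsgid_t types is compile-time safety: an id that has been mapped through a mount's idmapping can no longer be confused with an unmapped one. A toy mock of the idea — the types and the fake mapping here are illustrative only, not the kernel's definitions:

    /* distinct single-member structs: the compiler rejects mixing them */
    typedef struct { unsigned int val; } mock_kuid_t;
    typedef struct { unsigned int val; } mock_vfsuid_t;

    static mock_vfsuid_t make_vfsuid(mock_kuid_t k, unsigned int off)
    {
            return (mock_vfsuid_t){ .val = k.val + off }; /* stand-in mapping */
    }

    static mock_kuid_t vfsuid_into_kuid(mock_vfsuid_t v)
    {
            return (mock_kuid_t){ .val = v.val };
    }

    int main(void)
    {
            mock_kuid_t k = { 1000 };
            mock_vfsuid_t v = make_vfsuid(k, 100000);
            return vfsuid_into_kuid(v).val == 101000 ? 0 : 1;
    }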
*/ buf->bs_projectid = ip->i_projid; buf->bs_ino = ino; - buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode)); - buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode)); + buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid)); + buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid)); buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 386b0307aed8..fc61cc024023 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -226,12 +226,12 @@ xlog_ticket_reservation( if (head == &log->l_write_head) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); return tic->t_unit_res; - } else { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - return tic->t_unit_res * tic->t_cnt; - else - return tic->t_unit_res; } + + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + + return tic->t_unit_res; } STATIC bool @@ -644,12 +644,14 @@ xfs_log_mount( int min_logfsbs; if (!xfs_has_norecovery(mp)) { - xfs_notice(mp, "Mounting V%d Filesystem", - XFS_SB_VERSION_NUM(&mp->m_sb)); + xfs_notice(mp, "Mounting V%d Filesystem %pU", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); } else { xfs_notice(mp, -"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", - XFS_SB_VERSION_NUM(&mp->m_sb)); +"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); ASSERT(xfs_is_readonly(mp)); } @@ -887,6 +889,23 @@ xlog_force_iclog( } /* + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ +static void +xlog_wait_iclog_completion(struct xlog *log) +{ + int i; + struct xlog_in_core *iclog = log->l_iclog; + + for (i = 0; i < log->l_iclog_bufs; i++) { + down(&iclog->ic_sema); + up(&iclog->ic_sema); + iclog = iclog->ic_next; + } +} + +/* * Wait for the iclog and all prior iclogs to be written disk as required by the * log force state machine. Waiting on ic_force_wait ensures iclog completions * have been ordered and callbacks run before we are woken here, hence @@ -1111,6 +1130,14 @@ xfs_log_unmount( { xfs_log_clean(mp); + /* + * If shutdown has come from iclog IO context, the log + * cleaning will have been skipped and so we need to wait + * for the iclog to complete shutdown processing before we + * tear anything down. + */ + xlog_wait_iclog_completion(mp->m_log); + xfs_buftarg_drain(mp->m_ddev_targp); xfs_trans_ail_destroy(mp); @@ -2114,17 +2141,6 @@ xlog_dealloc_log( int i; /* - * Cycle all the iclogbuf locks to make sure all log IO completion - * is done before we tear down these buffers. - */ - iclog = log->l_iclog; - for (i = 0; i < log->l_iclog_bufs; i++) { - down(&iclog->ic_sema); - up(&iclog->ic_sema); - iclog = iclog->ic_next; - } - - /* * Destroy the CIL after waiting for iclog IO completion because an * iclog EIO error will try to shut down the log, which accesses the * CIL to wake up the waiters. 
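xlog_wait_iclog_completion(), added above and now called from xfs_log_unmount() so that a shutdown raised from iclog IO context is fully drained before teardown, uses a classic idiom: acquire and immediately release each buffer's semaphore. The acquire cannot succeed until whoever holds the semaphore for in-flight IO has dropped it, so one pass over all buffers proves every completion has run. A userspace sketch of the same idiom (compile with -pthread):

    #include <semaphore.h>

    #define NBUFS 8
    static sem_t buf_sema[NBUFS]; /* held while that buffer's IO is in flight */

    static void wait_all_io_done(void)
    {
            for (int i = 0; i < NBUFS; i++) {
                    sem_wait(&buf_sema[i]); /* blocks until the IO path posts */
                    sem_post(&buf_sema[i]); /* we only wanted to drain it */
            }
    }

    int main(void)
    {
            for (int i = 0; i < NBUFS; i++)
                    sem_init(&buf_sema[i], 0, 1);
            wait_all_io_done();
            return 0;
    }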
@@ -3544,7 +3560,7 @@ xlog_ticket_alloc( tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = prandom_u32(); + tic->t_tid = get_random_u32(); if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 17e923b9c5fa..322eb2ee6c55 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2552,6 +2552,8 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { + const struct xfs_item_ops *ops; + if (!xlog_item_is_intent(lip)) break; @@ -2567,13 +2569,17 @@ xlog_recover_process_intents( * deferred ops, you /must/ attach them to the capture list in * the recover routine or else those subsequent intents will be * replayed in the wrong order! + * + * The recovery function can free the log item, so we must not + * access lip after it returns. */ spin_unlock(&ailp->ail_lock); - error = lip->li_ops->iop_recover(lip, &capture_list); + ops = lip->li_ops; + error = ops->iop_recover(lip, &capture_list); spin_lock(&ailp->ail_lock); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, - lip->li_ops->iop_recover); + ops->iop_recover); break; } } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f10c88cee116..fb87ffb48f7f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -300,26 +300,28 @@ xfs_validate_new_dalign( "alignment check failed: sunit/swidth vs. blocksize(%d)", mp->m_sb.sb_blocksize); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - mp->m_sb.sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, mp->m_sb.sb_blocksize); - return -EINVAL; - } } + /* + * Convert the stripe unit and width to FSBs. + */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); + return -EINVAL; + } + + if (!mp->m_dalign) { + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); + return -EINVAL; + } + + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); @@ -536,6 +538,20 @@ xfs_check_summary_counts( return 0; } +static void +xfs_unmount_check( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return; + + if (percpu_counter_sum(&mp->m_ifree) > + percpu_counter_sum(&mp->m_icount)) { + xfs_alert(mp, "ifree/icount mismatch at unmount"); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); + } +} + /* * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, @@ -1075,6 +1091,7 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to free reserved block pool. 
" "Freespace may not be correct on next mount."); + xfs_unmount_check(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 69d9c83ea4b2..c4078d0ec108 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -23,17 +23,18 @@ #include <linux/mm.h> #include <linux/dax.h> -struct failure_info { +struct xfs_failure_info { xfs_agblock_t startblock; xfs_extlen_t blockcount; int mf_flags; + bool want_shutdown; }; static pgoff_t xfs_failure_pgoff( struct xfs_mount *mp, const struct xfs_rmap_irec *rec, - const struct failure_info *notify) + const struct xfs_failure_info *notify) { loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); @@ -47,7 +48,7 @@ static unsigned long xfs_failure_pgcnt( struct xfs_mount *mp, const struct xfs_rmap_irec *rec, - const struct failure_info *notify) + const struct xfs_failure_info *notify) { xfs_agblock_t end_rec; xfs_agblock_t end_notify; @@ -71,13 +72,13 @@ xfs_dax_failure_fn( { struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; - struct failure_info *notify = data; + struct xfs_failure_info *notify = data; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); - return -EFSCORRUPTED; + notify->want_shutdown = true; + return 0; } /* Get files that incore, filter out others that are not in use. */ @@ -86,8 +87,10 @@ xfs_dax_failure_fn( /* Continue the rmap query if the inode isn't incore */ if (error == -ENODATA) return 0; - if (error) - return error; + if (error) { + notify->want_shutdown = true; + return 0; + } error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, xfs_failure_pgoff(mp, rec, notify), @@ -104,6 +107,7 @@ xfs_dax_notify_ddev_failure( xfs_daddr_t bblen, int mf_flags) { + struct xfs_failure_info notify = { .mf_flags = mf_flags }; struct xfs_trans *tp = NULL; struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; @@ -120,7 +124,6 @@ xfs_dax_notify_ddev_failure( for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; - struct failure_info notify; struct xfs_agf *agf; xfs_agblock_t agend; struct xfs_perag *pag; @@ -161,6 +164,11 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); + if (error || notify.want_shutdown) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + if (!error) + error = -EFSCORRUPTED; + } return error; } @@ -175,13 +183,13 @@ xfs_dax_notify_failure( u64 ddev_start; u64 ddev_end; - if (!(mp->m_sb.sb_flags & SB_BORN)) { + if (!(mp->m_super->s_flags & SB_BORN)) { xfs_warn(mp, "filesystem is not ready for notify_failure()!"); return -EIO; } if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { - xfs_warn(mp, + xfs_debug(mp, "notify_failure() not supported on realtime device!"); return -EOPNOTSUPP; } @@ -194,7 +202,7 @@ xfs_dax_notify_failure( } if (!xfs_has_rmapbt(mp)) { - xfs_warn(mp, "notify_failure() needs rmapbt enabled!"); + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); return -EOPNOTSUPP; } diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 758702b9495f..9737b5a9f405 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -118,10 +118,10 @@ xfs_check_ondisk_structs(void) /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); - 
XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); @@ -134,6 +134,21 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40); XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + + XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); /* * The v5 superblock format extended several v4 header structures with diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 37a24f0f7cd4..38d23f0e703a 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -125,6 +125,7 @@ xfs_fs_map_blocks( int nimaps = 1; uint lock_flags; int error = 0; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -176,6 +177,7 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); + seq = xfs_iomap_inode_sequence(ip, 0); ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); @@ -189,7 +191,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, 0, &imap); + end_fsb - offset_fsb, 0, &imap, &seq); if (error) goto out_unlock; @@ -209,7 +211,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 18bb4ec4d7c9..ff53d40a2dae 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -423,6 +423,14 @@ xfs_qm_dquot_isolate( goto out_miss_busy; /* + * If something else is freeing this dquot and hasn't yet removed it + * from the LRU, leave it for the freeing task to complete the freeing + * process rather than risk it being free from under us here. + */ + if (dqp->q_flags & XFS_DQFLAG_FREEING) + goto out_miss_unlock; + + /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. */ @@ -441,10 +449,8 @@ xfs_qm_dquot_isolate( * skip it so there is time for the IO to complete before we try to * reclaim it again on the next LRU pass. 
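The xfs_qm_dquot_isolate() change above makes the shrinker back off from any dquot already flagged XFS_DQFLAG_FREEING, leaving the free to its owner instead of racing it, and the hunk continuing below funnels both failed-trylock paths through a single out_miss_unlock label. The pattern in miniature, as a hedged pthread mock rather than the real list_lru API:

    #include <pthread.h>

    enum lru_status { LRU_REMOVED, LRU_SKIP };

    struct dq_mock {
            pthread_mutex_t lock;
            unsigned int flags;
    #define DQ_FREEING 0x1
    };

    static enum lru_status isolate(struct dq_mock *dq)
    {
            if (pthread_mutex_trylock(&dq->lock) != 0)
                    return LRU_SKIP;        /* contended: retry next pass */
            if (dq->flags & DQ_FREEING) {
                    pthread_mutex_unlock(&dq->lock);
                    return LRU_SKIP;        /* the flagging task frees it */
            }
            /* ... dispose of dq and remove it from the LRU ... */
            pthread_mutex_unlock(&dq->lock);
            return LRU_REMOVED;
    }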
*/ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - goto out_miss_busy; - } + if (!xfs_dqflock_nowait(dqp)) + goto out_miss_unlock; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -478,6 +484,8 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; +out_miss_unlock: + xfs_dqunlock(dqp); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7e97bf19793d..858e3e9eb4a8 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -523,7 +523,9 @@ xfs_cui_item_recover( type = refc_type; break; default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); error = -EFSCORRUPTED; goto abort_error; } @@ -536,7 +538,8 @@ xfs_cui_item_recover( &new_fsb, &new_len, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - refc, sizeof(*refc)); + &cuip->cui_format, + sizeof(cuip->cui_format)); if (error) goto abort_error; @@ -622,28 +625,18 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_relog = xfs_cui_item_relog, }; -/* - * Copy an CUI format buffer from the given buf, and into the destination - * CUI format structure. The CUI/CUD items were designed not to need any - * special alignment handling. - */ -static int +static inline void xfs_cui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_cui_log_format *dst_cui_fmt) + struct xfs_cui_log_format *dst, + const struct xfs_cui_log_format *src) { - struct xfs_cui_log_format *src_cui_fmt; - uint len; + unsigned int i; - src_cui_fmt = buf->i_addr; - len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents)); - if (buf->i_len == len) { - memcpy(dst_cui_fmt, src_cui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->cui_nextents; i++) + memcpy(&dst->cui_extents[i], &src->cui_extents[i], + sizeof(struct xfs_phys_extent)); } /* @@ -660,19 +653,28 @@ xlog_recover_cui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_cui_log_item *cuip; struct xfs_cui_log_format *cui_formatp; + size_t len; cui_formatp = item->ri_buf[0].i_addr; - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); - error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); - if (error) { - xfs_cui_item_free(cuip); - return error; + if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -706,7 +708,8 @@ xlog_recover_cud_commit_pass2( cud_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, 
XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 251f20ddd368..fe46bce8cae6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -200,7 +200,9 @@ xfs_reflink_trim_around_shared( if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; - } else if (fbno == agbno) { + } + + if (fbno == agbno) { /* * The start of this extent is shared. Truncate the * mapping at the end of the shared region so that a @@ -210,16 +212,16 @@ xfs_reflink_trim_around_shared( irec->br_blockcount = flen; *shared = true; return 0; - } else { - /* - * There's a shared extent midway through this extent. - * Truncate the mapping at the start of the shared - * extent so that a subsequent iteration starts at the - * start of the shared region. - */ - irec->br_blockcount = fbno - agbno; - return 0; } + + /* + * There's a shared extent midway through this extent. + * Truncate the mapping at the start of the shared + * extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = fbno - agbno; + return 0; } int @@ -1691,8 +1693,12 @@ xfs_reflink_unshare( inode_dio_wait(inode); - error = iomap_file_unshare(inode, offset, len, - &xfs_buffered_write_iomap_ops); + if (IS_DAX(inode)) + error = dax_file_unshare(inode, offset, len, + &xfs_dax_write_iomap_ops); + else + error = iomap_file_unshare(inode, offset, len, + &xfs_buffered_write_iomap_ops); if (error) goto out; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index fef92e02f3bb..534504ede1a3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -155,31 +155,6 @@ xfs_rui_init( return ruip; } -/* - * Copy an RUI format buffer from the given buf, and into the destination - * RUI format structure. The RUI/RUD items were designed not to need any - * special alignment handling. - */ -STATIC int -xfs_rui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_rui_log_format *dst_rui_fmt) -{ - struct xfs_rui_log_format *src_rui_fmt; - uint len; - - src_rui_fmt = buf->i_addr; - len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); - - if (buf->i_len != len) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; - } - - memcpy(dst_rui_fmt, src_rui_fmt, len); - return 0; -} - static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); @@ -582,7 +557,9 @@ xfs_rui_item_recover( type = XFS_RMAP_FREE; break; default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); error = -EFSCORRUPTED; goto abort_error; } @@ -652,6 +629,20 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_relog = xfs_rui_item_relog, }; +static inline void +xfs_rui_copy_format( + struct xfs_rui_log_format *dst, + const struct xfs_rui_log_format *src) +{ + unsigned int i; + + memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents)); + + for (i = 0; i < src->rui_nextents; i++) + memcpy(&dst->rui_extents[i], &src->rui_extents[i], + sizeof(struct xfs_map_extent)); +} + /* * This routine is called to create an in-core extent rmap update * item from the rui format structure which was logged on disk. 
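The CUI recovery rewrite above — and the matching RUI rewrite just below — replaces a trusting copy of a log-supplied length with two explicit checks: the recovered buffer must be at least as large as a zero-extent format header before nextents is even read, and must then match exactly the size that extent count implies; anything else is reported with XFS_CORRUPTION_ERROR and fails with -EFSCORRUPTED before any allocation is sized from it. A compact sketch of the double check, using simplified stand-ins for the log-format structures:

    #include <stddef.h>
    #include <string.h>

    struct phys_extent { unsigned long long start; unsigned int len; };

    struct intent_fmt {
            unsigned int nextents;
            struct phys_extent extents[]; /* variable-size tail */
    };

    #define intent_fmt_sizeof(n) \
            (offsetof(struct intent_fmt, extents) + \
             (n) * sizeof(struct phys_extent))

    int recover_intent(const void *buf, size_t buf_len, struct intent_fmt *dst)
    {
            const struct intent_fmt *src = buf;

            if (buf_len < intent_fmt_sizeof(0))
                    return -1;      /* too short to hold even the header */
            if (buf_len != intent_fmt_sizeof(src->nextents))
                    return -1;      /* length and extent count disagree */
            memcpy(dst, src, intent_fmt_sizeof(src->nextents));
            return 0;
    }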
@@ -666,19 +657,28 @@ xlog_recover_rui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_rui_log_item *ruip; struct xfs_rui_log_format *rui_formatp; + size_t len; rui_formatp = item->ri_buf[0].i_addr; - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); - error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); - if (error) { - xfs_rui_item_free(ruip); - return error; + if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -711,7 +711,11 @@ xlog_recover_rud_commit_pass2( struct xfs_rud_log_format *rud_formatp; rud_formatp = item->ri_buf[0].i_addr; - ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + rud_formatp, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id); return 0; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 292d5e54a92c..16534e9873f6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1311,10 +1311,10 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, &val); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); if (error) return error; @@ -1326,6 +1326,41 @@ xfs_rtalloc_reinit_frextents( } /* + * Read in the bmbt of an rt metadata inode so that we never have to load them + * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use + * an empty transaction to avoid deadlocking on loops in the bmbt. + */ +static inline int +xfs_rtmount_iread_extents( + struct xfs_inode *ip, + unsigned int lock_class) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class); + + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + if (xfs_inode_has_attr_fork(ip)) { + error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); + if (error) + goto out_unlock; + } + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class); + xfs_trans_cancel(tp); + return error; +} + +/* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. 
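xfs_rtmount_inodes() just below gains a proper unwind ladder to go with the new xfs_rtmount_iread_extents() calls: each successfully acquired inode gets a release label, and a later failure jumps to the label that undoes exactly what has been taken so far, in reverse order. The shape of the pattern, with hypothetical stand-in helpers in place of xfs_iget()/xfs_irele():

    static int grab_bitmap(void) { return 0; }          /* stand-ins */
    static int grab_summary(void) { return 0; }
    static int read_summary_extents(void) { return 0; }
    static void release_summary(void) { }
    static void release_bitmap(void) { }

    static int mount_rt_inodes(void)
    {
            int error;

            error = grab_bitmap();
            if (error)
                    return error;           /* nothing to undo yet */
            error = grab_summary();
            if (error)
                    goto out_rele_bitmap;
            error = read_summary_extents();
            if (error)
                    goto out_rele_summary;
            return 0;

    out_rele_summary:
            release_summary();
    out_rele_bitmap:
            release_bitmap();
            return error;
    }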
*/ @@ -1342,14 +1377,27 @@ xfs_rtmount_inodes( return error; ASSERT(mp->m_rbmip != NULL); + error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP); + if (error) + goto out_rele_bitmap; + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); - if (error) { - xfs_irele(mp->m_rbmip); - return error; - } + if (error) + goto out_rele_bitmap; ASSERT(mp->m_rsumip != NULL); + + error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM); + if (error) + goto out_rele_summary; + xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); return 0; + +out_rele_summary: + xfs_irele(mp->m_rsumip); +out_rele_bitmap: + xfs_irele(mp->m_rbmip); + return error; } void diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 20e0534a772c..90a77cd3ebad 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -74,7 +74,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } - len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", defer_relog); @@ -125,7 +125,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) { int j; - seq_printf(m, "qm"); + seq_puts(m, "qm"); for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9ac59814bbb6..0c4b73e9b29d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -653,7 +653,7 @@ xfs_fs_destroy_inode( static void xfs_fs_dirty_inode( struct inode *inode, - int flag) + int flags) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -661,7 +661,13 @@ xfs_fs_dirty_inode( if (!(inode->i_sb->s_flags & SB_LAZYTIME)) return; - if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME)) + + /* + * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC) + * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed + * in flags possibly together with I_DIRTY_SYNC. + */ + if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME)) return; if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp)) @@ -1104,7 +1110,7 @@ xfs_fs_put_super( if (!sb->s_fs_info) return; - xfs_notice(mp, "Unmounting Filesystem"); + xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -2022,18 +2028,14 @@ xfs_init_caches(void) goto out_destroy_trans_cache; xfs_efd_cache = kmem_cache_create("xfs_efd_item", - (sizeof(struct xfs_efd_log_item) + - (XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_extent)), - 0, 0, NULL); + xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS), + 0, 0, NULL); if (!xfs_efd_cache) goto out_destroy_buf_item_cache; xfs_efi_cache = kmem_cache_create("xfs_efi_item", - (sizeof(struct xfs_efi_log_item) + - (XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_extent)), - 0, 0, NULL); + xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS), + 0, 0, NULL); if (!xfs_efi_cache) goto out_destroy_efd_cache; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 43585850f154..513095e353a5 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -33,10 +33,15 @@ xfs_sysfs_init( const char *name) { struct kobject *parent; + int err; parent = parent_kobj ? 
&parent_kobj->kobject : NULL; init_completion(&kobj->complete); - return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + if (err) + kobject_put(&kobj->kobject); + + return err; } static inline void diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index d269ef57ff01..8a5dc1538aa8 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -34,6 +34,8 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_error.h" +#include <linux/iomap.h> +#include "xfs_iomap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f9057af6e0c8..421d1e504ac4 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -799,6 +799,9 @@ TRACE_DEFINE_ENUM(PE_SIZE_PTE); TRACE_DEFINE_ENUM(PE_SIZE_PMD); TRACE_DEFINE_ENUM(PE_SIZE_PUD); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), @@ -1170,7 +1173,7 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class, __entry->ino_res_used = qtrx->qt_ino_res_used; __entry->icount_delta = qtrx->qt_icount_delta; ), - TP_printk("dev %d:%d dquot id 0x%x type %s flags %s" + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s " "blk_res %llu bcount_delta %lld delbcnt_delta %lld " "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " "ino_res %llu ino_res_used %llu icount_delta %lld", @@ -1602,7 +1605,7 @@ TRACE_EVENT(xfs_bunmap, __entry->caller_ip = caller_ip; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx" + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx " "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -2925,6 +2928,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) @@ -2932,13 +2936,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) @@ -2958,6 +2964,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) @@ -2966,14 +2973,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d agno 0x%x dom %s 
agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount, @@ -2994,9 +3003,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) @@ -3004,20 +3015,24 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount) @@ -3038,9 +3053,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) @@ -3049,21 +3066,25 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, @@ -3086,12 +3107,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) 
__field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) + __field(enum xfs_refc_domain, i3_domain) __field(xfs_agblock_t, i3_startblock) __field(xfs_extlen_t, i3_blockcount) __field(xfs_nlink_t, i3_refcount) @@ -3099,27 +3123,33 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; + __entry->i3_domain = i3->rc_domain; __entry->i3_startblock = i3->rc_startblock; __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, + __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i3_startblock, __entry->i3_blockcount, __entry->i3_refcount) @@ -3322,6 +3352,92 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ TP_ARGS(ip, irec)) +/* inode iomap invalidation events */ +DECLARE_EVENT_CLASS(xfs_wb_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), + TP_ARGS(ip, iomap, wpcseq, whichfork), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u16, type) + __field(u16, flags) + __field(u32, wpcseq) + __field(u32, forkseq) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->wpcseq = wpcseq; + __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->wpcseq, + __entry->forkseq) +); +#define DEFINE_WB_INVALID_EVENT(name) \ +DEFINE_EVENT(xfs_wb_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \ + TP_ARGS(ip, iomap, wpcseq, whichfork)) +DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); +DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); + +DECLARE_EVENT_CLASS(xfs_iomap_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), + TP_ARGS(ip, iomap), + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u64, validity_cookie) + __field(u64, inodeseq) + __field(u16, type) + __field(u16, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->validity_cookie = iomap->validity_cookie; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->validity_cookie, + __entry->inodeseq) +); +#define DEFINE_IOMAP_INVALID_EVENT(name) \ +DEFINE_EVENT(xfs_iomap_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \ + TP_ARGS(ip, iomap)) +DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid); + /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d3a97a028560..7d4109af193e 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -422,7 +422,7 @@ xfsaild_push( struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; - xfs_lsn_t target; + xfs_lsn_t target = NULLCOMMITLSN; long tout; int stuck = 0; int flushing = 0; @@ -472,6 +472,8 @@ xfsaild_push( XFS_STATS_INC(mp, xs_push_ail); + ASSERT(target != NULLCOMMITLSN); + lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; @@ -602,9 +604,9 @@ xfsaild( while (1) { if (tout && tout <= 20) - set_current_state(TASK_KILLABLE); + set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); /* * Check kthread_should_stop() after we set the task state to @@ -653,14 +655,14 @@ xfsaild( ailp->ail_target == ailp->ail_target_prev && list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); - freezable_schedule(); + schedule(); tout = 0; continue; } spin_unlock(&ailp->ail_lock); if (tout) - freezable_schedule_timeout(msecs_to_jiffies(tout)); + schedule_timeout(msecs_to_jiffies(tout)); __set_current_state(TASK_RUNNING); @@ -730,11 +732,10 @@ void xfs_ail_push_all_sync( struct xfs_ail *ailp) { - struct xfs_log_item *lip; DEFINE_WAIT(wait); spin_lock(&ailp->ail_lock); - while ((lip = xfs_ail_max(ailp)) != NULL) { + while (xfs_ail_max(ailp) != NULL) { prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); wake_up_process(ailp->ail_task); spin_unlock(&ailp->ail_lock); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index c325a28b89a8..10aa1fd39d2b 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -210,7 +210,7 @@ __xfs_xattr_put_listent( return; } offset = context->buffer + context->count; - strncpy(offset, prefix, prefix_len); + memcpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 860f0b1032c6..2c53fbb8d918 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -41,6 +41,13 @@ static void zonefs_account_active(struct inode *inode) return; /* + * For zones that transitioned to the offline or readonly condition, + * we 
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 860f0b1032c6..2c53fbb8d918 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -41,6 +41,13 @@ static void zonefs_account_active(struct inode *inode)
 		return;
 
 	/*
+	 * For zones that transitioned to the offline or readonly condition,
+	 * we only need to clear the active state.
+	 */
+	if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY))
+		goto out;
+
+	/*
 	 * If the zone is active, that is, if it is explicitly open or
 	 * partially written, check if it was already accounted as active.
 	 */
@@ -53,6 +60,7 @@ static void zonefs_account_active(struct inode *inode)
 		return;
 	}
 
+out:
 	/* The zone is not active. If it was, update the active count */
 	if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
 		zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
@@ -324,6 +332,7 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
 		inode->i_flags |= S_IMMUTABLE;
 		inode->i_mode &= ~0777;
 		zone->wp = zone->start;
+		zi->i_flags |= ZONEFS_ZONE_OFFLINE;
 		return 0;
 	case BLK_ZONE_COND_READONLY:
 		/*
@@ -342,8 +351,10 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
 			zone->cond = BLK_ZONE_COND_OFFLINE;
 			inode->i_mode &= ~0777;
 			zone->wp = zone->start;
+			zi->i_flags |= ZONEFS_ZONE_OFFLINE;
 			return 0;
 		}
+		zi->i_flags |= ZONEFS_ZONE_READONLY;
 		inode->i_mode &= ~0222;
 		return i_size_read(inode);
 	case BLK_ZONE_COND_FULL:
@@ -478,8 +489,7 @@ static void __zonefs_io_error(struct inode *inode, bool write)
 	struct super_block *sb = inode->i_sb;
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 	unsigned int noio_flag;
-	unsigned int nr_zones =
-		zi->i_zone_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+	unsigned int nr_zones = 1;
 	struct zonefs_ioerr_data err = {
 		.inode = inode,
 		.write = write,
@@ -487,6 +497,15 @@ static void __zonefs_io_error(struct inode *inode, bool write)
 	int ret;
 
 	/*
+	 * The only files that have more than one zone are conventional zone
+	 * files with aggregated conventional zones, for which the inode zone
+	 * size is always larger than the device zone size.
+	 */
+	if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev))
+		nr_zones = zi->i_zone_size >>
+			(sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+
+	/*
 	 * Memory allocations in blkdev_report_zones() can trigger a memory
 	 * reclaim which may in turn cause a recursion into zonefs as well as
 	 * struct request allocations for the same device. The former case may
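The __zonefs_io_error() hunk above stops computing a multi-zone report size for every inode and defaults to reporting a single zone, widening the report only for aggregated conventional zone files, whose inode zone size spans several device zones. A rough userspace sketch of the restored arithmetic follows; the device geometry (zone_sectors_shift, the 256 MiB zone size, the 4-zone aggregation) is made up for illustration and not taken from the diff.

/*
 * Sketch: how many device zones should a zone report cover?
 * Exactly one, unless the file aggregates several conventional zones.
 */
#include <stdio.h>

#define SECTOR_SHIFT 9	/* 512-byte sectors, as in the kernel */

int main(void)
{
	unsigned long long dev_zone_sectors = 524288;	/* 2^19 sectors = 256 MiB */
	unsigned int zone_sectors_shift = 19;		/* log2(524288) */
	/* An aggregated conventional file spanning 4 device zones: */
	unsigned long long i_zone_size =
		4ULL * (dev_zone_sectors << SECTOR_SHIFT);
	unsigned int nr_zones = 1;

	if (i_zone_size > (dev_zone_sectors << SECTOR_SHIFT))
		nr_zones = i_zone_size >> (zone_sectors_shift + SECTOR_SHIFT);

	printf("report %u zone(s)\n", nr_zones);	/* prints 4 */
	return 0;
}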
@@ -1407,6 +1426,14 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
 	zi->i_ztype = type;
 	zi->i_zsector = zone->start;
 	zi->i_zone_size = zone->len << SECTOR_SHIFT;
+	if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
+	    !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
+		zonefs_err(sb,
+			   "zone size %llu doesn't match device's zone sectors %llu\n",
+			   zi->i_zone_size,
+			   bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
+		return -EINVAL;
+	}
 
 	zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
 			       zone->capacity << SECTOR_SHIFT);
@@ -1456,11 +1483,11 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
 	struct inode *dir = d_inode(parent);
 	struct dentry *dentry;
 	struct inode *inode;
-	int ret;
+	int ret = -ENOMEM;
 
 	dentry = d_alloc_name(parent, name);
 	if (!dentry)
-		return NULL;
+		return ERR_PTR(ret);
 
 	inode = new_inode(parent->d_sb);
 	if (!inode)
@@ -1485,7 +1512,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
 dput:
 	dput(dentry);
 
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 struct zonefs_zone_data {
@@ -1505,7 +1532,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
 	struct blk_zone *zone, *next, *end;
 	const char *zgroup_name;
 	char *file_name;
-	struct dentry *dir;
+	struct dentry *dir, *dent;
 	unsigned int n = 0;
 	int ret;
 
@@ -1523,8 +1550,8 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
 		zgroup_name = "seq";
 
 	dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
-	if (!dir) {
-		ret = -ENOMEM;
+	if (IS_ERR(dir)) {
+		ret = PTR_ERR(dir);
 		goto free;
 	}
 
@@ -1570,8 +1597,9 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
 		 * Use the file number within its group as file name.
 		 */
		snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
-		if (!zonefs_create_inode(dir, file_name, zone, type)) {
-			ret = -ENOMEM;
+		dent = zonefs_create_inode(dir, file_name, zone, type);
+		if (IS_ERR(dent)) {
+			ret = PTR_ERR(dent);
 			goto free;
 		}
 
@@ -1905,18 +1933,18 @@ static int __init zonefs_init(void)
 	if (ret)
 		return ret;
 
-	ret = register_filesystem(&zonefs_type);
+	ret = zonefs_sysfs_init();
 	if (ret)
 		goto destroy_inodecache;
 
-	ret = zonefs_sysfs_init();
+	ret = register_filesystem(&zonefs_type);
 	if (ret)
-		goto unregister_fs;
+		goto sysfs_exit;
 
 	return 0;
 
-unregister_fs:
-	unregister_filesystem(&zonefs_type);
+sysfs_exit:
+	zonefs_sysfs_exit();
 destroy_inodecache:
 	zonefs_destroy_inodecache();
 
@@ -1925,9 +1953,9 @@ destroy_inodecache:
 
 static void __exit zonefs_exit(void)
 {
+	unregister_filesystem(&zonefs_type);
 	zonefs_sysfs_exit();
 	zonefs_destroy_inodecache();
-	unregister_filesystem(&zonefs_type);
 }
 
 MODULE_AUTHOR("Damien Le Moal");
diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c
index 9cb6755ce39a..9920689dc098 100644
--- a/fs/zonefs/sysfs.c
+++ b/fs/zonefs/sysfs.c
@@ -15,11 +15,6 @@ struct zonefs_sysfs_attr {
 	ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf);
 };
 
-static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr)
-{
-	return container_of(attr, struct zonefs_sysfs_attr, attr);
-}
-
 #define ZONEFS_SYSFS_ATTR_RO(name) \
 static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name)
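Several of the zonefs hunks above convert zonefs_create_inode() from returning NULL on failure to returning ERR_PTR()-encoded errors, so zonefs_create_zgroup() can propagate the real errno instead of assuming -ENOMEM. The userspace sketch below shows that convention; the helpers mirror the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() in simplified form, and create_object() is a hypothetical stand-in for an allocator that can fail for more than one reason.

/*
 * Sketch: encode an errno in a pointer return value instead of NULL,
 * so callers learn why the call failed, not just that it failed.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static inline void *ERR_PTR(long err) { return (void *)(intptr_t)err; }
static inline long PTR_ERR(const void *p) { return (long)(intptr_t)p; }
static inline int IS_ERR(const void *p)
{
	/* Top 4095 addresses are reserved for error codes, as in the kernel. */
	return (uintptr_t)p >= (uintptr_t)-4095;
}

static void *create_object(int want_invalid)
{
	if (want_invalid)
		return ERR_PTR(-EINVAL);	/* precise error, not NULL */
	void *obj = malloc(16);
	if (!obj)
		return ERR_PTR(-ENOMEM);
	return obj;
}

int main(void)
{
	void *obj = create_object(1);

	if (IS_ERR(obj)) {
		printf("create failed: %ld\n", PTR_ERR(obj));	/* -22 */
		return 1;
	}
	free(obj);
	return 0;
}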
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 4b3de66c3233..1dbe78119ff1 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -39,8 +39,10 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
 	return ZONEFS_ZTYPE_SEQ;
 }
 
-#define ZONEFS_ZONE_OPEN	(1 << 0)
-#define ZONEFS_ZONE_ACTIVE	(1 << 1)
+#define ZONEFS_ZONE_OPEN	(1U << 0)
+#define ZONEFS_ZONE_ACTIVE	(1U << 1)
+#define ZONEFS_ZONE_OFFLINE	(1U << 2)
+#define ZONEFS_ZONE_READONLY	(1U << 3)
 
 /*
  * In-memory inode data.
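The zonefs.h hunk above also widens the existing flag definitions to unsigned (1U << n) while adding the two new state bits. A small sketch of why that matters and how the new bits are consumed; the flag names are shortened here, and the test mirrors the shape of the check added to zonefs_account_active() earlier in this diff.

/*
 * Sketch: unsigned flag bits keep masks well defined (a future bit 31
 * would not shift into the sign bit), and a combined mask lets one test
 * cover both terminal zone conditions.
 */
#include <stdio.h>

#define ZONE_OPEN	(1U << 0)
#define ZONE_ACTIVE	(1U << 1)
#define ZONE_OFFLINE	(1U << 2)
#define ZONE_READONLY	(1U << 3)

int main(void)
{
	unsigned int flags = ZONE_ACTIVE | ZONE_READONLY;

	/* Same shape as the zonefs_account_active() check. */
	if (flags & (ZONE_OFFLINE | ZONE_READONLY))
		flags &= ~ZONE_ACTIVE;	/* only clear the active state */

	printf("flags: 0x%x\n", flags);	/* 0x8 */
	return 0;
}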