Diffstat (limited to 'fs')
201 files changed, 5003 insertions, 2499 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 399d455d50d6..d0c77ec31b1d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -768,22 +768,18 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, struct v9fs_inode __maybe_unused *v9inode; struct v9fs_session_info *v9ses; struct p9_fid *fid; - struct dentry *res = NULL; struct inode *inode; int p9_omode; if (d_in_lookup(dentry)) { - res = v9fs_vfs_lookup(dir, dentry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - if (res) - dentry = res; + struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0); + if (res || d_really_is_positive(dentry)) + return finish_no_open(file, res); } /* Only creates */ - if (!(flags & O_CREAT) || d_really_is_positive(dentry)) - return finish_no_open(file, res); + if (!(flags & O_CREAT)) + return finish_no_open(file, NULL); v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); @@ -795,17 +791,17 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, "write-only file with writeback enabled, creating w/ O_RDWR\n"); } fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - goto error; - } + if (IS_ERR(fid)) + return PTR_ERR(fid); v9fs_invalidate_inode_attr(dir); inode = d_inode(dentry); v9inode = V9FS_I(inode); err = finish_open(file, dentry, generic_file_open); - if (err) - goto error; + if (unlikely(err)) { + p9_fid_put(fid); + return err; + } file->private_data = fid; #ifdef CONFIG_9P_FSCACHE @@ -818,13 +814,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_open_fid_add(inode, &fid); file->f_mode |= FMODE_CREATED; -out: - dput(res); - return err; - -error: - p9_fid_put(fid); - goto out; + return 0; } /** diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5b5fda617b80..be297e335468 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -238,20 +238,16 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, struct p9_fid *dfid = NULL, *ofid = NULL; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; - struct dentry *res = NULL; if (d_in_lookup(dentry)) { - res = v9fs_vfs_lookup(dir, dentry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - if (res) - dentry = res; + struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0); + if (res || d_really_is_positive(dentry)) + return finish_no_open(file, res); } /* Only creates */ - if (!(flags & O_CREAT) || d_really_is_positive(dentry)) - return finish_no_open(file, res); + if (!(flags & O_CREAT)) + return finish_no_open(file, NULL); v9ses = v9fs_inode2v9ses(dir); @@ -337,7 +333,6 @@ out: p9_fid_put(ofid); p9_fid_put(fid); v9fs_put_acl(dacl, pacl); - dput(res); return err; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 4b1342c72089..fd3aa9f97ce6 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -239,7 +239,7 @@ static void afs_edit_init_block(union afs_xdr_dir_block *meta, * The caller must hold the inode locked. */ void afs_edit_dir_add(struct afs_vnode *vnode, - struct qstr *name, struct afs_fid *new_fid, + const struct qstr *name, struct afs_fid *new_fid, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *meta, *block; @@ -391,7 +391,7 @@ error: * The caller must hold the inode locked. 
*/ void afs_edit_dir_remove(struct afs_vnode *vnode, - struct qstr *name, enum afs_edit_dir_reason why) + const struct qstr *name, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *meta, *block, *pblock; union afs_xdr_dirent *de, *pde; diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c index b25bd892db4d..d2516e55b5ed 100644 --- a/fs/afs/dir_search.c +++ b/fs/afs/dir_search.c @@ -188,7 +188,7 @@ bad: /* * Search the appropriate hash chain in the contents of an AFS directory. */ -int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, +int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, struct afs_fid *_fid, afs_dataversion_t *_dir_version) { struct afs_dir_iter iter = { .dvnode = dvnode, }; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 444a3ea4fdf6..a45ae5c2ef8a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1099,9 +1099,9 @@ int afs_single_writepages(struct address_space *mapping, /* * dir_edit.c */ -extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, +extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); -extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); +extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason); void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); @@ -1114,7 +1114,7 @@ bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name); union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block); int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, struct afs_fid *_fid); -int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, +int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, struct afs_fid *_fid, afs_dataversion_t *_dir_version); /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 9434a5399f2b..1ad048e6e164 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -137,7 +137,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ret = -EINVAL; if (content[size - 1] == '.') - ret = vfs_parse_fs_string(fc, "source", content, size - 1); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(content, size - 1)); do_delayed_call(&cleanup); if (ret < 0) return ret; diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 1e36a12b88f7..5ace2511fec5 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -79,7 +79,7 @@ __bpf_kfunc void bpf_put_file(struct file *file) * pathname in *buf*, including the NUL termination character. On error, a * negative integer is returned. */ -__bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) +__bpf_kfunc int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) { int len; char *ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6aad6b65522b..621e0df097e3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5829,7 +5829,7 @@ struct btrfs_dir_list { * See process_dir_items_leaf() for details about why it is needed. * This is a recursive operation - if an existing dentry corresponds to a * directory, that directory's new entries are logged too (same behaviour as - * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * ext3/4, xfs, f2fs, nilfs2). 
Note that when logging the inodes * the dentries point to we do not acquire their VFS lock, otherwise lockdep * complains about the following circular lock dependency / possible deadlock: * diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index f327fbb9a0ca..81f4f06bc87e 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1601,10 +1601,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) err = -ENOENT; if (configfs_dirent_is_ready(parent_sd)) { file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL); - if (IS_ERR(file->private_data)) - err = PTR_ERR(file->private_data); - else - err = 0; + err = PTR_ERR_OR_ZERO(file->private_data); } inode_unlock(d_inode(dentry)); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 69133ec1fac2..f3f79c67add5 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -114,26 +114,21 @@ static int create_link(struct config_item *parent_item, } -static int get_target(const char *symname, struct path *path, - struct config_item **target, struct super_block *sb) +static int get_target(const char *symname, struct config_item **target, + struct super_block *sb) { + struct path path __free(path_put) = {}; int ret; - ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path); - if (!ret) { - if (path->dentry->d_sb == sb) { - *target = configfs_get_config_item(path->dentry); - if (!*target) { - ret = -ENOENT; - path_put(path); - } - } else { - ret = -EPERM; - path_put(path); - } - } - - return ret; + ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); + if (ret) + return ret; + if (path.dentry->d_sb != sb) + return -EPERM; + *target = configfs_get_config_item(path.dentry); + if (!*target) + return -ENOENT; + return 0; } @@ -141,7 +136,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int ret; - struct path path; struct configfs_dirent *sd; struct config_item *parent_item; struct config_item *target_item = NULL; @@ -188,7 +182,7 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, * AV, a thoroughly annoyed bastard. 
*/ inode_unlock(dir); - ret = get_target(symname, &path, &target_item, dentry->d_sb); + ret = get_target(symname, &target_item, dentry->d_sb); inode_lock(dir); if (ret) goto out_put; @@ -210,7 +204,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, } config_item_put(target_item); - path_put(&path); out_put: config_item_put(parent_item); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 12daa85ed941..ca54bf24b719 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -421,7 +421,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; vmf = vmf_insert_mixed(vma, vma->vm_start + off, - address + off); + PHYS_PFN(address + off)); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } diff --git a/fs/dcache.c b/fs/dcache.c index 65cc11939654..a067fa0a965a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1390,6 +1390,7 @@ struct check_mount { unsigned int mounted; }; +/* locks: mount_locked_reader && dentry->d_lock */ static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; @@ -1416,9 +1417,8 @@ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); d_walk(parent->dentry, &data, path_check_mount); - read_sequnlock_excl(&mount_lock); return data.mounted; } @@ -1717,13 +1717,13 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_shortname.string; } - dentry->d_name.len = name->len; - dentry->d_name.hash = name->hash; + dentry->__d_name.len = name->len; + dentry->__d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ - smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ + smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; lockref_init(&dentry->d_lockref); @@ -2743,15 +2743,15 @@ static void swap_names(struct dentry *dentry, struct dentry *target) /* * Both external: swap the pointers */ - swap(target->d_name.name, dentry->d_name.name); + swap(target->__d_name.name, dentry->__d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ - dentry->d_name.name = target->d_name.name; + dentry->__d_name.name = target->__d_name.name; target->d_shortname = dentry->d_shortname; - target->d_name.name = target->d_shortname.string; + target->__d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { @@ -2759,9 +2759,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ - target->d_name.name = dentry->d_name.name; + target->__d_name.name = dentry->__d_name.name; dentry->d_shortname = target->d_shortname; - dentry->d_name.name = dentry->d_shortname.string; + dentry->__d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. 
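The configfs get_target() rewrite above relies on scope-based cleanup from <linux/cleanup.h>: declaring the local struct path with __free(path_put) drops the path reference on every return, which is what lets the old goto/path_put ladder collapse into straight returns. A minimal sketch of the same pattern, assuming a hypothetical helper (same_sb() is not a kernel API):

#include <linux/cleanup.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/path.h>

/* Hypothetical helper: 0 if symname resolves to a dentry on sb,
 * -EXDEV otherwise. __free(path_put) releases the path reference
 * automatically on each return, so no unwind label is needed. */
static int same_sb(const char *symname, struct super_block *sb)
{
	struct path path __free(path_put) = {};
	int ret = kern_path(symname, LOOKUP_FOLLOW, &path);

	if (ret)
		return ret;
	return path.dentry->d_sb == sb ? 0 : -EXDEV;
}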
@@ -2771,7 +2771,7 @@ static void swap_names(struct dentry *dentry, struct dentry *target) target->d_shortname.words[i]); } } - swap(dentry->d_name.hash_len, target->d_name.hash_len); + swap(dentry->__d_name.hash_len, target->__d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) @@ -2781,11 +2781,11 @@ static void copy_name(struct dentry *dentry, struct dentry *target) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->count); - dentry->d_name = target->d_name; + dentry->__d_name = target->__d_name; } else { dentry->d_shortname = target->d_shortname; - dentry->d_name.name = dentry->d_shortname.string; - dentry->d_name.hash_len = target->d_name.hash_len; + dentry->__d_name.name = dentry->d_shortname.string; + dentry->__d_name.hash_len = target->__d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->count))) kfree_rcu(old_name, head); @@ -3134,7 +3134,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", + dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 1dfd5b81d831..6648a924e31a 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -59,14 +59,6 @@ static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name, return rc; } -struct kmem_cache *ecryptfs_dentry_info_cache; - -static void ecryptfs_dentry_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(ecryptfs_dentry_info_cache, - container_of(head, struct ecryptfs_dentry_info, rcu)); -} - /** * ecryptfs_d_release * @dentry: The ecryptfs dentry @@ -75,11 +67,7 @@ static void ecryptfs_dentry_free_rcu(struct rcu_head *head) */ static void ecryptfs_d_release(struct dentry *dentry) { - struct ecryptfs_dentry_info *p = dentry->d_fsdata; - if (p) { - path_put(&p->lower_path); - call_rcu(&p->rcu, ecryptfs_dentry_free_rcu); - } + dput(dentry->d_fsdata); } const struct dentry_operations ecryptfs_dops = { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 1f562e75d0e4..9e6ab0b41337 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -258,13 +258,6 @@ struct ecryptfs_inode_info { struct ecryptfs_crypt_stat crypt_stat; }; -/* dentry private data. Each dentry must keep track of a lower - * vfsmount too. */ -struct ecryptfs_dentry_info { - struct path lower_path; - struct rcu_head rcu; -}; - /** * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint * @flags: Status flags @@ -348,6 +341,7 @@ struct ecryptfs_mount_crypt_stat { /* superblock private data. 
*/ struct ecryptfs_sb_info { struct super_block *wsi_sb; + struct vfsmount *lower_mnt; struct ecryptfs_mount_crypt_stat mount_crypt_stat; }; @@ -494,22 +488,25 @@ ecryptfs_set_superblock_lower(struct super_block *sb, } static inline void -ecryptfs_set_dentry_private(struct dentry *dentry, - struct ecryptfs_dentry_info *dentry_info) +ecryptfs_set_dentry_lower(struct dentry *dentry, + struct dentry *lower_dentry) { - dentry->d_fsdata = dentry_info; + dentry->d_fsdata = lower_dentry; } static inline struct dentry * ecryptfs_dentry_to_lower(struct dentry *dentry) { - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; + return dentry->d_fsdata; } -static inline const struct path * -ecryptfs_dentry_to_lower_path(struct dentry *dentry) +static inline struct path +ecryptfs_lower_path(struct dentry *dentry) { - return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; + return (struct path){ + .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt, + .dentry = ecryptfs_dentry_to_lower(dentry) + }; } #define ecryptfs_printk(type, fmt, arg...) \ @@ -532,7 +529,6 @@ extern unsigned int ecryptfs_number_of_users; extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; extern struct kmem_cache *ecryptfs_file_info_cache; -extern struct kmem_cache *ecryptfs_dentry_info_cache; extern struct kmem_cache *ecryptfs_inode_info_cache; extern struct kmem_cache *ecryptfs_sb_info_cache; extern struct kmem_cache *ecryptfs_header_cache; @@ -557,7 +553,6 @@ int ecryptfs_encrypt_and_encode_filename( size_t *encoded_name_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size); -struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 5f8f96da09fe..7929411837cf 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -33,13 +33,12 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct iov_iter *to) { ssize_t rc; - const struct path *path; struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); if (rc >= 0) { - path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); - touch_atime(path); + struct path path = ecryptfs_lower_path(file->f_path.dentry); + touch_atime(&path); } return rc; } @@ -59,12 +58,11 @@ static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos, size_t len, unsigned int flags) { ssize_t rc; - const struct path *path; rc = filemap_splice_read(in, ppos, pipe, len, flags); if (rc >= 0) { - path = ecryptfs_dentry_to_lower_path(in->f_path.dentry); - touch_atime(path); + struct path path = ecryptfs_lower_path(in->f_path.dentry); + touch_atime(&path); } return rc; } @@ -283,6 +281,7 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file) * ecryptfs_lookup() */ struct ecryptfs_file_info *file_info; struct file *lower_file; + struct path path; /* Released in ecryptfs_release or end of function if failure */ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); @@ -292,8 +291,8 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file) "Error attempting to allocate memory\n"); return -ENOMEM; } - lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), - file->f_flags, current_cred()); + path = ecryptfs_lower_path(ecryptfs_dentry); + lower_file = dentry_open(&path, file->f_flags, current_cred()); if 
(IS_ERR(lower_file)) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index abd954c6a14e..ed1394da8d6b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -327,24 +327,15 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + struct dentry *lower_parent = ecryptfs_dentry_to_lower(dentry->d_parent); struct inode *inode, *lower_inode; - struct ecryptfs_dentry_info *dentry_info; int rc = 0; - dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!dentry_info) { - dput(lower_dentry); - return ERR_PTR(-ENOMEM); - } - fsstack_copy_attr_atime(d_inode(dentry->d_parent), - d_inode(path->dentry)); + d_inode(lower_parent)); BUG_ON(!d_count(lower_dentry)); - ecryptfs_set_dentry_private(dentry, dentry_info); - dentry_info->lower_path.mnt = mntget(path->mnt); - dentry_info->lower_path.dentry = lower_dentry; + ecryptfs_set_dentry_lower(dentry, lower_dentry); /* * negative dentry can go positive under us here - its parent is not @@ -1021,10 +1012,10 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, { struct dentry *dentry = path->dentry; struct kstat lower_stat; + struct path lower_path = ecryptfs_lower_path(dentry); int rc; - rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry), - &lower_stat, request_mask, flags); + rc = vfs_getattr_nosec(&lower_path, &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index eab1beb846d3..16ea14dd2c62 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -106,15 +106,14 @@ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); - const struct path *path = ecryptfs_dentry_to_lower_path(dentry); + struct path path = ecryptfs_lower_path(dentry); int rc; - rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, - cred); + rc = ecryptfs_privileged_open(lower_file, path.dentry, path.mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " - "rc = [%d]\n", path->dentry, path->mnt, rc); + "rc = [%d]\n", path.dentry, path.mnt, rc); (*lower_file) = NULL; } return rc; @@ -437,7 +436,6 @@ static int ecryptfs_get_tree(struct fs_context *fc) struct ecryptfs_fs_context *ctx = fc->fs_private; struct ecryptfs_sb_info *sbi = fc->s_fs_info; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; @@ -543,14 +541,8 @@ static int ecryptfs_get_tree(struct fs_context *fc) goto out_free; } - rc = -ENOMEM; - root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!root_info) - goto out_free; - - /* ->kill_sb() will take care of root_info */ - ecryptfs_set_dentry_private(s->s_root, root_info); - root_info->lower_path = path; + ecryptfs_set_dentry_lower(s->s_root, path.dentry); + ecryptfs_superblock_to_private(s)->lower_mnt = path.mnt; s->s_flags |= SB_ACTIVE; fc->root = dget(s->s_root); @@ -580,6 +572,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb) kill_anon_super(sb); if (!sb_info) return; + mntput(sb_info->lower_mnt); 
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } @@ -668,11 +661,6 @@ static struct ecryptfs_cache_info { .size = sizeof(struct ecryptfs_file_info), }, { - .cache = &ecryptfs_dentry_info_cache, - .name = "ecryptfs_dentry_info_cache", - .size = sizeof(struct ecryptfs_dentry_info), - }, - { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index cc01556c9d9b..2d2d510f2372 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/bitmap.h> #include <linux/buffer_head.h> +#include <linux/backing-dev.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -26,13 +27,58 @@ /* * Allocation Bitmap Management Functions */ +static bool exfat_test_bitmap_range(struct super_block *sb, unsigned int clu, + unsigned int count) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int start = clu; + unsigned int end = clu + count; + unsigned int ent_idx, i, b; + unsigned int bit_offset, bits_to_check; + __le_long *bitmap_le; + unsigned long mask, word; + + if (!is_valid_cluster(sbi, start) || !is_valid_cluster(sbi, end - 1)) + return false; + + while (start < end) { + ent_idx = CLUSTER_TO_BITMAP_ENT(start); + i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); + + bitmap_le = (__le_long *)sbi->vol_amap[i]->b_data; + + /* Calculate how many bits we can check in the current word */ + bit_offset = b % BITS_PER_LONG; + bits_to_check = min(end - start, + (unsigned int)(BITS_PER_LONG - bit_offset)); + + /* Create a bitmask for the range of bits to check */ + if (bits_to_check >= BITS_PER_LONG) + mask = ~0UL; + else + mask = ((1UL << bits_to_check) - 1) << bit_offset; + word = lel_to_cpu(bitmap_le[b / BITS_PER_LONG]); + + /* Check if all bits in the mask are set */ + if ((word & mask) != mask) + return false; + + start += bits_to_check; + } + + return true; +} + static int exfat_allocate_bitmap(struct super_block *sb, struct exfat_dentry *ep) { struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct blk_plug plug; long long map_size; - unsigned int i, need_map_size; + unsigned int i, j, need_map_size; sector_t sector; + unsigned int max_ra_count; sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu); map_size = le64_to_cpu(ep->dentry.bitmap.size); @@ -56,22 +102,37 @@ static int exfat_allocate_bitmap(struct super_block *sb, return -ENOMEM; sector = exfat_cluster_to_sector(sbi, sbi->map_clu); + max_ra_count = min(sb->s_bdi->ra_pages, sb->s_bdi->io_pages) << + (PAGE_SHIFT - sb->s_blocksize_bits); for (i = 0; i < sbi->map_sectors; i++) { - sbi->vol_amap[i] = sb_bread(sb, sector + i); - if (!sbi->vol_amap[i]) { - /* release all buffers and free vol_amap */ - int j = 0; - - while (j < i) - brelse(sbi->vol_amap[j++]); - - kvfree(sbi->vol_amap); - sbi->vol_amap = NULL; - return -EIO; + /* Trigger the next readahead in advance. 
*/ + if (0 == (i % max_ra_count)) { + blk_start_plug(&plug); + for (j = i; j < min(max_ra_count, sbi->map_sectors - i) + i; j++) + sb_breadahead(sb, sector + j); + blk_finish_plug(&plug); } + + sbi->vol_amap[i] = sb_bread(sb, sector + i); + if (!sbi->vol_amap[i]) + goto err_out; } + if (exfat_test_bitmap_range(sb, sbi->map_clu, + EXFAT_B_TO_CLU_ROUND_UP(map_size, sbi)) == false) + goto err_out; + return 0; + +err_out: + j = 0; + /* release all buffers and free vol_amap */ + while (j < i) + brelse(sbi->vol_amap[j++]); + + kvfree(sbi->vol_amap); + sbi->vol_amap = NULL; + return -EIO; } int exfat_load_bitmap(struct super_block *sb) diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index ee060e26f51d..7229146fe2bf 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -1244,3 +1244,163 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) return count; } + +static int exfat_get_volume_label_dentry(struct super_block *sb, + struct exfat_entry_set_cache *es) +{ + int i; + int dentry = 0; + unsigned int type; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_hint_femp hint_femp; + struct exfat_inode_info *ei = EXFAT_I(sb->s_root->d_inode); + struct exfat_chain clu; + struct exfat_dentry *ep; + struct buffer_head *bh; + + hint_femp.eidx = EXFAT_HINT_NONE; + exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < sbi->dentries_per_clu; i++, dentry++) { + ep = exfat_get_dentry(sb, &clu, i, &bh); + if (!ep) + return -EIO; + + type = exfat_get_entry_type(ep); + if (hint_femp.eidx == EXFAT_HINT_NONE) { + if (type == TYPE_DELETED || type == TYPE_UNUSED) { + hint_femp.cur = clu; + hint_femp.eidx = dentry; + hint_femp.count = 1; + } + } + + if (type == TYPE_UNUSED) { + brelse(bh); + goto not_found; + } + + if (type != TYPE_VOLUME) { + brelse(bh); + continue; + } + + memset(es, 0, sizeof(*es)); + es->sb = sb; + es->bh = es->__bh; + es->bh[0] = bh; + es->num_bh = 1; + es->start_off = EXFAT_DEN_TO_B(i) % sb->s_blocksize; + + return 0; + } + + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + +not_found: + if (hint_femp.eidx == EXFAT_HINT_NONE) { + hint_femp.cur.dir = EXFAT_EOF_CLUSTER; + hint_femp.eidx = dentry; + hint_femp.count = 0; + } + + ei->hint_femp = hint_femp; + + return -ENOENT; +} + +int exfat_read_volume_label(struct super_block *sb, struct exfat_uni_name *label_out) +{ + int ret, i; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_entry_set_cache es; + struct exfat_dentry *ep; + + mutex_lock(&sbi->s_lock); + + memset(label_out, 0, sizeof(*label_out)); + ret = exfat_get_volume_label_dentry(sb, &es); + if (ret < 0) { + /* + * ENOENT signifies that a volume label dentry doesn't exist + * We will treat this as an empty volume label and not fail. 
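The word-at-a-time scan in exfat_test_bitmap_range() in the balloc.c hunk above is easier to check in isolation. A standalone model of the mask arithmetic (plain userspace C, not kernel code; assumes LP64, with BITS_PER_LONG pinned to 64):

#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG 64

/* Return true iff every bit in [start, end) is set in bitmap. */
static bool all_bits_set(const unsigned long *bitmap,
			 unsigned int start, unsigned int end)
{
	while (start < end) {
		unsigned int off = start % BITS_PER_LONG;
		unsigned int n = end - start;
		unsigned long mask, word;

		if (n > BITS_PER_LONG - off)
			n = BITS_PER_LONG - off;	/* clamp to this word */
		/* 1UL << 64 is undefined, so special-case a full word */
		mask = (n == BITS_PER_LONG) ? ~0UL
					    : ((1UL << n) - 1) << off;
		word = bitmap[start / BITS_PER_LONG];
		if ((word & mask) != mask)
			return false;
		start += n;
	}
	return true;
}

int main(void)
{
	unsigned long map[2] = { ~0UL, 0x1UL };	/* bits 0..64 set */

	printf("%d %d\n", all_bits_set(map, 60, 65),	/* 1 */
	       all_bits_set(map, 60, 66));		/* 0: bit 65 clear */
	return 0;
}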
+ */ + if (ret == -ENOENT) + ret = 0; + + goto unlock; + } + + ep = exfat_get_dentry_cached(&es, 0); + label_out->name_len = ep->dentry.volume_label.char_count; + if (label_out->name_len > EXFAT_VOLUME_LABEL_LEN) { + ret = -EIO; + exfat_put_dentry_set(&es, false); + goto unlock; + } + + for (i = 0; i < label_out->name_len; i++) + label_out->name[i] = le16_to_cpu(ep->dentry.volume_label.volume_label[i]); + + exfat_put_dentry_set(&es, false); +unlock: + mutex_unlock(&sbi->s_lock); + return ret; +} + +int exfat_write_volume_label(struct super_block *sb, + struct exfat_uni_name *label) +{ + int ret, i; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct inode *root_inode = sb->s_root->d_inode; + struct exfat_entry_set_cache es; + struct exfat_chain clu; + struct exfat_dentry *ep; + + if (label->name_len > EXFAT_VOLUME_LABEL_LEN) + return -EINVAL; + + mutex_lock(&sbi->s_lock); + + ret = exfat_get_volume_label_dentry(sb, &es); + if (ret == -ENOENT) { + if (label->name_len == 0) { + /* No volume label dentry, no need to clear */ + ret = 0; + goto unlock; + } + + ret = exfat_find_empty_entry(root_inode, &clu, 1, &es); + } + + if (ret < 0) + goto unlock; + + ep = exfat_get_dentry_cached(&es, 0); + + if (label->name_len == 0 && ep->dentry.volume_label.char_count == 0) { + /* volume label had been cleared */ + exfat_put_dentry_set(&es, 0); + goto unlock; + } + + memset(ep, 0, sizeof(*ep)); + ep->type = EXFAT_VOLUME; + + for (i = 0; i < label->name_len; i++) + ep->dentry.volume_label.volume_label[i] = + cpu_to_le16(label->name[i]); + + ep->dentry.volume_label.char_count = label->name_len; + es.modified = true; + + ret = exfat_put_dentry_set(&es, IS_DIRSYNC(root_inode)); + +unlock: + mutex_unlock(&sbi->s_lock); + return ret; +} diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index f8ead4d47ef0..329697c89d09 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -477,6 +477,9 @@ int exfat_force_shutdown(struct super_block *sb, u32 flags); /* namei.c */ extern const struct dentry_operations exfat_dentry_ops; extern const struct dentry_operations exfat_utf8_dentry_ops; +int exfat_find_empty_entry(struct inode *inode, + struct exfat_chain *p_dir, int num_entries, + struct exfat_entry_set_cache *es); /* cache.c */ int exfat_cache_init(void); @@ -517,6 +520,10 @@ int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es, unsigned int num_entries); int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); +int exfat_read_volume_label(struct super_block *sb, + struct exfat_uni_name *label_out); +int exfat_write_volume_label(struct super_block *sb, + struct exfat_uni_name *label); /* inode.c */ extern const struct inode_operations exfat_file_inode_operations; diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 971a1ccd0e89..4082fa7b8c14 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -80,6 +80,7 @@ #define BOOTSEC_OLDBPB_LEN 53 #define EXFAT_FILE_NAME_LEN 15 +#define EXFAT_VOLUME_LABEL_LEN 11 #define EXFAT_MIN_SECT_SIZE_BITS 9 #define EXFAT_MAX_SECT_SIZE_BITS 12 @@ -160,6 +161,11 @@ struct exfat_dentry { __le64 size; } __packed upcase; /* up-case table directory entry */ struct { + __u8 char_count; + __le16 volume_label[EXFAT_VOLUME_LABEL_LEN]; + __u8 reserved[8]; + } __packed volume_label; /* volume label directory entry */ + struct { __u8 flags; __u8 vendor_guid[16]; __u8 vendor_defined[14]; diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 
232cc7f8ab92..825083634ba2 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -89,35 +89,36 @@ int exfat_ent_get(struct super_block *sb, unsigned int loc, int err; if (!is_valid_cluster(sbi, loc)) { - exfat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", + exfat_fs_error_ratelimit(sb, + "invalid access to FAT (entry 0x%08x)", loc); return -EIO; } err = __exfat_ent_get(sb, loc, content); if (err) { - exfat_fs_error(sb, + exfat_fs_error_ratelimit(sb, "failed to access to FAT (entry 0x%08x, err:%d)", loc, err); return err; } if (*content == EXFAT_FREE_CLUSTER) { - exfat_fs_error(sb, + exfat_fs_error_ratelimit(sb, "invalid access to FAT free cluster (entry 0x%08x)", loc); return -EIO; } if (*content == EXFAT_BAD_CLUSTER) { - exfat_fs_error(sb, + exfat_fs_error_ratelimit(sb, "invalid access to FAT bad cluster (entry 0x%08x)", loc); return -EIO; } if (*content != EXFAT_EOF_CLUSTER && !is_valid_cluster(sbi, *content)) { - exfat_fs_error(sb, + exfat_fs_error_ratelimit(sb, "invalid access to FAT (entry 0x%08x) bogus content (0x%08x)", loc, *content); return -EIO; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 538d2b6ac2ec..f246cf439588 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -486,6 +486,54 @@ static int exfat_ioctl_shutdown(struct super_block *sb, unsigned long arg) return exfat_force_shutdown(sb, flags); } +static int exfat_ioctl_get_volume_label(struct super_block *sb, unsigned long arg) +{ + int ret; + char label[FSLABEL_MAX] = {0}; + struct exfat_uni_name uniname; + + ret = exfat_read_volume_label(sb, &uniname); + if (ret < 0) + return ret; + + ret = exfat_utf16_to_nls(sb, &uniname, label, uniname.name_len); + if (ret < 0) + return ret; + + if (copy_to_user((char __user *)arg, label, ret + 1)) + return -EFAULT; + + return 0; +} + +static int exfat_ioctl_set_volume_label(struct super_block *sb, + unsigned long arg) +{ + int ret = 0, lossy; + char label[FSLABEL_MAX]; + struct exfat_uni_name uniname; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, (char __user *)arg, FSLABEL_MAX)) + return -EFAULT; + + memset(&uniname, 0, sizeof(uniname)); + if (label[0]) { + ret = exfat_nls_to_utf16(sb, label, FSLABEL_MAX, + &uniname, &lossy); + if (ret < 0) + return ret; + else if (lossy & NLS_NAME_LOSSY) + return -EINVAL; + } + + uniname.name_len = ret; + + return exfat_write_volume_label(sb, &uniname); +} + long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -500,6 +548,10 @@ long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return exfat_ioctl_shutdown(inode->i_sb, arg); case FITRIM: return exfat_ioctl_fitrim(inode, arg); + case FS_IOC_GETFSLABEL: + return exfat_ioctl_get_volume_label(inode->i_sb, arg); + case FS_IOC_SETFSLABEL: + return exfat_ioctl_set_volume_label(inode->i_sb, arg); default: return -ENOTTY; } diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index c10844e1e16c..f9501c3a3666 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -25,7 +25,7 @@ int __exfat_write_inode(struct inode *inode, int sync) struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); - bool is_dir = (ei->type == TYPE_DIR) ? 
true : false; + bool is_dir = (ei->type == TYPE_DIR); struct timespec64 ts; if (inode->i_ino == EXFAT_ROOT_INO) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index f5f1c4e8a29f..7eb9c67fd35f 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -300,7 +300,7 @@ static int exfat_check_max_dentries(struct inode *inode) * the directory entry index in p_dir is returned on succeeds * -error code is returned on failure */ -static int exfat_find_empty_entry(struct inode *inode, +int exfat_find_empty_entry(struct inode *inode, struct exfat_chain *p_dir, int num_entries, struct exfat_entry_set_cache *es) { @@ -587,7 +587,7 @@ unlock: } /* lookup a file */ -static int exfat_find(struct inode *dir, struct qstr *qname, +static int exfat_find(struct inode *dir, const struct qstr *qname, struct exfat_dir_entry *info) { int ret, dentry, count; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 1729bf42eb51..8243d94ceaf4 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -789,7 +789,7 @@ int exfat_create_upcase_table(struct super_block *sb) return ret; } - if (exfat_get_next_cluster(sb, &(clu.dir))) + if (exfat_get_next_cluster(sb, &clu.dir)) return -EIO; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8926e63f5bb7..7f9592856bf7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -31,6 +31,16 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi) kfree(sbi->options.iocharset); } +static void exfat_set_iocharset(struct exfat_mount_options *opts, + char *iocharset) +{ + opts->iocharset = iocharset; + if (!strcmp(opts->iocharset, "utf8")) + opts->utf8 = 1; + else + opts->utf8 = 0; +} + static void exfat_put_super(struct super_block *sb) { struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -243,11 +253,11 @@ static const struct fs_parameter_spec exfat_parameters[] = { fsparam_u32oct("allow_utime", Opt_allow_utime), fsparam_string("iocharset", Opt_charset), fsparam_enum("errors", Opt_errors, exfat_param_enums), - fsparam_flag("discard", Opt_discard), + fsparam_flag_no("discard", Opt_discard), fsparam_flag("keep_last_dots", Opt_keep_last_dots), fsparam_flag("sys_tz", Opt_sys_tz), fsparam_s32("time_offset", Opt_time_offset), - fsparam_flag("zero_size_dir", Opt_zero_size_dir), + fsparam_flag_no("zero_size_dir", Opt_zero_size_dir), __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated, NULL), __fsparam(NULL, "debug", Opt_debug, fs_param_deprecated, @@ -292,14 +302,14 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_charset: exfat_free_iocharset(sbi); - opts->iocharset = param->string; + exfat_set_iocharset(opts, param->string); param->string = NULL; break; case Opt_errors: opts->errors = result.uint_32; break; case Opt_discard: - opts->discard = 1; + opts->discard = !result.negated; break; case Opt_keep_last_dots: opts->keep_last_dots = 1; @@ -317,7 +327,7 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) opts->time_offset = result.int_32; break; case Opt_zero_size_dir: - opts->zero_size_dir = true; + opts->zero_size_dir = !result.negated; break; case Opt_utf8: case Opt_debug: @@ -664,8 +674,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) /* set up enough so that it can read an inode */ exfat_hash_init(sb); - if (!strcmp(sbi->options.iocharset, "utf8")) - opts->utf8 = 1; + if (sbi->options.utf8) + set_default_d_op(sb, &exfat_utf8_dentry_ops); else { sbi->nls_io = load_nls(sbi->options.iocharset); if (!sbi->nls_io) { @@ -674,12 +684,8 @@ static int exfat_fill_super(struct 
super_block *sb, struct fs_context *fc) err = -EINVAL; goto free_table; } - } - - if (sbi->options.utf8) - set_default_d_op(sb, &exfat_utf8_dentry_ops); - else set_default_d_op(sb, &exfat_dentry_ops); + } root_inode = new_inode(sb); if (!root_inode) { @@ -742,12 +748,44 @@ static void exfat_free(struct fs_context *fc) static int exfat_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; + struct exfat_sb_info *remount_sbi = fc->s_fs_info; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_mount_options *new_opts = &remount_sbi->options; + struct exfat_mount_options *cur_opts = &sbi->options; + fc->sb_flags |= SB_NODIRATIME; sync_filesystem(sb); - mutex_lock(&EXFAT_SB(sb)->s_lock); + mutex_lock(&sbi->s_lock); exfat_clear_volume_dirty(sb); - mutex_unlock(&EXFAT_SB(sb)->s_lock); + mutex_unlock(&sbi->s_lock); + + if (new_opts->allow_utime == (unsigned short)-1) + new_opts->allow_utime = ~new_opts->fs_dmask & 0022; + + /* + * Since the old settings of these mount options are cached in + * inodes or dentries, they cannot be modified dynamically. + */ + if (strcmp(new_opts->iocharset, cur_opts->iocharset) || + new_opts->keep_last_dots != cur_opts->keep_last_dots || + new_opts->sys_tz != cur_opts->sys_tz || + new_opts->time_offset != cur_opts->time_offset || + !uid_eq(new_opts->fs_uid, cur_opts->fs_uid) || + !gid_eq(new_opts->fs_gid, cur_opts->fs_gid) || + new_opts->fs_fmask != cur_opts->fs_fmask || + new_opts->fs_dmask != cur_opts->fs_dmask || + new_opts->allow_utime != cur_opts->allow_utime) + return -EINVAL; + + if (new_opts->discard != cur_opts->discard && + new_opts->discard && + !bdev_max_discard_sectors(sb->s_bdev)) { + exfat_warn(sb, "remounting with \"discard\" option, but the device does not support discard"); + return -EINVAL; + } + + swap(*cur_opts, *new_opts); return 0; } @@ -777,8 +815,8 @@ static int exfat_init_fs_context(struct fs_context *fc) sbi->options.fs_fmask = current->fs->umask; sbi->options.fs_dmask = current->fs->umask; sbi->options.allow_utime = -1; - sbi->options.iocharset = exfat_default_iocharset; sbi->options.errors = EXFAT_ERRORS_RO; + exfat_set_iocharset(&sbi->options, exfat_default_iocharset); fc->s_fs_info = sbi; fc->ops = &exfat_context_ops; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index c9ca41d91a6c..01873c2a34ad 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -1,31 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -# Ext3 configs are here for backward compatibility with old configs which may -# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable -# kernels after the removal of ext3 driver. -config EXT3_FS - tristate "The Extended 3 (ext3) filesystem" - select EXT4_FS - help - This config option is here only for backward compatibility. ext3 - filesystem is now handled by the ext4 driver. - -config EXT3_FS_POSIX_ACL - bool "Ext3 POSIX Access Control Lists" - depends on EXT3_FS - select EXT4_FS_POSIX_ACL - select FS_POSIX_ACL - help - This config option is here only for backward compatibility. ext3 - filesystem is now handled by the ext4 driver. - -config EXT3_FS_SECURITY - bool "Ext3 Security Labels" - depends on EXT3_FS - select EXT4_FS_SECURITY - help - This config option is here only for backward compatibility. ext3 - filesystem is now handled by the ext4 driver. 
- config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select BUFFER_HEAD diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6cb784a56b3b..57087da6c7be 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1450,7 +1450,9 @@ struct ext4_super_block { __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ - __le32 s_reserved[94]; /* Padding to the end of the block */ + __le16 s_def_resuid_hi; + __le16 s_def_resgid_hi; + __le32 s_reserved[93]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1820,6 +1822,18 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +static inline int ext4_get_resuid(struct ext4_super_block *es) +{ + return le16_to_cpu(es->s_def_resuid) | + le16_to_cpu(es->s_def_resuid_hi) << 16; +} + +static inline int ext4_get_resgid(struct ext4_super_block *es) +{ + return le16_to_cpu(es->s_def_resgid) | + le16_to_cpu(es->s_def_resgid_hi) << 16; +} + /* * Returns: sbi->field[index] * Used to access an array element from the following sbi fields which require @@ -1990,6 +2004,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* + * Check whether the inode is tracked as orphan (either in orphan file or + * orphan list). + */ +static inline bool ext4_inode_orphan_tracked(struct inode *inode) +{ + return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || + !list_empty(&EXT4_I(inode)->i_orphan); +} + +/* * Codes for operating systems */ #define EXT4_OS_LINUX 0 @@ -3142,6 +3166,8 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); +extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, + sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 42bee1d4f9f9..fa66b08de999 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -663,7 +663,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { - blk_opf_t write_flags = REQ_SYNC; + blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 93240e35ee36..7a8b30932189 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -354,7 +354,7 @@ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) * to cleanup the orphan list in ext4_handle_inode_extension(). Do it * now. 
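The s_def_resuid_hi/s_def_resgid_hi fields added in the ext4.h hunk above widen the reserved-blocks uid/gid from 16 to 32 bits by splitting the value across two little-endian 16-bit on-disk words; ext4_get_resuid()/ext4_get_resgid() reassemble them. The split/reassemble arithmetic as a standalone sketch (plain C, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t uid = 100000;		/* > 65535, so the _hi word is needed */
	uint16_t lo = uid & 0xFFFF;	/* 0x86A0 -> s_def_resuid */
	uint16_t hi = uid >> 16;	/* 0x0001 -> s_def_resuid_hi */
	uint32_t back = (uint32_t)lo | ((uint32_t)hi << 16);

	printf("lo=0x%04X hi=0x%04X back=%u\n", lo, hi, back);
	return 0;			/* lo=0x86A0 hi=0x0001 back=100000 */
}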
*/ - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 91185c40f755..22fc333244ef 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -74,7 +74,8 @@ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { - return rec->fmr_physical < info->gfi_low.fmr_physical; + return rec->fmr_physical + rec->fmr_length <= + info->gfi_low.fmr_physical; } /* @@ -200,15 +201,18 @@ static int ext4_getfsmap_meta_helper(struct super_block *sb, ext4_group_first_block_no(sb, agno)); fs_end = fs_start + EXT4_C2B(sbi, len); - /* Return relevant extents from the meta_list */ + /* + * Return relevant extents from the meta_list. We emit all extents that + * partially/fully overlap with the query range + */ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { - if (p->fmr_physical < info->gfi_next_fsblk) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { list_del(&p->fmr_list); kfree(p); continue; } - if (p->fmr_physical <= fs_start || - p->fmr_physical + p->fmr_length <= fs_end) { + if (p->fmr_physical <= fs_end && + p->fmr_physical + p->fmr_length > fs_start) { /* Emit the retained free extent record if present */ if (info->gfi_lastfree.fmr_owner) { error = ext4_getfsmap_helper(sb, info, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index d45124318200..da76353b3a57 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1025,7 +1025,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } /* Go read the buffer for the next level down */ - bh = ext4_sb_bread(inode->i_sb, nr, 0); + bh = ext4_sb_bread_nofail(inode->i_sb, nr); /* * A read failure? Report error and clear slot diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b7a15db4953..f9e4ac87211e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3872,47 +3872,12 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } -static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) -{ - /* must be a directio to fall back to buffered */ - if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != - (IOMAP_WRITE | IOMAP_DIRECT)) - return false; - - /* atomic writes are all-or-nothing */ - if (flags & IOMAP_ATOMIC) - return false; - - /* can only try again if we wrote nothing */ - return written == 0; -} - -static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) -{ - /* - * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code for - * non-atomic write so that we fallback to buffered I/O and attempt to - * complete the remainder of the I/O. - * For non-atomic writes, any blocks that may have been - * allocated in preparation for the direct I/O will be reused during - * buffered I/O. For atomic write, we never fallback to buffered-io. 
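The two fsmap.c changes above replace point comparisons with half-open interval tests: a record [p, p+len) lies entirely before the low key only when p+len <= low, and it overlaps the group window exactly when it starts at or before the window's end and ends after the window's start. The predicate in isolation (plain C, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Does the extent [start, start + len) overlap the window [lo, hi]?
 * Mirrors the test now used in ext4_getfsmap_meta_helper(). */
static bool overlaps(unsigned long long start, unsigned long long len,
		     unsigned long long lo, unsigned long long hi)
{
	return start <= hi && start + len > lo;
}

int main(void)
{
	printf("%d\n", overlaps(100, 100, 150, 300)); /* 1: 100..199 crosses 150 */
	printf("%d\n", overlaps(100, 100, 200, 300)); /* 0: ends exactly at 200 */
	return 0;
}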
- */ - if (ext4_want_directio_fallback(flags, written)) - return -ENOTBLK; - - return 0; -} - const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, - .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, - .iomap_end = ext4_iomap_end, }; static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, @@ -4287,7 +4252,11 @@ int ext4_can_truncate(struct inode *inode) * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but - * that will never happen after we truncate page cache. + * that will never happen if we remove the folio containing i_size from the + * page cache. Also if we punch hole within i_size but above i_disksize, + * following ext4_page_mkwrite() may mistakenly allocate written blocks over + * the hole and thus introduce allocated blocks beyond i_disksize which is + * not allowed (e2fsck would complain in case of crash). */ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) @@ -4298,9 +4267,11 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); - if (offset > size || offset + len < size) + if (offset > size) return 0; + if (offset + len < size) + size = offset + len; if (EXT4_I(inode)->i_disksize >= size) return 0; @@ -4748,7 +4719,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode * old inodes get re-used with the upper 16 bits of the * uid/gid intact. */ - if (ei->i_dtime && list_empty(&ei->i_orphan)) { + if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 84e3c73952d7..a93a7baae990 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -27,14 +27,16 @@ #include "fsmap.h" #include <trace/events/ext4.h> -typedef void ext4_update_sb_callback(struct ext4_super_block *es, - const void *arg); +typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, + struct ext4_super_block *es, + const void *arg); /* * Superblock modification callback function for changing file system * label */ -static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setlabel(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { /* Sanity check, this should never happen */ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); @@ -46,7 +48,8 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) * Superblock modification callback function for changing file system * UUID. 
*/ -static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setuuid(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } @@ -71,7 +74,7 @@ int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, goto out_err; lock_buffer(bh); - func(es, arg); + func(sbi, es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); @@ -149,7 +152,7 @@ static int ext4_update_backup_sb(struct super_block *sb, unlock_buffer(bh); goto out_bh; } - func(es, arg); + func(EXT4_SB(sb), es, arg); if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); @@ -1230,6 +1233,295 @@ static int ext4_ioctl_setuuid(struct file *filp, return ret; } + +#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ + EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ + EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ + EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ + EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ + EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ + EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ + EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ + EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ + EXT4_TUNE_FL_ENCODING_FLAGS) + +#define EXT4_TUNE_SET_COMPAT_SUPP \ + (EXT4_FEATURE_COMPAT_DIR_INDEX | \ + EXT4_FEATURE_COMPAT_STABLE_INODES) +#define EXT4_TUNE_SET_INCOMPAT_SUPP \ + (EXT4_FEATURE_INCOMPAT_EXTENTS | \ + EXT4_FEATURE_INCOMPAT_EA_INODE | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD) +#define EXT4_TUNE_SET_RO_COMPAT_SUPP \ + (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_PROJECT | \ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) + +#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ + SB_ENC_NO_COMPAT_FALLBACK_FL) + +static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, + struct ext4_tune_sb_params __user *params) +{ + struct ext4_tune_sb_params ret; + struct ext4_super_block *es = sbi->s_es; + + memset(&ret, 0, sizeof(ret)); + ret.set_flags = TUNE_OPS_SUPPORTED; + ret.errors_behavior = le16_to_cpu(es->s_errors); + ret.mnt_count = le16_to_cpu(es->s_mnt_count); + ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count); + ret.checkinterval = le32_to_cpu(es->s_checkinterval); + ret.last_check_time = le32_to_cpu(es->s_lastcheck); + ret.reserved_blocks = ext4_r_blocks_count(es); + ret.blocks_count = ext4_blocks_count(es); + ret.reserved_uid = ext4_get_resuid(es); + ret.reserved_gid = ext4_get_resgid(es); + ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); + ret.def_hash_alg = es->s_def_hash_version; + ret.raid_stride = le16_to_cpu(es->s_raid_stride); + ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); + ret.encoding = le16_to_cpu(es->s_encoding); + ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); + strscpy_pad(ret.mount_opts, es->s_mount_opts); + ret.feature_compat = le32_to_cpu(es->s_feature_compat); + ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); + ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); + ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; + ret.set_feature_incompat_mask = 
EXT4_TUNE_SET_INCOMPAT_SUPP; + ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; + ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; + ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; + ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; + if (copy_to_user(params, &ret, sizeof(ret))) + return -EFAULT; + return 0; +} + +static void ext4_sb_setparams(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + const struct ext4_tune_sb_params *params = arg; + + if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) + es->s_errors = cpu_to_le16(params->errors_behavior); + if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) + es->s_mnt_count = cpu_to_le16(params->mnt_count); + if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) + es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); + if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) + es->s_checkinterval = cpu_to_le32(params->checkinterval); + if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) + es->s_lastcheck = cpu_to_le32(params->last_check_time); + if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { + ext4_fsblk_t blk = params->reserved_blocks; + + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { + int uid = params->reserved_uid; + + es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); + es->s_def_resuid_hi = cpu_to_le16(uid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { + int gid = params->reserved_gid; + + es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); + es->s_def_resgid_hi = cpu_to_le16(gid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) + es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts); + if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + es->s_def_hash_version = params->def_hash_alg; + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) + es->s_raid_stride = cpu_to_le16(params->raid_stride); + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) + es->s_raid_stripe_width = + cpu_to_le32(params->raid_stripe_width); + if (params->set_flags & EXT4_TUNE_FL_ENCODING) + es->s_encoding = cpu_to_le16(params->encoding); + if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) + es->s_encoding_flags = cpu_to_le16(params->encoding_flags); + strscpy_pad(es->s_mount_opts, params->mount_opts); + if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + es->s_feature_compat |= + cpu_to_le32(params->set_feature_compat_mask); + es->s_feature_incompat |= + cpu_to_le32(params->set_feature_incompat_mask); + es->s_feature_ro_compat |= + cpu_to_le32(params->set_feature_ro_compat_mask); + es->s_feature_compat &= + ~cpu_to_le32(params->clear_feature_compat_mask); + es->s_feature_incompat &= + ~cpu_to_le32(params->clear_feature_incompat_mask); + es->s_feature_ro_compat &= + ~cpu_to_le32(params->clear_feature_ro_compat_mask); + if (params->set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX) + es->s_def_hash_version = sbi->s_def_hash_version; + if (params->set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CSUM_SEED) + es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); + } + if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); +} + +static int ext4_ioctl_set_tune_sb(struct file *filp, + struct ext4_tune_sb_params __user *in) +{ + struct ext4_tune_sb_params params; + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct 
ext4_super_block *es = sbi->s_es; + int enabling_casefold = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&params, in, sizeof(params))) + return -EFAULT; + + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) + return -EOPNOTSUPP; + + if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && + (params.errors_behavior > EXT4_ERRORS_PANIC)) + return -EINVAL; + + if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && + (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && + ((params.def_hash_alg > DX_HASH_LAST) || + (params.def_hash_alg == DX_HASH_SIPHASH))) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && + (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) + return -EINVAL; + + if (params.set_flags & EXT4_TUNE_FL_FEATURES) { + params.set_feature_compat_mask = + params.feature_compat & + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask = + params.feature_incompat & + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask = + params.feature_ro_compat & + ~le32_to_cpu(es->s_feature_ro_compat); + params.clear_feature_compat_mask = + ~params.feature_compat & + le32_to_cpu(es->s_feature_compat); + params.clear_feature_incompat_mask = + ~params.feature_incompat & + le32_to_cpu(es->s_feature_incompat); + params.clear_feature_ro_compat_mask = + ~params.feature_ro_compat & + le32_to_cpu(es->s_feature_ro_compat); + params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; + } + if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + if ((params.set_feature_compat_mask & + ~EXT4_TUNE_SET_COMPAT_SUPP) || + (params.set_feature_incompat_mask & + ~EXT4_TUNE_SET_INCOMPAT_SUPP) || + (params.set_feature_ro_compat_mask & + ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || + (params.clear_feature_compat_mask & + ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || + (params.clear_feature_incompat_mask & + ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || + (params.clear_feature_ro_compat_mask & + ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) + return -EOPNOTSUPP; + + /* + * Filter out the features that are already set from + * the set_mask.
+ */ + params.set_feature_compat_mask &= + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask &= + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask &= + ~le32_to_cpu(es->s_feature_ro_compat); + if ((params.set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CASEFOLD)) { + enabling_casefold = 1; + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { + params.encoding = EXT4_ENC_UTF8_12_1; + params.set_flags |= EXT4_TUNE_FL_ENCODING; + } + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { + params.encoding_flags = 0; + params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; + } + } + if ((params.set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX)) { + uuid_t uu; + + memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); + if (uuid_is_null(&uu)) + generate_random_uuid((char *) + &sbi->s_hash_seed); + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + else if (sbi->s_def_hash_version == 0) + sbi->s_def_hash_version = DX_HASH_HALF_MD4; + if (!(es->s_flags & + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && + !(es->s_flags & + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { +#ifdef __CHAR_UNSIGNED__ + sbi->s_hash_unsigned = 3; +#else + sbi->s_hash_unsigned = 0; +#endif + } + } + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding == 0) + params.encoding = EXT4_ENC_UTF8_12_1; + else if (params.encoding != EXT4_ENC_UTF8_12_1) + return -EINVAL; + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding_flags & ~SB_ENC_SUPP_MASK) + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params); + mnt_drop_write_file(filp); + + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + + return ret; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1616,6 +1908,11 @@ resizefs_out: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); + case EXT4_IOC_GET_TUNE_SB_PARAM: + return ext4_ioctl_get_tune_sb(EXT4_SB(sb), + (void __user *)arg); + case EXT4_IOC_SET_TUNE_SB_PARAM: + return ext4_ioctl_set_tune_sb(filp, (void __user *)arg); default: return -ENOTTY; } @@ -1703,7 +2000,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } #endif -static void set_overhead(struct ext4_super_block *es, const void *arg) +static void set_overhead(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8b18802e83eb..9087183602e4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3655,16 +3655,26 @@ static void ext4_discard_work(struct work_struct *work) static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) { + if (!sbi->s_mb_avg_fragment_size) + return; + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_avg_fragment_size[i]); + kfree(sbi->s_mb_avg_fragment_size); + sbi->s_mb_avg_fragment_size = NULL; } static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) { + if (!sbi->s_mb_largest_free_orders) + return; + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
xa_destroy(&sbi->s_mb_largest_free_orders[i]); + kfree(sbi->s_mb_largest_free_orders); + sbi->s_mb_largest_free_orders = NULL; } int ext4_mb_init(struct super_block *sb) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 51661570cf3b..ab1ff51302fb 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -231,9 +231,9 @@ static int kmmpd(void *data) * Adjust the mmp_check_interval depending on how much time * it took for the MMP block to be written. */ - mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, - EXT4_MMP_MAX_CHECK_INTERVAL), - EXT4_MMP_MIN_CHECK_INTERVAL); + mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MIN_CHECK_INTERVAL, + EXT4_MMP_MAX_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index adae3caf175a..4b091c21908f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -225,7 +225,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) do { if (bh_offset(bh) + blocksize <= from) continue; - if (bh_offset(bh) > to) + if (bh_offset(bh) >= to) break; wait_on_buffer(bh); if (buffer_uptodate(bh)) diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 524d4658fa40..33c3a89396b1 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -109,11 +109,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); - /* - * Inode orphaned in orphan file or in orphan list? - */ - if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || - !list_empty(&EXT4_I(inode)->i_orphan)) + if (ext4_inode_orphan_tracked(inode)) return 0; /* @@ -587,9 +583,20 @@ int ext4_init_orphan_info(struct super_block *sb) ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } + /* + * This is just an artificial limit to prevent corrupted fs from + * consuming absurd amounts of memory when pinning blocks of orphan + * file in memory. 
+ */ + if (inode->i_size > 8 << 20) { + ext4_msg(sb, KERN_ERR, "orphan file too big: %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto out_put; + } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; - oi->of_binfo = kmalloc_array(oi->of_blocks, + oi->of_binfo = kvmalloc_array(oi->of_blocks, sizeof(struct ext4_orphan_block), GFP_KERNEL); if (!oi->of_binfo) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7f2d4014d128..33e7c08c9529 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -265,6 +265,15 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, return __ext4_sb_bread_gfp(sb, block, 0, gfp); } +struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, + sector_t block) +{ + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL; + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); +} + void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, @@ -1438,9 +1447,9 @@ static void ext4_free_in_core_inode(struct inode *inode) static void ext4_destroy_inode(struct inode *inode) { - if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + if (ext4_inode_orphan_tracked(inode)) { ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): orphan list check failed!", + "Inode %lu (%p): inode tracked as orphan!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), @@ -2466,7 +2475,7 @@ static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *s_mount_opts = NULL; + char s_mount_opts[65]; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; @@ -2474,15 +2483,11 @@ static int parse_apply_sb_mount_options(struct super_block *sb, if (!sbi->s_es->s_mount_opts[0]) return 0; - s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, - sizeof(sbi->s_es->s_mount_opts), - GFP_KERNEL); - if (!s_mount_opts) - return ret; + strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts); fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) - goto out_free; + return -ENOMEM; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) @@ -2514,11 +2519,8 @@ parse_failed: ret = 0; out_free: - if (fc) { - ext4_fc_free(fc); - kfree(fc); - } - kfree(s_mount_opts); + ext4_fc_free(fc); + kfree(fc); return ret; } @@ -2964,11 +2966,11 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + ext4_get_resuid(es) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + ext4_get_resgid(es) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? 
-1 : le16_to_cpu(es->s_errors); @@ -5283,8 +5285,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_set_def_opts(sb, es); - sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); - sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es)); + sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resgid(es)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5a6fe1513fd2..ce7253b3f549 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -251,6 +251,10 @@ check_xattrs(struct inode *inode, struct buffer_head *bh, err_str = "invalid ea_ino"; goto errout; } + if (ea_ino && !size) { + err_str = "invalid size in ea xattr"; + goto errout; + } if (size > EXT4_XATTR_SIZE_MAX) { err_str = "e_value size too large"; goto errout; @@ -1019,7 +1023,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { struct ext4_iloc iloc; - s64 ref_count; + u64 ref_count; int ret; inode_lock_nested(ea_inode, I_MUTEX_XATTR); @@ -1029,13 +1033,17 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, goto out; ref_count = ext4_xattr_inode_get_ref(ea_inode); + if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) { + ext4_error_inode(ea_inode, __func__, __LINE__, 0, + "EA inode %lu ref wraparound: ref_count=%llu ref_change=%d", + ea_inode->i_ino, ref_count, ref_change); + ret = -EFSCORRUPTED; + goto out; + } ref_count += ref_change; ext4_xattr_inode_set_ref(ea_inode, ref_count); if (ref_change > 0) { - WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", - ea_inode->i_ino, ref_count); - if (ref_count == 1) { WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); @@ -1044,9 +1052,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, ext4_orphan_del(handle, ea_inode); } } else { - WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", - ea_inode->i_ino, ref_count); - if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, "EA inode %lu i_nlink=%u", @@ -1530,7 +1535,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && !(current->flags & PF_MEMALLOC_NOFS)); - ea_data = kvmalloc(value_len, GFP_KERNEL); + ea_data = kvmalloc(value_len, GFP_NOFS); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index db3831f7f2f5..bbe07e3a6c75 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1442,6 +1442,34 @@ u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) return get_sectors_written(sbi->sb->s_bdev); } +static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type) +{ + cpc->stats.times[type] = ktime_get(); +} + +static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long long sb_diff, cur_diff; + enum cp_time ct; + + sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END], + sbi->cp_stats.times[CP_TIME_START]); + cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END], + cpc->stats.times[CP_TIME_START]); + + if (cur_diff > sb_diff) { + sbi->cp_stats = cpc->stats; + if (cur_diff < CP_LONG_LATENCY_THRESHOLD) + return; + + f2fs_warn(sbi,
"checkpoint was blocked for %llu ms", cur_diff); + for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++) + f2fs_warn(sbi, "Step#%d: %llu ms", ct, + (u64)ktime_ms_delta(cpc->stats.times[ct + 1], + cpc->stats.times[ct])); + } +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1459,6 +1487,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_META); + /* start to update checkpoint, cp ver is already updated previously */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); @@ -1555,20 +1585,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_CP_META); + /* Wait for all dirty meta pages to be submitted for IO */ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); + stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META); /* wait for previous submitted meta pages writeback */ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); if (err) return err; + stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE); /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP); /* * invalidate intermediate page cache borrowed from meta inode which are @@ -1613,6 +1649,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long long ckpt_ver; int err = 0; + stat_cp_time(cpc, CP_TIME_START); + if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) return -EROFS; @@ -1624,6 +1662,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason != CP_RESIZE) f2fs_down_write(&sbi->cp_global_sem); + stat_cp_time(cpc, CP_TIME_LOCK); + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) @@ -1639,6 +1679,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (err) goto out; + stat_cp_time(cpc, CP_TIME_OP_LOCK); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); f2fs_flush_merged_writes(sbi); @@ -1678,6 +1720,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); + stat_cp_time(cpc, CP_TIME_FLUSH_META); + /* save inmem log status */ f2fs_save_inmem_curseg(sbi); @@ -1695,6 +1739,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) stat_inc_cp_count(sbi); stop: unblock_operations(sbi); + stat_cp_time(cpc, CP_TIME_END); + check_cp_time(sbi, cpc); if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); @@ -1778,6 +1824,7 @@ static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) llist_for_each_entry_safe(req, next, dispatch_list, llnode) { diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); req->ret = ret; + req->delta_time = diff; complete(&req->wait); sum_diff += diff; @@ -1873,6 +1920,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) else flush_remained_ckpt_reqs(sbi, &req); + if 
(unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) { + f2fs_warn_ratelimited(sbi, + "blocked on checkpoint for %u ms", cprc->peak_time); + dump_stack(); + } + return req.ret; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 5c1f47e45dab..6ad8d3bc6df7 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1215,9 +1215,11 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) { void *fsdata = NULL; struct page *pagep; + struct page **rpages; int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << log_cluster_size; + int i; int err; err = f2fs_is_compressed_cluster(inode, start_idx); @@ -1238,27 +1240,30 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) if (err <= 0) return err; - if (err > 0) { - struct page **rpages = fsdata; - int cluster_size = F2FS_I(inode)->i_cluster_size; - int i; - - for (i = cluster_size - 1; i >= 0; i--) { - struct folio *folio = page_folio(rpages[i]); - loff_t start = folio->index << PAGE_SHIFT; - - if (from <= start) { - folio_zero_segment(folio, 0, folio_size(folio)); - } else { - folio_zero_segment(folio, from - start, - folio_size(folio)); - break; - } - } + rpages = fsdata; + + for (i = (1 << log_cluster_size) - 1; i >= 0; i--) { + struct folio *folio = page_folio(rpages[i]); + loff_t start = (loff_t)folio->index << PAGE_SHIFT; + loff_t offset = from > start ? from - start : 0; - f2fs_compress_write_end(inode, fsdata, start_idx, true); + folio_zero_segment(folio, offset, folio_size(folio)); + + if (from >= start) + break; } - return 0; + + f2fs_compress_write_end(inode, fsdata, start_idx, true); + + err = filemap_write_and_wait_range(inode->i_mapping, + round_down(from, 1 << log_cluster_size << PAGE_SHIFT), + LLONG_MAX); + if (err) + return err; + + truncate_pagecache(inode, from); + + return f2fs_do_truncate_blocks(inode, round_up(from, PAGE_SIZE), lock); } static int f2fs_write_compressed_pages(struct compress_ctx *cc, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7961e0ddfca3..ef38e62cda8f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -733,9 +733,11 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, static bool io_type_is_mergeable(struct f2fs_bio_info *io, struct f2fs_io_info *fio) { + blk_opf_t mask = ~(REQ_PREFLUSH | REQ_FUA); + if (io->fio.op != fio->op) return false; - return io->fio.op_flags == fio->op_flags; + return (io->fio.op_flags & mask) == (fio->op_flags & mask); } static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, @@ -911,7 +913,7 @@ alloc_new: if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); - inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false)); + inc_page_count(fio->sbi, WB_DATA_TYPE(folio, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -1083,7 +1085,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, } /* This can handle encryption stuffs */ -static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, +static void f2fs_submit_page_read(struct inode *inode, struct folio *folio, block_t blkaddr, blk_opf_t op_flags, bool for_write) { @@ -1092,23 +1094,16 @@ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, folio->index, for_write); - if (IS_ERR(bio)) - return PTR_ERR(bio); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, blkaddr); - 
if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) { - iostat_update_and_unbind_ctx(bio); - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); - return -EFAULT; - } + if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) + f2fs_bug_on(sbi, 1); + inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_submit_read_bio(sbi, bio, DATA); - return 0; } static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) @@ -1265,10 +1260,8 @@ got_it: return folio; } - err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, + f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); - if (err) - goto put_err; return folio; put_err: @@ -1572,6 +1565,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; + if (flag == F2FS_GET_BLOCK_PRECACHE) + mode = LOOKUP_NODE_RA; + next_dnode: if (map->m_may_create) { if (f2fs_lfs_mode(sbi)) @@ -1778,12 +1774,13 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_read_extent_cache_range(&dn, - start_pgofs, map->m_pblk + ofs, - map->m_len - ofs); + if (map->m_len > ofs) + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); } if (map->m_next_extent) - *map->m_next_extent = pgofs + 1; + *map->m_next_extent = is_hole ? pgofs + 1 : pgofs; } f2fs_put_dnode(&dn); unlock_out: @@ -2145,16 +2142,10 @@ submit_and_realloc: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } - if (bio == NULL) { + if (bio == NULL) bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, f2fs_ra_op_flags(rac), index, false); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - bio = NULL; - goto out; - } - } /* * If the page is under writeback, we need to wait for @@ -2303,18 +2294,10 @@ submit_and_realloc: bio = NULL; } - if (!bio) { + if (!bio) bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i, f2fs_ra_op_flags(rac), folio->index, for_write); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret, true); - f2fs_put_dnode(&dn); - *bio_ret = NULL; - return ret; - } - } if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; @@ -3639,11 +3622,9 @@ repeat: err = -EFSCORRUPTED; goto put_folio; } - err = f2fs_submit_page_read(use_cow ? + f2fs_submit_page_read(use_cow ? 
F2FS_I(inode)->cow_inode : inode, folio, blkaddr, 0, true); - if (err) - goto put_folio; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fffd7749d6d1..48f4f98afb01 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,6 +16,21 @@ #include "xattr.h" #include <trace/events/f2fs.h> +static inline bool f2fs_should_fallback_to_linear(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return false; + case LOOKUP_COMPAT: + return true; + case LOOKUP_AUTO: + return !sb_no_casefold_compat_fallback(sbi->sb); + } + return false; +} + #if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -366,7 +381,7 @@ start_find_entry: out: #if IS_ENABLED(CONFIG_UNICODE) - if (!sb_no_casefold_compat_fallback(dir->i_sb) && + if (f2fs_should_fallback_to_linear(dir) && IS_CASEFOLDED(dir) && !de && use_hash) { use_hash = false; goto start_find_entry; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 199c1e7a83ef..33e09c453c70 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -604,7 +604,13 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, p = &(*p)->rb_right; leftmost = false; } else { + f2fs_err_ratelimited(sbi, "%s: corrupted extent, type: %d, " + "extent node in rb tree [%u, %u, %u], age [%llu, %llu], " + "extent node to insert [%u, %u, %u], age [%llu, %llu]", + __func__, et->type, en->ei.fofs, en->ei.blk, en->ei.len, en->ei.age, + en->ei.last_blocks, ei->fofs, ei->blk, ei->len, ei->age, ei->last_blocks); f2fs_bug_on(sbi, 1); + return NULL; } } @@ -664,6 +670,15 @@ static void __update_extent_tree_range(struct inode *inode, if (!et) return; + if (unlikely(len == 0)) { + f2fs_err_ratelimited(sbi, "%s: extent len is zero, type: %d, " + "extent [%u, %u, %u], age [%llu, %llu]", + __func__, type, tei->fofs, tei->blk, tei->len, + tei->age, tei->last_blocks); + f2fs_bug_on(sbi, 1); + return; + } + if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6e465bbc85ee..5b4e9548a231 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -131,6 +131,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; * string rather than using the MS_LAZYTIME flag, so this must remain. */ #define F2FS_MOUNT_LAZYTIME 0x40000000 +#define F2FS_MOUNT_RESERVE_NODE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -155,6 +156,18 @@ enum blkzone_allocation_policy { BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */ }; +enum bggc_io_aware_policy { + AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */ + AWARE_READ_IO, /* skip background GC if there is pending read IO */ + AWARE_NONE, /* don't aware IO for background GC */ +}; + +enum device_allocation_policy { + ALLOCATE_FORWARD_NOHINT, + ALLOCATE_FORWARD_WITHIN_HINT, + ALLOCATE_FORWARD_FROM_HINT, +}; + /* * An implementation of an rwsem that is explicitly unfair to readers. 
This * prevents priority inversion when a low-priority reader acquires the read lock @@ -172,6 +185,7 @@ struct f2fs_rwsem { struct f2fs_mount_info { unsigned int opt; block_t root_reserved_blocks; /* root reserved blocks */ + block_t root_reserved_nodes; /* root reserved nodes */ kuid_t s_resuid; /* reserved blocks for uid */ kgid_t s_resgid; /* reserved blocks for gid */ int active_logs; /* # of active logs */ @@ -212,6 +226,7 @@ struct f2fs_mount_info { int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned int lookup_mode; }; #define F2FS_FEATURE_ENCRYPT 0x00000001 @@ -266,14 +281,36 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_ENABLE_INTERVAL 16 /* 16 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ +enum cp_time { + CP_TIME_START, /* begin */ + CP_TIME_LOCK, /* after cp_global_sem */ + CP_TIME_OP_LOCK, /* after block_operation */ + CP_TIME_FLUSH_META, /* after flush sit/nat */ + CP_TIME_SYNC_META, /* after sync_meta_pages */ + CP_TIME_SYNC_CP_META, /* after sync cp meta pages */ + CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ + CP_TIME_WAIT_CP_DATA, /* after wait on cp data */ + CP_TIME_FLUSH_DEVICE, /* after flush device cache */ + CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */ + CP_TIME_END, /* after unblock_operation */ + CP_TIME_MAX, +}; + +/* time cost stats of checkpoint */ +struct cp_stats { + ktime_t times[CP_TIME_MAX]; +}; + struct cp_control { int reason; __u64 trim_start; __u64 trim_end; __u64 trim_minlen; + struct cp_stats stats; }; /* @@ -334,7 +371,10 @@ struct ckpt_req { struct completion wait; /* completion for checkpoint done */ struct llist_node llnode; /* llist_node to be linked in wait queue */ int ret; /* return code of checkpoint */ - ktime_t queue_time; /* request queued time */ + union { + ktime_t queue_time; /* request queued time */ + ktime_t delta_time; /* time in queue */ + }; }; struct ckpt_req_control { @@ -350,6 +390,9 @@ struct ckpt_req_control { unsigned int peak_time; /* peak wait time in msec until now */ }; +/* a time threshold that checkpoint was blocked for, unit: ms */ +#define CP_LONG_LATENCY_THRESHOLD 5000 + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -1375,6 +1418,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + ENABLE_TIME, UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -1454,6 +1498,12 @@ enum { TOTAL_CALL = FOREGROUND, }; +enum f2fs_lookup_mode { + LOOKUP_PERF, + LOOKUP_COMPAT, + LOOKUP_AUTO, +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1643,6 +1693,7 @@ struct f2fs_sb_info { unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ + struct cp_stats cp_stats; /* for time stat of checkpoint */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -1810,6 +1861,9 @@ struct f2fs_sb_info { spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ unsigned int first_seq_zone_segno; 
/* first segno in sequential zone */ + unsigned int bggc_io_aware; /* For adjust the BG_GC priority when pending IO */ + unsigned int allocate_section_hint; /* the boundary position between devices */ + unsigned int allocate_section_policy; /* determine the section writing priority */ /* For write statistics */ u64 sectors_written_start; @@ -2362,13 +2416,11 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, +static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi, struct inode *inode, bool cap) { if (!inode) return true; - if (!test_opt(sbi, RESERVE_ROOT)) - return false; if (IS_NOQUOTA(inode)) return true; if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) @@ -2389,7 +2441,7 @@ static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi, inode, cap)) + if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { @@ -2747,7 +2799,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; - unsigned int valid_node_count; + unsigned int valid_node_count, avail_user_node_count; unsigned int avail_user_block_count; int err; @@ -2769,15 +2821,20 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - avail_user_block_count = get_available_block_count(sbi, inode, false); + avail_user_block_count = get_available_block_count(sbi, inode, + test_opt(sbi, RESERVE_NODE)); if (unlikely(valid_block_count > avail_user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } + avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + if (test_opt(sbi, RESERVE_NODE) && + !__allow_reserved_root(sbi, inode, true)) + avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes; valid_node_count = sbi->total_valid_node_count + 1; - if (unlikely(valid_node_count > sbi->total_node_count)) { + if (unlikely(valid_node_count > avail_user_node_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } @@ -3004,13 +3061,10 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) if (sbi->gc_mode == GC_URGENT_HIGH) return true; - if (zoned_gc) { - if (is_inflight_read_io(sbi)) - return false; - } else { - if (is_inflight_io(sbi, type)) - return false; - } + if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi)) + return false; + if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type)) + return false; if (sbi->gc_mode == GC_URGENT_MID) return true; @@ -3770,6 +3824,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); * node.c */ struct node_info; +enum node_type; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); @@ -3792,7 +3847,8 @@ int f2fs_remove_inode_page(struct inode *inode); struct folio *f2fs_new_inode_folio(struct inode *inode); struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_node_folio(struct 
f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); int f2fs_move_node_folio(struct folio *node_folio, int gc_type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 42faaed6a02d..ffa045b39c01 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,15 +35,23 @@ #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> -static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +static void f2fs_zero_post_eof_page(struct inode *inode, + loff_t new_size, bool lock) { loff_t old_size = i_size_read(inode); if (old_size >= new_size) return; + if (mapping_empty(inode->i_mapping)) + return; + + if (lock) + filemap_invalidate_lock(inode->i_mapping); /* zero or drop pages only in range of [old_size, new_size] */ - truncate_pagecache(inode, old_size); + truncate_inode_pages_range(inode->i_mapping, old_size, new_size); + if (lock) + filemap_invalidate_unlock(inode->i_mapping); } static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) @@ -114,9 +122,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT, true); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); @@ -904,8 +910,16 @@ int f2fs_truncate(struct inode *inode) /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); - if (err) + if (err) { + /* + * Always truncate page #0 to avoid page cache + * leak in evict() path. 
+ */ + truncate_inode_pages_range(inode->i_mapping, + F2FS_BLK_TO_BYTES(0), + F2FS_BLK_END_BYTES(0)); return err; + } } err = f2fs_truncate_blocks(inode, i_size_read(inode), true); @@ -1141,7 +1155,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, filemap_invalidate_lock(inode->i_mapping); if (attr->ia_size > old_size) - f2fs_zero_post_eof_page(inode, attr->ia_size); + f2fs_zero_post_eof_page(inode, attr->ia_size, false); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1260,9 +1274,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1547,7 +1559,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1670,9 +1682,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1806,7 +1816,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1864,9 +1874,7 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); f2fs_balance_fs(sbi, true); @@ -4914,9 +4922,8 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, + iocb->ki_pos + iov_iter_count(from), true); return count; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 098e9f71421e..a7708cf80c04 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1071,7 +1071,7 @@ next_step: } /* phase == 2 */ - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) continue; @@ -1145,7 +1145,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return false; @@ -1794,6 +1794,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), 
GET_SUM_BLOCK(sbi, segno)); + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) { + f2fs_err(sbi, "%s: segment %u is used by log", + __func__, segno); + f2fs_bug_on(sbi, 1); + goto skip; + } + if (get_valid_blocks(sbi, segno, false) == 0) goto freed; if (gc_type == BG_GC && __is_large_section(sbi) && @@ -1805,7 +1812,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, sum = folio_address(sum_folio); if (type != GET_SUM_TYPE((&sum->footer))) { - f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", + f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA", segno, type, GET_SUM_TYPE((&sum->footer))); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_SUMMARY); @@ -2068,6 +2075,13 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; + /* + * avoid migrating empty section, as it can be allocated by + * log in parallel. + */ + if (!get_valid_blocks(sbi, segno, true)) + continue; + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) continue; @@ -2182,6 +2196,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; MAIN_SECS(sbi) += secs; + if (sbi->allocate_section_hint > MAIN_SECS(sbi)) + sbi->allocate_section_hint = MAIN_SECS(sbi); FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); @@ -2189,6 +2205,9 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) if (f2fs_is_multi_device(sbi)) { int last_dev = sbi->s_ndevs - 1; + sbi->allocate_section_hint = FDEV(0).total_segments / + SEGS_PER_SEC(sbi); + FDEV(last_dev).total_segments = (int)FDEV(last_dev).total_segments + segs; FDEV(last_dev).end_blk = diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27743b93e186..482a362f2625 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -27,12 +27,17 @@ static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; static struct kmem_cache *fsync_node_entry_slab; +static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid) +{ + return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid; +} + /* * Check whether the given nid is within node id range. 
*/ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + if (unlikely(is_invalid_nid(sbi, nid))) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); @@ -871,7 +876,8 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (!done) { - nfolio[i] = f2fs_get_node_folio(sbi, nids[i]); + nfolio[i] = f2fs_get_node_folio(sbi, nids[i], + NODE_TYPE_NON_INODE); if (IS_ERR(nfolio[i])) { err = PTR_ERR(nfolio[i]); f2fs_folio_put(nfolio[0], false); @@ -989,7 +995,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - folio = f2fs_get_node_folio(sbi, dn->nid); + folio = f2fs_get_node_folio(sbi, dn->nid, NODE_TYPE_NON_INODE); if (PTR_ERR(folio) == -ENOENT) return 1; else if (IS_ERR(folio)) @@ -1033,7 +1039,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid); + folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid, + NODE_TYPE_NON_INODE); if (IS_ERR(folio)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio)); return PTR_ERR(folio); @@ -1111,7 +1118,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i]); + folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i], + NODE_TYPE_NON_INODE); if (IS_ERR(folios[i])) { err = PTR_ERR(folios[i]); idx = i - 1; @@ -1496,21 +1504,37 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype) { - if (unlikely(nid != nid_of_node(folio) || - (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) || - (ntype == NODE_TYPE_XATTR && - !f2fs_has_xattr_block(ofs_of_node(folio))) || - time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { - f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " - "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(folio), ino_of_node(folio), - ofs_of_node(folio), cpver_of_node(folio), - next_blkaddr_of_node(folio)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); - return -EFSCORRUPTED; + if (unlikely(nid != nid_of_node(folio))) + goto out_err; + + switch (ntype) { + case NODE_TYPE_INODE: + if (!IS_INODE(folio)) + goto out_err; + break; + case NODE_TYPE_XATTR: + if (!f2fs_has_xattr_block(ofs_of_node(folio))) + goto out_err; + break; + case NODE_TYPE_NON_INODE: + if (IS_INODE(folio)) + goto out_err; + break; + default: + break; } + if (time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER)) + goto out_err; return 0; +out_err: + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), + next_blkaddr_of_node(folio)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; } static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, @@ -1546,7 +1570,7 @@ repeat: if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; - goto out_err; + goto out_put_err; } if (!f2fs_inode_chksum_verify(sbi, folio)) { @@ -1567,9 +1591,10 @@ out_put_err: 
return ERR_PTR(err); } -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid) +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type) { - return __get_node_folio(sbi, nid, NULL, 0, NODE_TYPE_REGULAR); + return __get_node_folio(sbi, nid, NULL, 0, node_type); } struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) @@ -2634,6 +2659,16 @@ retry: f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); + + if (unlikely(is_invalid_nid(sbi, i->nid))) { + spin_unlock(&nm_i->nid_list_lock); + f2fs_err(sbi, "Corrupted nid %u in free_nid_list", + i->nid); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_NID); + return false; + } + *nid = i->nid; __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 030390543b54..9cb8dcf8d417 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -57,6 +57,7 @@ enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, NODE_TYPE_XATTR, + NODE_TYPE_NON_INODE, }; /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4cb3a91801b4..215e442db72c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -548,7 +548,7 @@ got_it: } /* Get the node page */ - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return PTR_ERR(node_folio); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cc82d42ef14c..b45eace879d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2774,6 +2774,8 @@ static int get_new_segment(struct f2fs_sb_info *sbi, unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); + unsigned int alloc_policy = sbi->allocate_section_policy; + unsigned int alloc_hint = sbi->allocate_section_hint; bool init = true; int i; int ret = 0; @@ -2807,6 +2809,21 @@ static int get_new_segment(struct f2fs_sb_info *sbi, } #endif + /* + * Prevent allocate_section_hint from exceeding MAIN_SECS() + * due to desynchronization. 
+ */ + if (alloc_policy != ALLOCATE_FORWARD_NOHINT && + alloc_hint > MAIN_SECS(sbi)) + alloc_hint = MAIN_SECS(sbi); + + if (alloc_policy == ALLOCATE_FORWARD_FROM_HINT && + hint < alloc_hint) + hint = alloc_hint; + else if (alloc_policy == ALLOCATE_FORWARD_WITHIN_HINT && + hint >= alloc_hint) + hint = 0; + find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); @@ -3672,7 +3689,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_cow_file(inode)) + f2fs_is_cow_file(inode) || + is_inode_flag_set(inode, FI_NEED_IPU)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); @@ -3936,12 +3954,18 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && seg_type == CURSEG_COLD_DATA); + int err; if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); - if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio)) { + err = f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, + &fio->new_blkaddr, sum, type, fio); + if (unlikely(err)) { + f2fs_err_ratelimited(fio->sbi, + "%s Failed to allocate data block, ino:%u, index:%lu, type:%d, old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d", + __func__, fio->ino, folio->index, type, + fio->old_blkaddr, fio->new_blkaddr, err); if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e2ee5c686b1..1ce2c8abaf48 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -600,6 +600,16 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } +static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi, + enum log_type type, unsigned int segno) +{ + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + return CAP_BLKS_PER_SEC(sbi) - SEGS_TO_BLKS(sbi, + (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - + CURSEG_I(sbi, type)->next_blkoff; + return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); +} + static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int data_blocks, unsigned int dent_blocks) @@ -614,14 +624,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - - CURSEG_I(sbi, i)->next_blkoff; - } else { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); - } + left_blocks = get_left_section_blocks(sbi, i, segno); blocks = i <= CURSEG_COLD_DATA ? 
data_blocks : node_blocks; if (blocks > left_blocks) @@ -634,14 +637,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - - CURSEG_I(sbi, CURSEG_HOT_DATA)->next_blkoff; - } else { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); - } + left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno); if (dent_blocks > left_blocks) return false; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2619cbbd7d2d..fd8e7b0b2166 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -143,6 +143,7 @@ enum { Opt_extent_cache, Opt_data_flush, Opt_reserve_root, + Opt_reserve_node, Opt_resgid, Opt_resuid, Opt_mode, @@ -181,6 +182,7 @@ enum { Opt_nat_bits, Opt_jqfmt, Opt_checkpoint, + Opt_lookup_mode, Opt_err, }; @@ -244,6 +246,13 @@ static const struct constant_table f2fs_param_errors[] = { {} }; +static const struct constant_table f2fs_param_lookup_mode[] = { + {"perf", LOOKUP_PERF}, + {"compat", LOOKUP_COMPAT}, + {"auto", LOOKUP_AUTO}, + {} +}; + static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc), fsparam_flag("disable_roll_forward", Opt_disable_roll_forward), @@ -265,6 +274,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_flag_no("extent_cache", Opt_extent_cache), fsparam_flag("data_flush", Opt_data_flush), fsparam_u32("reserve_root", Opt_reserve_root), + fsparam_u32("reserve_node", Opt_reserve_node), fsparam_gid("resgid", Opt_resgid), fsparam_uid("resuid", Opt_resuid), fsparam_enum("mode", Opt_mode, f2fs_param_mode), @@ -300,6 +310,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode), fsparam_flag("age_extent_cache", Opt_age_extent_cache), fsparam_enum("errors", Opt_errors, f2fs_param_errors), + fsparam_enum("lookup_mode", Opt_lookup_mode, f2fs_param_lookup_mode), {} }; @@ -336,6 +347,8 @@ static match_table_t f2fs_checkpoint_tokens = { #define F2FS_SPEC_discard_unit (1 << 21) #define F2FS_SPEC_memory_mode (1 << 22) #define F2FS_SPEC_errors (1 << 23) +#define F2FS_SPEC_lookup_mode (1 << 24) +#define F2FS_SPEC_reserve_node (1 << 25) struct f2fs_fs_context { struct f2fs_mount_info info; @@ -437,22 +450,30 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count >> 3), + block_t block_limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); + block_t node_limit = sbi->total_node_count >> 3; /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && - F2FS_OPTION(sbi).root_reserved_blocks > limit) { - F2FS_OPTION(sbi).root_reserved_blocks = limit; + F2FS_OPTION(sbi).root_reserved_blocks > block_limit) { + F2FS_OPTION(sbi).root_reserved_blocks = block_limit; f2fs_info(sbi, "Reduce reserved blocks for root = %u", F2FS_OPTION(sbi).root_reserved_blocks); } - if (!test_opt(sbi, RESERVE_ROOT) && + if (test_opt(sbi, RESERVE_NODE) && + F2FS_OPTION(sbi).root_reserved_nodes > node_limit) { + F2FS_OPTION(sbi).root_reserved_nodes = node_limit; + f2fs_info(sbi, "Reduce reserved nodes for root = %u", + F2FS_OPTION(sbi).root_reserved_nodes); + } + if (!test_opt(sbi, RESERVE_ROOT) && !test_opt(sbi, RESERVE_NODE) && 
(!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) - f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root" + " and reserve_node", from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, @@ -847,6 +868,11 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32; ctx->spec_mask |= F2FS_SPEC_reserve_root; break; + case Opt_reserve_node: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + F2FS_CTX_INFO(ctx).root_reserved_nodes = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_node; + break; case Opt_resuid: F2FS_CTX_INFO(ctx).s_resuid = result.uid; ctx->spec_mask |= F2FS_SPEC_resuid; @@ -994,6 +1020,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: + F2FS_CTX_INFO(ctx).unusable_cap_perc = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + F2FS_CTX_INFO(ctx).unusable_cap = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; default: @@ -1149,6 +1179,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_nat_bits: ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS); break; + case Opt_lookup_mode: + F2FS_CTX_INFO(ctx).lookup_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_lookup_mode; + break; } return 0; } @@ -1191,7 +1225,11 @@ static int f2fs_check_quota_consistency(struct fs_context *fc, goto err_jquota_change; if (old_qname) { - if (strcmp(old_qname, new_qname) == 0) { + if (!new_qname) { + f2fs_info(sbi, "remove qf_name %s", + old_qname); + continue; + } else if (strcmp(old_qname, new_qname) == 0) { ctx->qname_mask &= ~(1 << i); continue; } @@ -1430,6 +1468,14 @@ static int f2fs_check_opt_consistency(struct fs_context *fc, ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT; } + if (test_opt(sbi, RESERVE_NODE) && + (ctx->opt_mask & F2FS_MOUNT_RESERVE_NODE) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) { + f2fs_info(sbi, "Preserve previous reserve_node=%u", + F2FS_OPTION(sbi).root_reserved_nodes); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_NODE; + } err = f2fs_check_test_dummy_encryption(fc, sb); if (err) @@ -1629,6 +1675,9 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) if (ctx->spec_mask & F2FS_SPEC_reserve_root) F2FS_OPTION(sbi).root_reserved_blocks = F2FS_CTX_INFO(ctx).root_reserved_blocks; + if (ctx->spec_mask & F2FS_SPEC_reserve_node) + F2FS_OPTION(sbi).root_reserved_nodes = + F2FS_CTX_INFO(ctx).root_reserved_nodes; if (ctx->spec_mask & F2FS_SPEC_resgid) F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid; if (ctx->spec_mask & F2FS_SPEC_resuid) @@ -1658,6 +1707,8 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode; if (ctx->spec_mask & F2FS_SPEC_errors) F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors; + if (ctx->spec_mask & F2FS_SPEC_lookup_mode) + F2FS_OPTION(sbi).lookup_mode = F2FS_CTX_INFO(ctx).lookup_mode; f2fs_apply_compression(fc, sb); f2fs_apply_test_dummy_encryption(fc, sb); @@ -2349,9 +2400,11 @@ static int f2fs_show_options(struct seq_file 
*seq, struct dentry *root) else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); - if (test_opt(sbi, RESERVE_ROOT)) - seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + if (test_opt(sbi, RESERVE_ROOT) || test_opt(sbi, RESERVE_NODE)) + seq_printf(seq, ",reserve_root=%u,reserve_node=%u,resuid=%u," + "resgid=%u", F2FS_OPTION(sbi).root_reserved_blocks, + F2FS_OPTION(sbi).root_reserved_nodes, from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, @@ -2422,6 +2475,13 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, NAT_BITS)) seq_puts(seq, ",nat_bits"); + if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_PERF) + seq_show_option(seq, "lookup_mode", "perf"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_COMPAT) + seq_show_option(seq, "lookup_mode", "compat"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_AUTO) + seq_show_option(seq, "lookup_mode", "auto"); + return 0; } @@ -2486,6 +2546,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) #endif f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL); + + F2FS_OPTION(sbi).lookup_mode = LOOKUP_PERF; } #ifdef CONFIG_QUOTA @@ -2566,21 +2628,39 @@ out_unlock: restore_flag: sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + f2fs_info(sbi, "f2fs_disable_checkpoint() finish, err:%d", err); return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - int retry = DEFAULT_RETRY_IO_COUNT; + unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; + long long start, writeback, end; + + f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", + get_pages(sbi, F2FS_DIRTY_META), + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DATA)); + + f2fs_update_time(sbi, ENABLE_TIME); + + start = ktime_get(); /* we should flush all the data to keep data consistency */ - do { - sync_inodes_sb(sbi->sb); + while (get_pages(sbi, F2FS_DIRTY_DATA)) { + writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); - if (unlikely(retry < 0)) - f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); + if (f2fs_time_over(sbi, ENABLE_TIME)) + break; + } + writeback = ktime_get(); + + sync_inodes_sb(sbi->sb); + + if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld", + get_pages(sbi, F2FS_DIRTY_DATA)); f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); @@ -2593,6 +2673,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); + + end = ktime_get(); + + f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu", + ktime_ms_delta(writeback, start), + ktime_ms_delta(end, writeback)); } static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) @@ -4156,6 +4242,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->total_node_count = SEGS_TO_BLKS(sbi, ((le32_to_cpu(raw_super->segment_count_nat) / 2) * NAT_ENTRY_PER_BLOCK)); + sbi->allocate_section_hint = le32_to_cpu(raw_super->section_count); + sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT; F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = 
le32_to_cpu(raw_super->meta_ino); @@ -4179,6 +4267,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL; sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); @@ -4637,9 +4726,11 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); sbi->aligned_blksize = true; + sbi->bggc_io_aware = AWARE_ALL_IO; #ifdef CONFIG_BLK_DEV_ZONED sbi->max_open_zones = UINT_MAX; sbi->blkzone_alloc_policy = BLKZONE_ALLOC_PRIOR_SEQ; + sbi->bggc_io_aware = AWARE_READ_IO; #endif for (i = 0; i < max_devices; i++) { @@ -4667,6 +4758,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) SEGS_TO_BLKS(sbi, FDEV(i).total_segments) - 1 + le32_to_cpu(raw_super->segment0_blkaddr); + sbi->allocate_section_hint = FDEV(i).total_segments / + SEGS_PER_SEC(sbi); } else { FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; FDEV(i).end_blk = FDEV(i).start_blk + diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f736052dea50..6d2a4fba68a2 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -281,6 +281,22 @@ static ssize_t encoding_flags_show(struct f2fs_attr *a, le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags)); } +static ssize_t effective_lookup_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return sysfs_emit(buf, "perf\n"); + case LOOKUP_COMPAT: + return sysfs_emit(buf, "compat\n"); + case LOOKUP_AUTO: + if (sb_no_casefold_compat_fallback(sbi->sb)) + return sysfs_emit(buf, "auto:perf\n"); + return sysfs_emit(buf, "auto:compat\n"); + } + return 0; +} + static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -866,6 +882,27 @@ out: return count; } + if (!strcmp(a->attr.name, "bggc_io_aware")) { + if (t < AWARE_ALL_IO || t > AWARE_NONE) + return -EINVAL; + sbi->bggc_io_aware = t; + return count; + } + + if (!strcmp(a->attr.name, "allocate_section_hint")) { + if (t < 0 || t > MAIN_SECS(sbi)) + return -EINVAL; + sbi->allocate_section_hint = t; + return count; + } + + if (!strcmp(a->attr.name, "allocate_section_policy")) { + if (t < ALLOCATE_FORWARD_NOHINT || t > ALLOCATE_FORWARD_FROM_HINT) + return -EINVAL; + sbi->allocate_section_policy = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -1138,6 +1175,8 @@ F2FS_SBI_GENERAL_RW_ATTR(max_victim_search); F2FS_SBI_GENERAL_RW_ATTR(migration_granularity); F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity); F2FS_SBI_GENERAL_RW_ATTR(dir_level); +F2FS_SBI_GENERAL_RW_ATTR(allocate_section_hint); +F2FS_SBI_GENERAL_RW_ATTR(allocate_section_policy); #ifdef CONFIG_F2FS_IOSTAT F2FS_SBI_GENERAL_RW_ATTR(iostat_enable); F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms); @@ -1175,6 +1214,7 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif F2FS_SBI_GENERAL_RW_ATTR(carve_out); F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); +F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1211,6 +1251,7 @@ F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(encoding_flags); +F2FS_GENERAL_RO_ATTR(effective_lookup_mode); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); 
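The new lookup_mode mount option and its read-only effective_lookup_mode sysfs companion can be exercised from userspace. A minimal sketch, assuming a volume on /dev/vdb (the device, mount point and per-device sysfs directory name are illustrative, not taken from the patch):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd;

	/* pick the mode at mount time: perf, compat or auto */
	if (mount("/dev/vdb", "/mnt", "f2fs", 0, "lookup_mode=auto") < 0) {
		perror("mount");
		return 1;
	}

	/* "auto" is reported back as either auto:perf or auto:compat */
	fd = open("/sys/fs/f2fs/vdb/effective_lookup_mode", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}
		close(fd);
	}
	return 0;
}
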
F2FS_GENERAL_RO_ATTR(pending_discard); @@ -1303,6 +1344,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), ATTR_LIST(umount_discard_timeout), + ATTR_LIST(bggc_io_aware), #ifdef CONFIG_F2FS_IOSTAT ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), @@ -1329,6 +1371,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), ATTR_LIST(encoding_flags), + ATTR_LIST(effective_lookup_mode), ATTR_LIST(mounted_time_sec), #ifdef CONFIG_F2FS_STAT_FS ATTR_LIST(cp_foreground_calls), @@ -1371,6 +1414,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_read_extent_count), ATTR_LIST(carve_out), ATTR_LIST(reserved_pin_section), + ATTR_LIST(allocate_section_hint), + ATTR_LIST(allocate_section_policy), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -1723,12 +1768,15 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq, seq_printf(seq, " Main : 0x%010x (%10d)\n", SM_I(sbi)->main_blkaddr, le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main)); - seq_printf(seq, " # of Sections : %12d\n", - le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); + seq_printf(seq, " Block size : %12lu KB\n", F2FS_BLKSIZE >> 10); + seq_printf(seq, " Segment size : %12d MB\n", + (BLKS_PER_SEG(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10); seq_printf(seq, " Segs/Sections : %12d\n", SEGS_PER_SEC(sbi)); seq_printf(seq, " Section size : %12d MB\n", - SEGS_PER_SEC(sbi) << 1); + (BLKS_PER_SEC(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10); + seq_printf(seq, " # of Sections : %12d\n", + le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); if (!f2fs_is_multi_device(sbi)) return 0; @@ -1742,6 +1790,69 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused donation_list_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + struct f2fs_inode_info *fi; + struct dentry *dentry; + char *buf, *path; + int i; + + buf = f2fs_getname(sbi); + if (!buf) + return 0; + + seq_printf(seq, "Donation List\n"); + seq_printf(seq, " # of files : %u\n", sbi->donate_files); + seq_printf(seq, " %-50s %10s %20s %20s %22s\n", + "File path", "Status", "Donation offset (kb)", + "Donation size (kb)", "File cached size (kb)"); + seq_printf(seq, "---\n"); + + for (i = 0; i < sbi->donate_files; i++) { + spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + if (!inode) + continue; + + inode_lock_shared(inode); + + dentry = d_find_alias(inode); + if (!dentry) { + path = NULL; + } else { + path = dentry_path_raw(dentry, buf, PATH_MAX); + if (IS_ERR(path)) + goto next; + } + seq_printf(seq, " %-50s %10s %20llu %20llu %22llu\n", + path ? path : "<unlinked>", + is_inode_flag_set(inode, FI_DONATE_FINISHED) ? 
+ "Evicted" : "Donated", + (loff_t)fi->donate_start << (PAGE_SHIFT - 10), + (loff_t)(fi->donate_end + 1) << (PAGE_SHIFT - 10), + (loff_t)inode->i_mapping->nrpages << (PAGE_SHIFT - 10)); +next: + dput(dentry); + inode_unlock_shared(inode); + iput(inode); + } + f2fs_putname(buf); + return 0; +} + #ifdef CONFIG_F2FS_FAULT_INJECTION static int __maybe_unused inject_stats_seq_show(struct seq_file *seq, void *offset) @@ -1851,6 +1962,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) discard_plist_seq_show, sb); proc_create_single_data("disk_map", 0444, sbi->s_proc, disk_map_seq_show, sb); + proc_create_single_data("donation_list", 0444, sbi->s_proc, + donation_list_seq_show, sb); #ifdef CONFIG_F2FS_FAULT_INJECTION proc_create_single_data("inject_stats", 0444, sbi->s_proc, inject_stats_seq_show, sb); diff --git a/fs/fat/dir.c b/fs/fat/dir.c index acbec5bdd521..92b091783966 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1209,7 +1209,7 @@ EXPORT_SYMBOL_GPL(fat_alloc_new_dir); static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, int *nr_cluster, struct msdos_dir_entry **de, - struct buffer_head **bh, loff_t *i_pos) + struct buffer_head **bh) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -1269,7 +1269,6 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, get_bh(bhs[n]); *bh = bhs[n]; *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); - *i_pos = fat_make_i_pos(sb, *bh, *de); /* Second stage: clear the rest of cluster, and write outs */ err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE); @@ -1298,7 +1297,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ struct msdos_dir_entry *de; int err, free_slots, i, nr_bhs; - loff_t pos, i_pos; + loff_t pos; sinfo->nr_slots = nr_slots; @@ -1386,7 +1385,7 @@ found: * add the cluster to dir. */ cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster, - &de, &bh, &i_pos); + &de, &bh); if (cluster < 0) { err = cluster; goto error_remove; diff --git a/fs/file_table.c b/fs/file_table.c index 81c72576e548..b223d873e48b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -54,7 +54,7 @@ struct backing_file { #define backing_file(f) container_of(f, struct backing_file, file) -struct path *backing_file_user_path(const struct file *f) +const struct path *backing_file_user_path(const struct file *f) { return &backing_file(f)->user_path; } @@ -171,7 +171,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. 
*/ mutex_init(&f->f_pos_lock); - memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->__f_path, 0, sizeof(f->f_path)); memset(&f->f_ra, 0, sizeof(f->f_ra)); f->f_flags = flags; @@ -319,7 +319,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) static void file_init_path(struct file *file, const struct path *path, const struct file_operations *fop) { - file->f_path = *path; + file->__f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); diff --git a/fs/fs_context.c b/fs/fs_context.c index 666e61753aed..93b7ebf8d927 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -161,25 +161,24 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) EXPORT_SYMBOL(vfs_parse_fs_param); /** - * vfs_parse_fs_string - Convenience function to just parse a string. + * vfs_parse_fs_qstr - Convenience function to just parse a string. * @fc: Filesystem context. * @key: Parameter name. * @value: Default value. - * @v_size: Maximum number of bytes in the value. */ -int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size) +int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, + const struct qstr *value) { int ret; struct fs_parameter param = { .key = key, .type = fs_value_is_flag, - .size = v_size, + .size = value ? value->len : 0, }; if (value) { - param.string = kmemdup_nul(value, v_size, GFP_KERNEL); + param.string = kmemdup_nul(value->name, value->len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; @@ -189,7 +188,7 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key, kfree(param.string); return ret; } -EXPORT_SYMBOL(vfs_parse_fs_string); +EXPORT_SYMBOL(vfs_parse_fs_qstr); /** * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data @@ -218,16 +217,14 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, while ((key = sep(&options)) != NULL) { if (*key) { - size_t v_len = 0; char *value = strchr(key, '='); if (value) { if (unlikely(value == key)) continue; *value++ = 0; - v_len = strlen(value); } - ret = vfs_parse_fs_string(fc, key, value, v_len); + ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index a774166264de..3a4ae632c94a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -13,7 +13,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See <file:Documentation/filesystems/fuse.rst> for more information. + See <file:Documentation/filesystems/fuse/fuse.rst> for more information. See <file:Documentation/Changes> for needed library/utility version. 
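Returning to the vfs_parse_fs_qstr() conversion in fs_context.c above: the value and its length now travel together in a struct qstr, so callers no longer pass a separate v_size. A hedged sketch of the calling convention, assuming the QSTR() compound-literal initializer from <linux/dcache.h> (the key names here are arbitrary):

#include <linux/dcache.h>
#include <linux/fs_context.h>

static int demo_parse(struct fs_context *fc)
{
	struct qstr v = QSTR("strict");
	int err;

	/* string-valued parameter: the length is carried by the qstr */
	err = vfs_parse_fs_qstr(fc, "errors", &v);
	if (err)
		return err;

	/* a NULL value still means a flag-type parameter */
	return vfs_parse_fs_qstr(fc, "ro", NULL);
}

Note that the monolithic option parser above keeps calling a three-argument vfs_parse_fs_string() for its NUL-terminated values, so the old name survives as a convenience wrapper.
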
If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3f0f312a31c1..22ad9538dfc4 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,10 +10,11 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y := trace.o # put trace.o first so we see ftrace errors sooner +fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o -fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o +fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o fuse-$(CONFIG_SYSCTL) += sysctl.o fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c new file mode 100644 index 000000000000..4afda419dd14 --- /dev/null +++ b/fs/fuse/backing.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE passthrough to backing file. + * + * Copyright (c) 2023 CTERA Networks. + */ + +#include "fuse_i.h" + +#include <linux/file.h> + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + if (fb && refcount_inc_not_zero(&fb->count)) + return fb; + return NULL; +} + +static void fuse_backing_free(struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p\n", __func__, fb); + + if (fb->file) + fput(fb->file); + put_cred(fb->cred); + kfree_rcu(fb, rcu); +} + +void fuse_backing_put(struct fuse_backing *fb) +{ + if (fb && refcount_dec_and_test(&fb->count)) + fuse_backing_free(fb); +} + +void fuse_backing_files_init(struct fuse_conn *fc) +{ + idr_init(&fc->backing_files_map); +} + +static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock(&fc->lock); + /* FIXME: xarray might be space inefficient */ + id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->lock); + idr_preload_end(); + + WARN_ON_ONCE(id == 0); + return id; +} + +static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, + int id) +{ + struct fuse_backing *fb; + + spin_lock(&fc->lock); + fb = idr_remove(&fc->backing_files_map, id); + spin_unlock(&fc->lock); + + return fb; +} + +static int fuse_backing_id_free(int id, void *p, void *data) +{ + struct fuse_backing *fb = p; + + WARN_ON_ONCE(refcount_read(&fb->count) != 1); + fuse_backing_free(fb); + return 0; +} + +void fuse_backing_files_free(struct fuse_conn *fc) +{ + idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); + idr_destroy(&fc->backing_files_map); +} + +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) +{ + struct file *file; + struct super_block *backing_sb; + struct fuse_backing *fb = NULL; + int res; + + pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + res = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + res = -EINVAL; + if (map->flags || map->padding) + goto out; + + file = fget_raw(map->fd); + res = -EBADF; + if (!file) + goto out; + + /* read/write/splice/mmap passthrough only relevant for regular files */ + res = d_is_dir(file->f_path.dentry) ? 
-EISDIR : -EINVAL; + if (!d_is_reg(file->f_path.dentry)) + goto out_fput; + + backing_sb = file_inode(file)->i_sb; + res = -ELOOP; + if (backing_sb->s_stack_depth >= fc->max_stack_depth) + goto out_fput; + + fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); + res = -ENOMEM; + if (!fb) + goto out_fput; + + fb->file = file; + fb->cred = prepare_creds(); + refcount_set(&fb->count, 1); + + res = fuse_backing_id_alloc(fc, fb); + if (res < 0) { + fuse_backing_free(fb); + fb = NULL; + } + +out: + pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); + + return res; + +out_fput: + fput(file); + goto out; +} + +int fuse_backing_close(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb = NULL; + int err; + + pr_debug("%s: backing_id=%d\n", __func__, backing_id); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + err = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + err = -ENOENT; + fb = fuse_backing_id_remove(fc, backing_id); + if (!fb) + goto out; + + fuse_backing_put(fb); + err = 0; +out: + pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); + + return err; +} + +struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb; + + rcu_read_lock(); + fb = idr_find(&fc->backing_files_map, backing_id); + fb = fuse_backing_get(fb); + rcu_read_unlock(); + + return fb; +} diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b39844d75a80..28c96961e85d 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -52,6 +52,7 @@ #include <linux/user_namespace.h> #include "fuse_i.h" +#include "fuse_dev_i.h" #define CUSE_CONNTBL_LEN 64 @@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = file->private_data; + struct fuse_dev *fud = __fuse_get_dev(file); struct cuse_conn *cc = fc_to_cc(fud->fc); /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ad8645c0f9fe..132f38619d70 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,7 +25,6 @@ #include <linux/sched.h> #include <linux/seq_file.h> -#define CREATE_TRACE_POINTS #include "fuse_trace.h" MODULE_ALIAS_MISCDEV(FUSE_MINOR); @@ -207,8 +206,9 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, if (fuse_block_alloc(fc, for_background)) { err = -EINTR; - if (wait_event_killable_exclusive(fc->blocked_waitq, - !fuse_block_alloc(fc, for_background))) + if (wait_event_state_exclusive(fc->blocked_waitq, + !fuse_block_alloc(fc, for_background), + (TASK_KILLABLE | TASK_FREEZABLE))) goto out; } /* Matches smp_wmb() in fuse_set_initialized() */ @@ -322,6 +322,7 @@ unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } +EXPORT_SYMBOL_GPL(fuse_req_hash); /* * A new request is available, wake fiq->waitq @@ -369,12 +370,32 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) } } +static inline void fuse_request_assign_unique_locked(struct fuse_iqueue *fiq, + struct fuse_req *req) +{ + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique_locked(fiq); + + /* tracepoint captures in.h.unique and in.h.len */ + trace_fuse_request_send(req); +} + +inline void fuse_request_assign_unique(struct fuse_iqueue *fiq, + struct fuse_req *req) +{ + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = 
fuse_get_unique(fiq); + + /* tracepoint captures in.h.unique and in.h.len */ + trace_fuse_request_send(req); +} +EXPORT_SYMBOL_GPL(fuse_request_assign_unique); + static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (fiq->connected) { - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique_locked(fiq); + fuse_request_assign_unique_locked(fiq, req); list_add_tail(&req->list, &fiq->pending); fuse_dev_wake_and_unlock(fiq); } else { @@ -397,7 +418,6 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - trace_fuse_request_send(req); fiq->ops->send_req(fiq, req); } @@ -687,10 +707,10 @@ static bool fuse_request_queue_background_uring(struct fuse_conn *fc, { struct fuse_iqueue *fiq = &fc->iq; - req->in.h.unique = fuse_get_unique(fiq); req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); + fuse_request_assign_unique(fiq, req); return fuse_uring_queue_bq_req(req); } @@ -1528,14 +1548,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file) return 0; } +struct fuse_dev *fuse_get_dev(struct file *file) +{ + struct fuse_dev *fud = __fuse_get_dev(file); + int err; + + if (likely(fud)) + return fud; + + err = wait_event_interruptible(fuse_dev_waitq, + READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT); + if (err) + return ERR_PTR(err); + + fud = __fuse_get_dev(file); + if (!fud) + return ERR_PTR(-EPERM); + + return fud; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!user_backed_iter(to)) return -EINVAL; @@ -1555,8 +1595,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); @@ -1600,35 +1640,31 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_poll_wakeup_out outarg; - int err = -EINVAL; + int err; if (size != sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; fuse_copy_finish(cs); return fuse_notify_poll_wakeup(fc, &outarg); - -err: - fuse_copy_finish(cs); - return err; } static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_inode_out outarg; - int err = -EINVAL; + int err; if (size != sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; fuse_copy_finish(cs); down_read(&fc->killsb); @@ -1636,10 +1672,6 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, outarg.off, outarg.len); up_read(&fc->killsb); return err; - -err: - fuse_copy_finish(cs); - return err; } static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, @@ -1647,29 +1679,26 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, { struct fuse_notify_inval_entry_out outarg; int err; - char *buf = NULL; + char *buf; struct 
qstr name; - err = -EINVAL; if (size < sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; - err = -ENAMETOOLONG; if (outarg.namelen > fc->name_max) - goto err; + return -ENAMETOOLONG; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) - goto err; + return -EINVAL; - err = -ENOMEM; buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); if (!buf) - goto err; + return -ENOMEM; name.name = buf; name.len = outarg.namelen; @@ -1682,12 +1711,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); - kfree(buf); - return err; - err: kfree(buf); - fuse_copy_finish(cs); return err; } @@ -1696,29 +1721,25 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, { struct fuse_notify_delete_out outarg; int err; - char *buf = NULL; + char *buf; struct qstr name; - err = -EINVAL; if (size < sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; - err = -ENAMETOOLONG; if (outarg.namelen > fc->name_max) - goto err; + return -ENAMETOOLONG; - err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) - goto err; + return -EINVAL; - err = -ENOMEM; buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); if (!buf) - goto err; + return -ENOMEM; name.name = buf; name.len = outarg.namelen; @@ -1731,12 +1752,8 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); - kfree(buf); - return err; - err: kfree(buf); - fuse_copy_finish(cs); return err; } @@ -1754,17 +1771,15 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, loff_t file_size; loff_t end; - err = -EINVAL; if (size < sizeof(outarg)) - goto out_finish; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto out_finish; + return err; - err = -EINVAL; if (size - sizeof(outarg) != outarg.size) - goto out_finish; + return -EINVAL; nodeid = outarg.nodeid; @@ -1824,8 +1839,6 @@ out_iput: iput(inode); out_up_killsb: up_read(&fc->killsb); -out_finish: - fuse_copy_finish(cs); return err; } @@ -1940,13 +1953,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, u64 nodeid; int err; - err = -EINVAL; if (size != sizeof(outarg)) - goto copy_finish; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto copy_finish; + return err; fuse_copy_finish(cs); @@ -1962,10 +1974,6 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, up_read(&fc->killsb); return err; - -copy_finish: - fuse_copy_finish(cs); - return err; } /* @@ -2044,6 +2052,42 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc) return 0; } +static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_prune_out outarg; + const unsigned int batch = 512; + u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL); + unsigned int num, i; + int err; + + if (!nodeids) + return -ENOMEM; + + if (size < sizeof(outarg)) + return -EINVAL; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + return err; + + if (size - sizeof(outarg) != outarg.count * sizeof(u64)) + return -EINVAL; + + for (; outarg.count; outarg.count -= num) { + num = min(batch, 
outarg.count); + err = fuse_copy_one(cs, nodeids, num * sizeof(u64)); + if (err) + return err; + + scoped_guard(rwsem_read, &fc->killsb) { + for (i = 0; i < num; i++) + fuse_try_prune_one_inode(fc, nodeids[i]); + } + } + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -2075,8 +2119,10 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INC_EPOCH: return fuse_notify_inc_epoch(fc); + case FUSE_NOTIFY_PRUNE: + return fuse_notify_prune(fc, size, cs); + default: - fuse_copy_finish(cs); return -EINVAL; } } @@ -2156,7 +2202,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, */ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); - goto out; + goto copy_finish; } err = -EINVAL; @@ -2229,7 +2275,7 @@ copy_finish: static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; @@ -2251,11 +2297,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_dev *fud; + struct fuse_dev *fud = __fuse_get_dev(out); size_t rem; ssize_t ret; - fud = fuse_get_dev(out); if (!fud) return -EPERM; @@ -2341,7 +2386,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) + if (IS_ERR(fud)) return EPOLLERR; fiq = &fud->fc->iq; @@ -2394,7 +2439,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.rst). + * Documentation/filesystems/fuse/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be @@ -2488,7 +2533,7 @@ void fuse_wait_aborted(struct fuse_conn *fc) int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; @@ -2519,8 +2564,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fud->fc->iq.fasync); @@ -2530,7 +2575,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; - if (new->private_data) + if (__fuse_get_dev(new)) return -EINVAL; fud = fuse_dev_alloc_install(fc); @@ -2561,7 +2606,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) * uses the same ioctl handler. 
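An aside on the FUSE_NOTIFY_PRUNE handler above: the nodeid array is drained in fixed batches, so the copy buffer stays at 512 entries (4 KiB of u64s) no matter how large outarg.count is. The consumption pattern, reduced to a standalone illustration with a made-up count:

#include <stdio.h>

int main(void)
{
	const unsigned int batch = 512;
	unsigned int count = 1300;	/* e.g. a notification carrying 1300 nodeids */
	unsigned int num;

	for (; count; count -= num) {
		num = count < batch ? count : batch;	/* min(batch, count) */
		printf("copy and prune %u nodeids\n", num);	/* 512, 512, 276 */
	}
	return 0;
}
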
*/ if (fd_file(f)->f_op == file->f_op) - fud = fuse_get_dev(fd_file(f)); + fud = __fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2579,8 +2624,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2596,8 +2641,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) struct fuse_dev *fud = fuse_get_dev(file); int backing_id; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2608,6 +2653,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } +static long fuse_dev_ioctl_sync_init(struct file *file) +{ + int err = -EINVAL; + + mutex_lock(&fuse_mutex); + if (!__fuse_get_dev(file)) { + WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT); + err = 0; + } + mutex_unlock(&fuse_mutex); + return err; +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2623,6 +2681,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); + case FUSE_DEV_IOC_SYNC_INIT: + return fuse_dev_ioctl_sync_init(file); + default: return -ENOTTY; } @@ -2631,7 +2692,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, #ifdef CONFIG_PROC_FS static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (!fud) return; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index a30c44234a4e..f6b12aebb8bb 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -7,6 +7,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" #include "fuse_dev_i.h" +#include "fuse_trace.h" #include <linux/fs.h> #include <linux/io_uring/cmd.h> @@ -1139,9 +1140,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; fud = fuse_get_dev(cmd->file); - if (!fud) { + if (IS_ERR(fud)) { pr_info_ratelimited("No fuse device found\n"); - return -ENOTCONN; + return PTR_ERR(fud); } fc = fud->fc; @@ -1268,8 +1269,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) if (!queue) goto err; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); + fuse_request_assign_unique(fiq, req); spin_lock(&queue->lock); err = -ENOTCONN; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5c569c3cb53f..ecaec0fea3a1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -739,22 +739,18 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, int err; struct mnt_idmap *idmap = file_mnt_idmap(file); struct fuse_conn *fc = get_fuse_conn(dir); - struct dentry *res = NULL; if (fuse_is_bad(dir)) return -EIO; if (d_in_lookup(entry)) { - res = fuse_lookup(dir, entry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - if (res) - entry = res; + struct dentry *res = fuse_lookup(dir, entry, 0); + if (res || d_really_is_positive(entry)) + return finish_no_open(file, res); } - if (!(flags & O_CREAT) || d_really_is_positive(entry)) - goto no_open; + if (!(flags & O_CREAT)) + return finish_no_open(file, NULL); /* Only creates */ file->f_mode |= FMODE_CREATED; @@ -768,16 +764,13 @@ static int fuse_atomic_open(struct inode *dir, 
struct dentry *entry, goto mknod; } else if (err == -EEXIST) fuse_invalidate_entry(entry); -out_dput: - dput(res); return err; mknod: err = fuse_mknod(idmap, dir, entry, mode, 0); if (err) - goto out_dput; -no_open: - return finish_no_open(file, res); + return err; + return finish_no_open(file, NULL); } /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4adcf09d4b01..f1ef77a0be05 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -356,8 +356,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * Make the release synchronous if this is a fuseblk mount, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. + * + * Always use the asynchronous file put because the current thread + * might be the fuse server. This can happen if a process starts some + * aio and closes the fd before the aio completes. Since aio takes its + * own ref to the file, the IO completion has to drop the ref, which is + * how the fuse server can end up closing its clients' files. */ - fuse_file_put(ff, ff->fm->fc->destroy); + fuse_file_put(ff, false); } void fuse_release_common(struct file *file, bool isdir) @@ -865,22 +871,20 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, struct fuse_args_pages *ap = &ia->ap; size_t count = ia->read.in.size; size_t num_read = args->out_args[0].size; - struct address_space *mapping = NULL; - - for (i = 0; mapping == NULL && i < ap->num_folios; i++) - mapping = ap->folios[i]->mapping; + struct address_space *mapping; + struct inode *inode; - if (mapping) { - struct inode *inode = mapping->host; + WARN_ON_ONCE(!ap->num_folios); + mapping = ap->folios[0]->mapping; + inode = mapping->host; - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (!err && num_read < count) - fuse_short_read(inode, ia->read.attr_ver, num_read, ap); + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); - fuse_invalidate_atime(inode); - } + fuse_invalidate_atime(inode); for (i = 0; i < ap->num_folios; i++) { folio_end_read(ap->folios[i], !err); @@ -1175,7 +1179,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, num = min(iov_iter_count(ii), fc->max_write); ap->args.in_pages = true; - ap->descs[0].offset = offset; while (num && ap->num_folios < max_folios) { size_t tmp; @@ -1823,19 +1826,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) { + for (i = 0; i < ap->num_folios; i++) /* * Benchmarks showed that ending writeback within the * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. 
*/ iomap_finish_folio_write(inode, ap->folios[i], 1); - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - wb_writeout_inc(&bdi->wb); - } wake_up(&fi->page_waitq); } @@ -2010,14 +2009,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, uint32_t folio_index, loff_t offset, unsigned len) { - struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; ap->folios[folio_index] = folio; ap->descs[folio_index].offset = offset; ap->descs[folio_index].length = len; - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2960,10 +2956,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), + .len = len, .flags = flags }; struct fuse_write_out outarg; + struct fuse_copy_file_range_out outarg_64; + u64 bytes_copied; ssize_t err; /* mark unstable when write-back is not used, and file_out gets * extended */ @@ -3013,33 +3011,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.opcode = FUSE_COPY_FILE_RANGE; + args.opcode = FUSE_COPY_FILE_RANGE_64; args.nodeid = ff_in->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg_64); + args.out_args[0].value = &outarg_64; + if (fc->no_copy_file_range_64) { +fallback: + /* Fall back to old op that can't handle large copy length */ + args.opcode = FUSE_COPY_FILE_RANGE; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); + } err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_copy_file_range = 1; - err = -EOPNOTSUPP; + if (fc->no_copy_file_range_64) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } else { + fc->no_copy_file_range_64 = 1; + goto fallback; + } } - if (!err && outarg.size > len) - err = -EIO; - if (err) goto out; + bytes_copied = fc->no_copy_file_range_64 ? 
+ outarg.size : outarg_64.bytes_copied; + + if (bytes_copied > len) { + err = -EIO; + goto out; + } + truncate_inode_pages_range(inode_out->i_mapping, ALIGN_DOWN(pos_out, PAGE_SIZE), - ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); file_update_time(file_out); - fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); + fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); - err = outarg.size; + err = bytes_copied; out: if (is_unstable) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5a9bd771a319..6e8373f97040 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,8 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +extern struct wait_queue_head fuse_dev_waitq; + struct fuse_arg; struct fuse_args; struct fuse_pqueue; @@ -37,15 +39,22 @@ struct fuse_copy_state { } ring; }; -static inline struct fuse_dev *fuse_get_dev(struct file *file) +#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1) +#define FUSE_DEV_PTR_MASK (~1UL) + +static inline struct fuse_dev *__fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return READ_ONCE(file->private_data); + struct fuse_dev *fud = READ_ONCE(file->private_data); + + return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK); } +struct fuse_dev *fuse_get_dev(struct file *file); + unsigned int fuse_req_hash(u64 unique); struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cc428d04be3e..c2f2a48156d6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -856,6 +856,9 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /** Does the filesystem support copy_file_range_64? */ + unsigned no_copy_file_range_64:1; + /* Send DESTROY request */ unsigned int destroy:1; @@ -901,6 +904,9 @@ struct fuse_conn { /* Is link not implemented by fs? */ unsigned int no_link:1; + /* Is synchronous FUSE_INIT allowed? */ + unsigned int sync_init:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -1255,6 +1261,11 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); /** + * Assign a unique id to a fuse request + */ +void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req); + +/** * End a finished request */ void fuse_request_end(struct fuse_req *req); @@ -1315,7 +1326,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_mount *fm); +int fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection @@ -1407,6 +1418,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name, u32 flags); +/* + * Try to prune this inode. If neither the inode itself nor dentries associated + * with this inode have any external reference, then the inode can be freed. 
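One more note on the __fuse_copy_file_range() fallback in file.c above: when the server does not implement FUSE_COPY_FILE_RANGE_64, the request length is clamped so the result still fits the old 32-bit fuse_write_out reply. Working out that bound for 4 KiB pages (a sketch; PAGE_MASK is assumed to be ~4095):

#include <stdio.h>

int main(void)
{
	unsigned long page_mask = ~4095UL;		/* 4 KiB pages assumed */
	unsigned long limit = 0xFFFFFFFFUL & page_mask;	/* UINT_MAX & PAGE_MASK */

	/* prints: max fallback copy length: 0xfffff000 (4095 MiB) */
	printf("max fallback copy length: %#lx (%lu MiB)\n",
	       limit, limit >> 20);
	return 0;
}
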
+ */ +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid); + int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); @@ -1512,29 +1529,11 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); -/* passthrough.c */ -static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) -{ -#ifdef CONFIG_FUSE_PASSTHROUGH - return READ_ONCE(fi->fb); -#else - return NULL; -#endif -} - -static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, - struct fuse_backing *fb) -{ -#ifdef CONFIG_FUSE_PASSTHROUGH - return xchg(&fi->fb, fb); -#else - return NULL; -#endif -} - +/* backing.c */ #ifdef CONFIG_FUSE_PASSTHROUGH struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); void fuse_backing_put(struct fuse_backing *fb); +struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id); #else static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) @@ -1545,6 +1544,11 @@ static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) static inline void fuse_backing_put(struct fuse_backing *fb) { } +static inline struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, + int backing_id) +{ + return NULL; +} #endif void fuse_backing_files_init(struct fuse_conn *fc); @@ -1552,9 +1556,27 @@ void fuse_backing_files_free(struct fuse_conn *fc); int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); int fuse_backing_close(struct fuse_conn *fc, int backing_id); -struct fuse_backing *fuse_passthrough_open(struct file *file, - struct inode *inode, - int backing_id); +/* passthrough.c */ +static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return READ_ONCE(fi->fb); +#else + return NULL; +#endif +} + +static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, + struct fuse_backing *fb) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return xchg(&fi->fb, fb); +#else + return NULL; +#endif +} + +struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id); void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); static inline struct file *fuse_file_passthrough(struct fuse_file *ff) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7485a41af892..d1babf56f254 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" #include "dev_uring_i.h" #include <linux/dax.h> @@ -34,6 +35,7 @@ MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); @@ -101,14 +103,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) if (!fi) return NULL; - fi->i_time = 0; + /* Initialize private data (i.e. 
everything except fi->inode) */ + BUILD_BUG_ON(offsetof(struct fuse_inode, inode) != 0); + memset((void *) fi + sizeof(fi->inode), 0, sizeof(*fi) - sizeof(fi->inode)); + fi->inval_mask = ~0; - fi->nodeid = 0; - fi->nlookup = 0; - fi->attr_version = 0; - fi->orig_ino = 0; - fi->state = 0; - fi->submount_lookup = NULL; mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); @@ -586,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, return 0; } +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid) +{ + struct inode *inode; + + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + return; + d_prune_aliases(inode); + iput(inode); +} + bool fuse_lock_inode(struct inode *inode) { bool locked = false; @@ -1469,7 +1479,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_mount *fm) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; @@ -1528,10 +1538,30 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.out_args[0].value = &ia->out; ia->args.force = true; ia->args.nocreds = true; - ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fm, &ia->args, -ENOTCONN); + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1561,8 +1591,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* @@ -1821,6 +1849,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) !sb_set_blocksize(sb, PAGE_SIZE)) goto err; #endif + fc->sync_fs = 1; } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -1872,8 +1901,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (ctx->fudptr && *ctx->fudptr) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1881,8 +1914,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - if (ctx->fudptr) + if (ctx->fudptr) { *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); return 0; @@ -1903,6 +1938,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_mount *fm; int err; if (!ctx->file || !ctx->rootmode_present || @@ -1923,8 +1959,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); - 
fuse_send_init(get_fuse_mount_super(sb)); - return 0; + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } /* @@ -1985,7 +2023,7 @@ static int fuse_get_tree(struct fs_context *fsc) * Allow creating a fuse mount with an already initialized fuse * connection */ - fud = READ_ONCE(ctx->file->private_data); + fud = __fuse_get_dev(ctx->file); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index c99e285f3183..3728933188f3 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -177,8 +177,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) return -EINVAL; - fb = fuse_passthrough_open(file, inode, - ff->args->open_outarg.backing_id); + fb = fuse_passthrough_open(file, ff->args->open_outarg.backing_id); if (IS_ERR(fb)) return PTR_ERR(fb); diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index eb97ac009e75..72de97c03d0e 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -144,171 +144,12 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) return backing_file_mmap(backing_file, vma, &ctx); } -struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) -{ - if (fb && refcount_inc_not_zero(&fb->count)) - return fb; - return NULL; -} - -static void fuse_backing_free(struct fuse_backing *fb) -{ - pr_debug("%s: fb=0x%p\n", __func__, fb); - - if (fb->file) - fput(fb->file); - put_cred(fb->cred); - kfree_rcu(fb, rcu); -} - -void fuse_backing_put(struct fuse_backing *fb) -{ - if (fb && refcount_dec_and_test(&fb->count)) - fuse_backing_free(fb); -} - -void fuse_backing_files_init(struct fuse_conn *fc) -{ - idr_init(&fc->backing_files_map); -} - -static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) -{ - int id; - - idr_preload(GFP_KERNEL); - spin_lock(&fc->lock); - /* FIXME: xarray might be space inefficient */ - id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); - spin_unlock(&fc->lock); - idr_preload_end(); - - WARN_ON_ONCE(id == 0); - return id; -} - -static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, - int id) -{ - struct fuse_backing *fb; - - spin_lock(&fc->lock); - fb = idr_remove(&fc->backing_files_map, id); - spin_unlock(&fc->lock); - - return fb; -} - -static int fuse_backing_id_free(int id, void *p, void *data) -{ - struct fuse_backing *fb = p; - - WARN_ON_ONCE(refcount_read(&fb->count) != 1); - fuse_backing_free(fb); - return 0; -} - -void fuse_backing_files_free(struct fuse_conn *fc) -{ - idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); - idr_destroy(&fc->backing_files_map); -} - -int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) -{ - struct file *file; - struct super_block *backing_sb; - struct fuse_backing *fb = NULL; - int res; - - pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); - - /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ - res = -EPERM; - if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) - goto out; - - res = -EINVAL; - if (map->flags || map->padding) - goto out; - - file = fget_raw(map->fd); - res = -EBADF; - if (!file) - goto out; - - /* read/write/splice/mmap passthrough only relevant for regular files */ - res = d_is_dir(file->f_path.dentry) ? 
-EISDIR : -EINVAL; - if (!d_is_reg(file->f_path.dentry)) - goto out_fput; - - backing_sb = file_inode(file)->i_sb; - res = -ELOOP; - if (backing_sb->s_stack_depth >= fc->max_stack_depth) - goto out_fput; - - fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); - res = -ENOMEM; - if (!fb) - goto out_fput; - - fb->file = file; - fb->cred = prepare_creds(); - refcount_set(&fb->count, 1); - - res = fuse_backing_id_alloc(fc, fb); - if (res < 0) { - fuse_backing_free(fb); - fb = NULL; - } - -out: - pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); - - return res; - -out_fput: - fput(file); - goto out; -} - -int fuse_backing_close(struct fuse_conn *fc, int backing_id) -{ - struct fuse_backing *fb = NULL; - int err; - - pr_debug("%s: backing_id=%d\n", __func__, backing_id); - - /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ - err = -EPERM; - if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) - goto out; - - err = -EINVAL; - if (backing_id <= 0) - goto out; - - err = -ENOENT; - fb = fuse_backing_id_remove(fc, backing_id); - if (!fb) - goto out; - - fuse_backing_put(fb); - err = 0; -out: - pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); - - return err; -} - /* * Setup passthrough to a backing file. * * Returns an fb object with elevated refcount to be stored in fuse inode. */ -struct fuse_backing *fuse_passthrough_open(struct file *file, - struct inode *inode, - int backing_id) +struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; @@ -320,12 +161,8 @@ struct fuse_backing *fuse_passthrough_open(struct file *file, if (backing_id <= 0) goto out; - rcu_read_lock(); - fb = idr_find(&fc->backing_files_map, backing_id); - fb = fuse_backing_get(fb); - rcu_read_unlock(); - err = -ENOENT; + fb = fuse_backing_lookup(fc, backing_id); if (!fb) goto out; diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c new file mode 100644 index 000000000000..93bd72efc98c --- /dev/null +++ b/fs/fuse/trace.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "dev_uring_i.h" +#include "fuse_i.h" +#include "fuse_dev_i.h" + +#include <linux/pagemap.h> + +#define CREATE_TRACE_POINTS +#include "fuse_trace.h" diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 76c8fd0bfc75..6bc7c97b017d 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -20,6 +20,7 @@ #include <linux/cleanup.h> #include <linux/uio.h> #include "fuse_i.h" +#include "fuse_dev_i.h" /* Used to help calculate the FUSE connection's max_pages limit for a request's * size. 
Parts of the struct fuse_req are sliced into scattergather lists in @@ -761,7 +762,6 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) static void virtio_fs_request_complete(struct fuse_req *req, struct virtio_fs_vq *fsvq) { - struct fuse_pqueue *fpq = &fsvq->fud->pq; struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; @@ -790,9 +790,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, } } - spin_lock(&fpq->lock); clear_bit(FR_SENT, &req->flags); - spin_unlock(&fpq->lock); fuse_request_end(req); spin_lock(&fsvq->lock); @@ -1384,7 +1382,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, unsigned int out_sgs = 0; unsigned int in_sgs = 0; unsigned int total_sgs; - unsigned int i; + unsigned int i, hash; int ret; bool notify; struct fuse_pqueue *fpq; @@ -1444,8 +1442,9 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, /* Request successfully sent. */ fpq = &fsvq->fud->pq; + hash = fuse_req_hash(req->in.h.unique); spin_lock(&fpq->lock); - list_add_tail(&req->list, fpq->processing); + list_add_tail(&req->list, &fpq->processing[hash]); spin_unlock(&fpq->lock); set_bit(FR_SENT, &req->flags); /* matches barrier in request_wait_answer() */ @@ -1480,8 +1479,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) struct virtio_fs_vq *fsvq; int ret; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); + fuse_request_assign_unique(fiq, req); clear_bit(FR_PENDING, &req->flags); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8760e7e20c9d..8a7ed80d9f2d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1368,27 +1368,19 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode) { - struct dentry *d; bool excl = !!(flags & O_EXCL); - if (!d_in_lookup(dentry)) - goto skip_lookup; - - d = __gfs2_lookup(dir, dentry, file); - if (IS_ERR(d)) - return PTR_ERR(d); - if (d != NULL) - dentry = d; - if (d_really_is_positive(dentry)) { - if (!(file->f_mode & FMODE_OPENED)) + if (d_in_lookup(dentry)) { + struct dentry *d = __gfs2_lookup(dir, dentry, file); + if (file->f_mode & FMODE_OPENED) { + if (IS_ERR(d)) + return PTR_ERR(d); + dput(d); + return excl && (flags & O_CREAT) ? -EEXIST : 0; + } + if (d || d_really_is_positive(dentry)) return finish_no_open(file, d); - dput(d); - return excl && (flags & O_CREAT) ? 
-EEXIST : 0; } - - BUG_ON(d != NULL); - -skip_lookup: if (!(flags & O_CREAT)) return -ENOENT; diff --git a/fs/internal.h b/fs/internal.h index a33d18ee5b74..9b2b4d116880 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -53,7 +53,7 @@ extern int finish_clean_context(struct fs_context *fc); * namei.c */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root); + struct path *path, const struct path *root); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); @@ -84,9 +84,9 @@ void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); -int path_mount(const char *dev_name, struct path *path, +int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page); -int path_umount(struct path *path, int flags); +int path_umount(const struct path *path, int flags); int show_path(struct seq_file *m, struct dentry *root); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 38861ca04899..2d0719bf6d87 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -131,7 +131,7 @@ __flush_batch(journal_t *journal, int *batch_count) blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC); + write_dirty_buffer(journal->j_chkpt_bhs[i], JBD2_JOURNAL_REQ_FLAGS); blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index fcedeb514e14..21f3d029da7d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -59,9 +59,15 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) */ inode->i_link[inode->i_size] = '\0'; } - } else { + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &jfs_file_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + printk(KERN_DEBUG "JFS: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); } unlock_new_inode(inode); return inode; diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index ab11849cf9cc..0ab83bb7bbdf 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -2903,7 +2903,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { - if (stbl[i] < 0 || stbl[i] > 127) { + if (stbl[i] < 0 || stbl[i] >= DTPAGEMAXSLOT) { jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld", i, stbl[i], (long)ip->i_ino, (long long)bn); free_page(dirent_buf); @@ -3108,7 +3108,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack) /* get the leftmost entry */ stbl = DT_GETSTBL(p); - if (stbl[0] < 0 || stbl[0] > 127) { + if (stbl[0] < 0 || stbl[0] >= DTPAGEMAXSLOT) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "stbl[0] out of bound\n"); return -EIO; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 270808b6219b..b343c5ea1159 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1199,7 +1199,6 @@ static int open_dummy_log(struct super_block *sb) init_waitqueue_head(&dummy_log->syncwait); dummy_log->no_integrity = 1; /* Make up some stuff */ - dummy_log->base = 0; dummy_log->size = 1024; rc = lmLogInit(dummy_log); if (rc) { diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 
98f9a432c336..52e6b58c5dbd 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -325,13 +325,13 @@ static int chkSuper(struct super_block *sb) if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) != cpu_to_le32(JFS_BAD_SAIT)) { expected_AIM_bytesize = 2 * PSIZE; - AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize; + AIM_bytesize = lengthPXD(&j_sb->s_aim2) * bsize; expected_AIT_bytesize = 4 * PSIZE; - AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize; - AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize; - AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize; + AIT_bytesize = lengthPXD(&j_sb->s_ait2) * bsize; + AIM_byte_addr = addressPXD(&j_sb->s_aim2) * bsize; + AIT_byte_addr = addressPXD(&j_sb->s_ait2) * bsize; byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr; - fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize; + fsckwsp_addr = addressPXD(&j_sb->s_fsckpxd) * bsize; byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr; if ((AIM_bytesize != expected_AIM_bytesize) || (AIT_bytesize != expected_AIT_bytesize) || diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index be17e3c43582..7840a03e5bcb 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -272,14 +272,15 @@ int txInit(void) if (TxBlock == NULL) return -ENOMEM; - for (k = 1; k < nTxBlock - 1; k++) { - TxBlock[k].next = k + 1; + for (k = 0; k < nTxBlock; k++) { init_waitqueue_head(&TxBlock[k].gcwait); init_waitqueue_head(&TxBlock[k].waitor); } + + for (k = 1; k < nTxBlock - 1; k++) { + TxBlock[k].next = k + 1; + } TxBlock[k].next = 0; - init_waitqueue_head(&TxBlock[k].gcwait); - init_waitqueue_head(&TxBlock[k].waitor); TxAnchor.freetid = 1; init_waitqueue_head(&TxAnchor.freewait); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! net=%x\n", diff --git a/fs/mount.h b/fs/mount.h index 79c85639a7ba..f13a28752d0b 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -58,7 +58,10 @@ struct mount { #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ - struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ + struct mount * __aligned(1) *mnt_pprev_for_sb; + /* except that LSB of pprev is stolen */ +#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */ const char *mnt_devname; /* Name of device e.g. 
/dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ @@ -148,6 +151,11 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern seqlock_t mount_lock; +DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock), + write_sequnlock(&mount_lock)) +DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock), + read_sequnlock_excl(&mount_lock)) + struct proc_mounts { struct mnt_namespace *ns; struct path root; @@ -224,4 +232,33 @@ static inline void mnt_notify_add(struct mount *m) } #endif +static inline struct mount *topmost_overmount(struct mount *m) +{ + while (m->overmount) + m = m->overmount; + return m; +} + +static inline bool __test_write_hold(struct mount * __aligned(1) *val) +{ + return (unsigned long)val & WRITE_HOLD; +} + +static inline bool test_write_hold(const struct mount *m) +{ + return __test_write_hold(m->mnt_pprev_for_sb); +} + +static inline void set_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + | WRITE_HOLD); +} + +static inline void clear_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + & ~WRITE_HOLD); +} + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namei.c b/fs/namei.c index 507ca0d7878d..7377020a2cba 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2695,7 +2695,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path } int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root) + struct path *path, const struct path *root) { int retval; struct nameidata nd; @@ -3651,8 +3651,8 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; - file->f_path.dentry = DENTRY_NOT_SET; - file->f_path.mnt = nd->path.mnt; + file->__f_path.dentry = DENTRY_NOT_SET; + file->__f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); @@ -4020,8 +4020,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap, child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; - file->f_path.mnt = parentpath->mnt; - file->f_path.dentry = child; + file->__f_path.mnt = parentpath->mnt; + file->__f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); @@ -4256,7 +4256,7 @@ struct dentry *start_creating_path(int dfd, const char *pathname, } EXPORT_SYMBOL(start_creating_path); -void end_creating_path(struct path *path, struct dentry *dentry) +void end_creating_path(const struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); diff --git a/fs/namespace.c b/fs/namespace.c index dc01b14c58cd..d82910f33dc4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -91,6 +91,14 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ +static inline void namespace_lock(void); +static void namespace_unlock(void); +DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock()) +DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem), + up_read(&namespace_sem)) + +DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T)) + #ifdef CONFIG_FSNOTIFY 
LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif @@ -363,7 +371,7 @@ out_free_cache: * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ -bool __mnt_is_readonly(struct vfsmount *mnt) +bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } @@ -403,7 +411,7 @@ static unsigned int mnt_get_writers(struct mount *mnt) #endif } -static int mnt_is_readonly(struct vfsmount *mnt) +static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; @@ -444,31 +452,31 @@ int mnt_get_write_access(struct vfsmount *m) mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass - * MNT_WRITE_HOLD loop below, so that the slowpath can see our - * incremented count after it has set MNT_WRITE_HOLD. + * WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion if the task - * setting MNT_WRITE_HOLD got preempted on a remote + * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents livelock if the task setting - * MNT_WRITE_HOLD has a lower priority and is bound to + * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); - lock_mount_hash(); - unlock_mount_hash(); + read_seqlock_excl(&mount_lock); + read_sequnlock_excl(&mount_lock); preempt_disable(); } } /* * The barrier pairs with the barrier sb_start_ro_state_change() making - * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. @@ -606,16 +614,16 @@ EXPORT_SYMBOL(mnt_drop_write_file); * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * - * Context: This function expects lock_mount_hash() to be held serializing - * setting MNT_WRITE_HOLD. + * Context: This function expects to be in mount_locked_reader scope serializing + * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + set_write_hold(mnt); /* - * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); @@ -631,9 +639,9 @@ static inline int mnt_hold_writers(struct mount *mnt) * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. - * MNT_WRITE_HOLD protects against this scenario, because + * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on - * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here.
*/ if (mnt_get_writers(mnt) > 0) @@ -649,19 +657,42 @@ static inline int mnt_hold_writers(struct mount *mnt) * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * - * This function can only be called after a successful call to - * mnt_hold_writers(). + * This function can only be called after a call to mnt_hold_writers(). * - * Context: This function expects lock_mount_hash() to be held. + * Context: This function expects to be in the same mount_locked_reader scope + * as the matching mnt_hold_writers(). */ static inline void mnt_unhold_writers(struct mount *mnt) { + if (!test_write_hold(mnt)) + return; /* - * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + clear_write_hold(mnt); +} + +static inline void mnt_del_instance(struct mount *m) +{ + struct mount **p = m->mnt_pprev_for_sb; + struct mount *next = m->mnt_next_for_sb; + + if (next) + next->mnt_pprev_for_sb = p; + *p = next; +} + +static inline void mnt_add_instance(struct mount *m, struct super_block *s) +{ + struct mount *first = s->s_mounts; + + if (first) + first->mnt_pprev_for_sb = &m->mnt_next_for_sb; + m->mnt_next_for_sb = first; + m->mnt_pprev_for_sb = &s->s_mounts; + s->s_mounts = m; } static int mnt_make_readonly(struct mount *mnt) @@ -677,17 +708,17 @@ static int mnt_make_readonly(struct mount *mnt) int sb_prepare_remount_readonly(struct super_block *sb) { - struct mount *mnt; int err = 0; - /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + /* Racy optimization. Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; - lock_mount_hash(); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - err = mnt_hold_writers(mnt); + guard(mount_locked_reader)(); + + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (!(m->mnt.mnt_flags & MNT_READONLY)) { + err = mnt_hold_writers(m); if (err) break; } @@ -697,11 +728,10 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (test_write_hold(m)) + clear_write_hold(m); } - unlock_mount_hash(); return err; } @@ -760,24 +790,16 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) } /** - * __lookup_mnt - find first child mount + * __lookup_mnt - mount hash lookup * @mnt: parent mount - * @dentry: mountpoint - * - * If @mnt has a child mount @c mounted @dentry find and return it. + * @dentry: dentry of mountpoint * - * Note that the child mount @c need not be unique. There are cases - * where shadow mounts are created. For example, during mount - * propagation when a source mount @mnt whose root got overmounted by a - * mount @o after path lookup but before @namespace_sem could be - * acquired gets copied and propagated. So @mnt gets copied including - * @o. When @mnt is propagated to a destination mount @d that already - * has another mount @n mounted at the same mountpoint then the source - * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on - * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt - * on @dentry. 
+ * If @mnt has a child mount @c mounted on @dentry find and return it. + * Caller must either hold the spinlock component of @mount_lock or + * hold rcu_read_lock(), sample the seqcount component before the call + * and recheck it afterwards. * - * Return: The first child of @mnt mounted @dentry or NULL. + * Return: The child of @mnt mounted on @dentry or %NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { @@ -790,21 +812,12 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) return NULL; } -/* - * lookup_mnt - Return the first child mount mounted at path - * - * "First" means first mounted chronologically. If you create the - * following mounts: - * - * mount /dev/sda1 /mnt - * mount /dev/sda2 /mnt - * mount /dev/sda3 /mnt - * - * Then lookup_mnt() on the base /mnt dentry in the root mount will - * return successively the root dentry and vfsmount of /dev/sda1, then - * /dev/sda2, then /dev/sda3, then NULL. +/** + * lookup_mnt - Return the child mount mounted at given location + * @path: location in the namespace * - * lookup_mnt takes a reference to the found vfsmount. + * Acquires and returns a new reference to mount at given location + * or %NULL if nothing is mounted there. */ struct vfsmount *lookup_mnt(const struct path *path) { @@ -841,22 +854,20 @@ bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; - bool is_covered = false; - down_read(&namespace_sem); - rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { - is_covered = (mnt->mnt_mountpoint == dentry); - if (is_covered) - break; - } - up_read(&namespace_sem); + guard(namespace_shared)(); - return is_covered; + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) + if (mnt->mnt_mountpoint == dentry) + return true; + + return false; } struct pinned_mountpoint { struct hlist_node node; struct mountpoint *mp; + struct mount *parent; }; static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) @@ -947,7 +958,7 @@ static void unpin_mountpoint(struct pinned_mountpoint *m) } } -static inline int check_mnt(struct mount *mnt) +static inline int check_mnt(const struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } @@ -1149,6 +1160,20 @@ static void commit_tree(struct mount *mnt) touch_mnt_namespace(n); } +static void setup_mnt(struct mount *m, struct dentry *root) +{ + struct super_block *s = root->d_sb; + + atomic_inc(&s->s_active); + m->mnt.mnt_sb = s; + m->mnt.mnt_root = dget(root); + m->mnt_mountpoint = m->mnt.mnt_root; + m->mnt_parent = m; + + guard(mount_locked_reader)(); + mnt_add_instance(m, s); +} + /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached @@ -1172,15 +1197,8 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - atomic_inc(&fc->root->d_sb->s_active); - mnt->mnt.mnt_sb = fc->root->d_sb; - mnt->mnt.mnt_root = dget(fc->root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; + setup_mnt(mnt, fc->root); - lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); - unlock_mount_hash(); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); @@ -1221,8 +1239,7 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type, return ERR_CAST(fc); if (name) - ret = vfs_parse_fs_string(fc, "source", - name, strlen(name)); + ret = 
vfs_parse_fs_string(fc, "source", name); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) @@ -1238,7 +1255,6 @@ EXPORT_SYMBOL_GPL(vfs_kern_mount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { - struct super_block *sb = old->mnt.mnt_sb; struct mount *mnt; int err; @@ -1263,16 +1279,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (mnt->mnt_group_id) set_mnt_shared(mnt); - atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); - mnt->mnt.mnt_sb = sb; - mnt->mnt.mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; - lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &sb->s_mounts); - unlock_mount_hash(); + setup_mnt(mnt, root); if (flag & CL_PRIVATE) // we are done with it return mnt; @@ -1378,7 +1387,7 @@ static void mntput_no_expire(struct mount *mnt) mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); - list_del(&mnt->mnt_instance); + mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); @@ -1719,8 +1728,6 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } -DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock()) - enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, @@ -1785,6 +1792,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); + bulk_make_private(&tmp_list); + while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; @@ -1809,7 +1818,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) umount_mnt(p); } } - change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); @@ -1969,10 +1977,11 @@ void __detach_mounts(struct dentry *dentry) struct pinned_mountpoint mp = {}; struct mount *mnt; - namespace_lock(); - lock_mount_hash(); + guard(namespace_excl)(); + guard(mount_writer)(); + if (!lookup_mountpoint(dentry, &mp)) - goto out_unlock; + return; event++; while (mp.node.next) { @@ -1984,9 +1993,6 @@ void __detach_mounts(struct dentry *dentry) else umount_tree(mnt, UMOUNT_CONNECTED); } unpin_mountpoint(&mp); -out_unlock: - unlock_mount_hash(); - namespace_unlock(); } /* @@ -2025,7 +2031,7 @@ static int can_umount(const struct path *path, int flags) } // caller is responsible for flags being sane -int path_umount(struct path *path, int flags) +int path_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; @@ -2238,7 +2244,7 @@ static inline bool extend_array(struct path **res, struct path **to_free, return p; } -struct path *collect_paths(const struct path *path, +const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); @@ -2246,7 +2252,7 @@ struct path *collect_paths(const struct path *path, struct path *res = prealloc, *to_free = NULL; unsigned n = 0; - guard(rwsem_read)(&namespace_sem); + guard(namespace_shared)(); if (!check_mnt(root)) return ERR_PTR(-EINVAL); @@ -2272,9 +2278,9 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, const struct path *prealloc) { - for (struct path *p = paths; p->mnt; p++) + for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); 
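The namespace.c conversions in this series replace explicit lock/unlock pairs and mntput()-on-error paths with the scope-based cleanup helpers from <linux/cleanup.h>. A minimal sketch of the idiom, relying on the guard classes and the DEFINE_FREE(mntput, ...) declared earlier in this patch; the example() function itself is hypothetical, not part of the patch:

	/* hypothetical helper illustrating the cleanup.h idiom used above */
	static int example(struct fs_context *fc)
	{
		/* mntput() runs automatically when mnt leaves scope;
		 * the DEFINE_FREE above skips ERR_PTR values */
		struct vfsmount *mnt __free(mntput) = fc_mount(fc);

		if (IS_ERR(mnt))
			return PTR_ERR(mnt);

		scoped_guard(namespace_excl) {
			/* namespace_lock() held; released at the closing
			 * brace and on any return inside the block */
		}

		guard(mount_locked_reader)();	/* held until function exit */

		retain_and_null_ptr(mnt);	/* ownership handed off: suppress mntput() */
		return 0;
	}

Every early return emits the paired unlock and the final mntput() automatically, which is what lets the rewritten functions drop their goto out/error label chains; retain_and_null_ptr() transfers ownership on the success path, as in do_new_mount_fc() and finish_automount() below.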
@@ -2301,7 +2307,7 @@ void dissolve_on_fput(struct vfsmount *mnt) return; } - scoped_guard(namespace_lock, &namespace_sem) { + scoped_guard(namespace_excl) { if (!anon_ns_root(m)) return; @@ -2312,6 +2318,7 @@ void dissolve_on_fput(struct vfsmount *mnt) } } +/* locks: namespace_shared && pinned(mnt) || mount_locked_reader */ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2328,12 +2335,8 @@ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) bool has_locked_children(struct mount *mnt, struct dentry *dentry) { - bool res; - - read_seqlock_excl(&mount_lock); - res = __has_locked_children(mnt, dentry); - read_sequnlock_excl(&mount_lock); - return res; + guard(mount_locked_reader)(); + return __has_locked_children(mnt, dentry); } /* @@ -2341,21 +2344,15 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. + * + * locks: mount_locked_reader || namespace_shared && pinned(subtree) */ static bool check_for_nsfs_mounts(struct mount *subtree) { - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) + for (struct mount *p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; + return false; + return true; } /** @@ -2375,7 +2372,7 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - guard(rwsem_read)(&namespace_sem); + guard(namespace_shared)(); if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); @@ -2496,8 +2493,7 @@ enum mnt_tree_flags_t { /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached - * @dest_mnt: mount that @source_mnt will be mounted on - * @dest_mp: the mountpoint @source_mnt will be mounted at + * @dest: the context for mounting at the place where the tree should go * * NOTE: the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2560,10 +2556,11 @@ enum mnt_tree_flags_t { * Otherwise a negative error code is returned.
*/ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *dest_mnt, - struct mountpoint *dest_mp) + const struct pinned_mountpoint *dest) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mount *dest_mnt = dest->parent; + struct mountpoint *dest_mp = dest->mp; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct pinned_mountpoint root = {}; @@ -2643,10 +2640,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, child->mnt_mountpoint); commit_tree(child); if (q) { + struct mount *r = topmost_overmount(child); struct mountpoint *mp = root.mp; - struct mount *r = child; - while (unlikely(r->overmount)) - r = r->overmount; + if (unlikely(shorter) && child != source_mnt) mp = shorter; mnt_change_mountpoint(r, mp, q); @@ -2675,110 +2671,120 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } +static inline struct mount *where_to_mount(const struct path *path, + struct dentry **dentry, + bool beneath) +{ + struct mount *m; + + if (unlikely(beneath)) { + m = topmost_overmount(real_mount(path->mnt)); + *dentry = m->mnt_mountpoint; + return m->mnt_parent; + } + m = __lookup_mnt(path->mnt, path->dentry); + if (unlikely(m)) { + m = topmost_overmount(m); + *dentry = m->mnt.mnt_root; + return m; + } + *dentry = path->dentry; + return real_mount(path->mnt); +} + /** - * do_lock_mount - lock mount and mountpoint - * @path: target path - * @beneath: whether the intention is to mount beneath @path - * - * Follow the mount stack on @path until the top mount @mnt is found. If - * the initial @path->{mnt,dentry} is a mountpoint lookup the first - * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} - * until nothing is stacked on top of it anymore. + * do_lock_mount - acquire environment for mounting + * @path: target path + * @res: context to set up + * @beneath: whether the intention is to mount beneath @path * - * Acquire the inode_lock() on the top mount's ->mnt_root to protect - * against concurrent removal of the new mountpoint from another mount - * namespace. + * To mount something at given location, we need + * namespace_sem locked exclusive + * inode of dentry we are mounting on locked exclusive + * struct mountpoint for that dentry + * struct mount we are mounting on * - * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint - * @mp on @mnt->mnt_parent must be acquired. This protects against a - * concurrent unlink of @mp->mnt_dentry from another mount namespace - * where @mnt doesn't have a child mount mounted @mp. A concurrent - * removal of @mnt->mnt_root doesn't matter as nothing will be mounted - * on top of it for @beneath. + * Results are stored in caller-supplied context (pinned_mountpoint); + * on success we have res->parent and res->mp pointing to parent and + * mountpoint respectively and res->node inserted into the ->m_list + * of the mountpoint, making sure the mountpoint won't disappear. + * On failure we have res->parent set to ERR_PTR(-E...), res->mp + * left NULL, res->node - empty. + * In case of success do_lock_mount returns with locks acquired (in + * proper order - inode lock nests outside of namespace_sem). * - * In addition, @beneath needs to make sure that @mnt hasn't been - * unmounted or moved from its current mountpoint in between dropping - * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt - * being unmounted would be detected later by e.g., calling - * check_mnt(mnt) in the function it's called from. 
For the @beneath - * case however, it's useful to detect it directly in do_lock_mount(). - * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points - * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will - * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. + * Request to mount on overmounted location is treated as "mount on + * top of whatever's overmounting it"; request to mount beneath + * a location - "mount immediately beneath the topmost mount at that + * place". * - * Return: Either the target mountpoint on the top mount or the top - * mount's mountpoint. + * In all cases the location must not have been unmounted and the + * chosen mountpoint must be allowed to be mounted on. For "beneath" + * case we also require the location to be at the root of a mount + * that has a parent (i.e. is not a root of some namespace). */ -static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath) +static void do_lock_mount(const struct path *path, + struct pinned_mountpoint *res, + bool beneath) { - struct vfsmount *mnt = path->mnt; - struct dentry *dentry; - struct path under = {}; - int err = -ENOENT; + int err; - for (;;) { - struct mount *m = real_mount(mnt); + if (unlikely(beneath) && !path_mounted(path)) { + res->parent = ERR_PTR(-EINVAL); + return; + } - if (beneath) { - path_put(&under); - read_seqlock_excl(&mount_lock); - under.mnt = mntget(&m->mnt_parent->mnt); - under.dentry = dget(m->mnt_mountpoint); - read_sequnlock_excl(&mount_lock); - dentry = under.dentry; - } else { - dentry = path->dentry; + do { + struct dentry *dentry, *d; + struct mount *m, *n; + + scoped_guard(mount_locked_reader) { + m = where_to_mount(path, &dentry, beneath); + if (&m->mnt != path->mnt) { + mntget(&m->mnt); + dget(dentry); + } } inode_lock(dentry->d_inode); namespace_lock(); - if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) - break; // not to be mounted on + // check if the chain of mounts (if any) has changed. + scoped_guard(mount_locked_reader) + n = where_to_mount(path, &d, beneath); - if (beneath && unlikely(m->mnt_mountpoint != dentry || - &m->mnt_parent->mnt != under.mnt)) { - namespace_unlock(); - inode_unlock(dentry->d_inode); - continue; // got moved - } + if (unlikely(n != m || dentry != d)) + err = -EAGAIN; // something moved, retry + else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt))) + err = -ENOENT; // not to be mounted on + else if (beneath && &m->mnt == path->mnt && !m->overmount) + err = -EINVAL; + else + err = get_mountpoint(dentry, res); - mnt = lookup_mnt(path); - if (unlikely(mnt)) { + if (unlikely(err)) { + res->parent = ERR_PTR(err); namespace_unlock(); inode_unlock(dentry->d_inode); - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - continue; // got overmounted + } else { + res->parent = m; } - err = get_mountpoint(dentry, pinned); - if (err) - break; - if (beneath) { - /* - * @under duplicates the references that will stay - * at least until namespace_unlock(), so the path_put() - * below is safe (and OK to do under namespace_lock - - * we are not dropping the final references here). - */ - path_put(&under); + /* + * Drop the temporary references. This is subtle - on success + * we are doing that under namespace_sem, which would normally + * be forbidden. However, in that case we are guaranteed that + * refcounts won't reach zero, since we know that path->mnt + * is mounted and thus all mounts reachable from it are pinned + * and stable, along with their mountpoints and roots. 
+ */ + if (&m->mnt != path->mnt) { + dput(dentry); + mntput(&m->mnt); } - return 0; - } - namespace_unlock(); - inode_unlock(dentry->d_inode); - if (beneath) - path_put(&under); - return err; -} - -static inline int lock_mount(struct path *path, struct pinned_mountpoint *m) -{ - return do_lock_mount(path, m, false); + } while (err == -EAGAIN); } -static void unlock_mount(struct pinned_mountpoint *m) +static void __unlock_mount(struct pinned_mountpoint *m) { inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); @@ -2787,16 +2793,30 @@ static void unlock_mount(struct pinned_mountpoint *m) namespace_unlock(); } -static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +static inline void unlock_mount(struct pinned_mountpoint *m) +{ + if (!IS_ERR(m->parent)) + __unlock_mount(m); +} + +#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + do_lock_mount((path), &mp, (beneath)) +#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) +#define LOCK_MOUNT_EXACT(mp, path) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + lock_mount_exact((path), &mp) + +static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; - if (d_is_dir(mp->m_dentry) != + if (d_is_dir(mp->mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp); + return attach_recursive_mnt(mnt, mp); } static int may_change_propagation(const struct mount *m) @@ -2832,13 +2852,13 @@ static int flags_to_propagation_type(int ms_flags) /* * recursively change the type of the mountpoint. */ -static int do_change_type(struct path *path, int ms_flags) +static int do_change_type(const struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; - int err = 0; + int err; if (!path_mounted(path)) return -EINVAL; @@ -2847,23 +2867,22 @@ static int do_change_type(struct path *path, int ms_flags) if (!type) return -EINVAL; - namespace_lock(); + guard(namespace_excl)(); + err = may_change_propagation(mnt); if (err) - goto out_unlock; + return err; if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) - goto out_unlock; + return err; } for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - out_unlock: - namespace_unlock(); - return err; + return 0; } /* may_copy_tree() - check if a mount tree can be copied @@ -2909,7 +2928,7 @@ static int do_change_type(struct path *path, int ms_flags) * * Returns true if the mount tree can be copied, false otherwise. */ -static inline bool may_copy_tree(struct path *path) +static inline bool may_copy_tree(const struct path *path) { struct mount *mnt = real_mount(path->mnt); const struct dentry_operations *d_op; @@ -2931,7 +2950,7 @@ static inline bool may_copy_tree(struct path *path) } -static struct mount *__do_loopback(struct path *old_path, int recurse) +static struct mount *__do_loopback(const struct path *old_path, int recurse) { struct mount *old = real_mount(old_path->mnt); @@ -2953,12 +2972,11 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) /* * do loopback mount. 
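 * (i.e. a bind mount: clone the subtree found at old_name and graft the clone onto path; with recurse set, the clone includes all submounts)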
*/ -static int do_loopback(struct path *path, const char *old_name, - int recurse) +static int do_loopback(const struct path *path, const char *old_name, + int recurse) { - struct path old_path; - struct mount *mnt = NULL, *parent; - struct pinned_mountpoint mp = {}; + struct path old_path __free(path_put) = {}; + struct mount *mnt = NULL; int err; if (!old_name || !*old_name) return -EINVAL; @@ -2966,49 +2984,40 @@ static int do_loopback(struct path *path, const char *old_name, if (err) return err; - err = -EINVAL; if (mnt_ns_loop(old_path.dentry)) - goto out; + return -EINVAL; - err = lock_mount(path, &mp); - if (err) - goto out; + LOCK_MOUNT(mp, path); + if (IS_ERR(mp.parent)) + return PTR_ERR(mp.parent); - parent = real_mount(path->mnt); - if (!check_mnt(parent)) - goto out2; + if (!check_mnt(mp.parent)) + return -EINVAL; mnt = __do_loopback(&old_path, recurse); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - goto out2; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); - err = graft_tree(mnt, parent, mp.mp); + err = graft_tree(mnt, &mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } -out2: - unlock_mount(&mp); -out: - path_put(&old_path); return err; } -static struct file *open_detached_copy(struct path *path, bool recursive) +static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; - struct file *file; ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) - return ERR_CAST(ns); + return ns; - namespace_lock(); + guard(namespace_excl)(); /* * Record the sequence number of the source mount namespace. @@ -3025,23 +3034,28 @@ static struct file *open_detached_copy(struct path *path, bool recursive) mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { - namespace_unlock(); - free_mnt_ns(ns); + emptied_ns = ns; return ERR_CAST(mnt); } - lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { mnt_add_to_ns(ns, p); ns->nr_mounts++; } ns->root = mnt; - mntget(&mnt->mnt); - unlock_mount_hash(); - namespace_unlock(); + return ns; +} + +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct mnt_namespace *ns = get_detached_copy(path, recursive); + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); mntput(path->mnt); - path->mnt = &mnt->mnt; + path->mnt = mntget(&ns->root->mnt); file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); @@ -3158,7 +3172,8 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) touch_mnt_namespace(mnt->mnt_ns); } -static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) +static void mnt_warn_timestamp_expiry(const struct path *mountpoint, + struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; @@ -3192,7 +3207,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ -static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) +static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); @@ -3229,7 +3244,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. 
*/ -static int do_remount(struct path *path, int sb_flags, +static int do_remount(const struct path *path, int sb_flags, int mnt_flags, void *data) { int err; @@ -3287,49 +3302,46 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -static int do_set_group(struct path *from_path, struct path *to_path) +static int do_set_group(const struct path *from_path, const struct path *to_path) { - struct mount *from, *to; + struct mount *from = real_mount(from_path->mnt); + struct mount *to = real_mount(to_path->mnt); int err; - from = real_mount(from_path->mnt); - to = real_mount(to_path->mnt); - - namespace_lock(); + guard(namespace_excl)(); err = may_change_propagation(from); if (err) - goto out; + return err; err = may_change_propagation(to); if (err) - goto out; + return err; - err = -EINVAL; /* To and From paths should be mount roots */ if (!path_mounted(from_path)) - goto out; + return -EINVAL; if (!path_mounted(to_path)) - goto out; + return -EINVAL; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) - goto out; + return -EINVAL; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) - goto out; + return -EINVAL; /* From mount should not have locked children in place of To's root */ if (__has_locked_children(from, to->mnt.mnt_root)) - goto out; + return -EINVAL; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) - goto out; + return -EINVAL; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) - goto out; + return -EINVAL; if (IS_MNT_SLAVE(from)) { hlist_add_behind(&to->mnt_slave, &from->mnt_slave); @@ -3341,11 +3353,7 @@ static int do_set_group(struct path *from_path, struct path *to_path) list_add(&to->mnt_share, &from->mnt_share); set_mnt_shared(to); } - - err = 0; -out: - namespace_unlock(); - return err; + return 0; } /** @@ -3389,17 +3397,15 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) /** * can_move_mount_beneath - check that we can mount beneath the top mount - * @from: mount to mount beneath - * @to: mount under which to mount - * @mp: mountpoint of @to + * @mnt_from: mount we are trying to move + * @mnt_to: mount under which to mount + * @mp: mountpoint of @mnt_to * - * - Make sure that @to->dentry is actually the root of a mount under - * which we can mount another mount. * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. - * - Ensure that nothing has been mounted on top of @from before we + * - Ensure that nothing has been mounted on top of @mnt_from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to @@ -3408,25 +3414,17 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. 
*/ -static int can_move_mount_beneath(const struct path *from, - const struct path *to, +static int can_move_mount_beneath(const struct mount *mnt_from, + const struct mount *mnt_to, const struct mountpoint *mp) { - struct mount *mnt_from = real_mount(from->mnt), - *mnt_to = real_mount(to->mnt), - *parent_mnt_to = mnt_to->mnt_parent; - - if (!mnt_has_parent(mnt_to)) - return -EINVAL; - - if (!path_mounted(to)) - return -EINVAL; + struct mount *parent_mnt_to = mnt_to->mnt_parent; if (IS_MNT_LOCKED(mnt_to)) return -EINVAL; /* Avoid creating shadow mounts during mount propagation. */ - if (path_overmounted(from)) + if (mnt_from->overmount) return -EINVAL; /* @@ -3517,97 +3515,83 @@ static inline bool may_use_mount(struct mount *mnt) return check_anonymous_mnt(mnt); } -static int do_move_mount(struct path *old_path, - struct path *new_path, enum mnt_tree_flags_t flags) +static int do_move_mount(const struct path *old_path, + const struct path *new_path, + enum mnt_tree_flags_t flags) { - struct mnt_namespace *ns; - struct mount *p; - struct mount *old; - struct mount *parent; - struct pinned_mountpoint mp; + struct mount *old = real_mount(old_path->mnt); int err; bool beneath = flags & MNT_TREE_BENEATH; - err = do_lock_mount(new_path, &mp, beneath); - if (err) - return err; + if (!path_mounted(old_path)) + return -EINVAL; - old = real_mount(old_path->mnt); - p = real_mount(new_path->mnt); - parent = old->mnt_parent; - ns = old->mnt_ns; + if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) + return -EINVAL; - err = -EINVAL; + LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath); + if (IS_ERR(mp.parent)) + return PTR_ERR(mp.parent); if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) - goto out; + return -EINVAL; + /* ... which should not be shared */ + if (IS_MNT_SHARED(old->mnt_parent)) + return -EINVAL; /* ... and the target should be in our namespace */ - if (!check_mnt(p)) - goto out; - /* parent of the source should not be shared */ - if (IS_MNT_SHARED(parent)) - goto out; + if (!check_mnt(mp.parent)) + return -EINVAL; } else { /* * otherwise the source must be the root of some anon namespace. */ if (!anon_ns_root(old)) - goto out; + return -EINVAL; /* * Bail out early if the target is within the same namespace - * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ - if (ns == p->mnt_ns) - goto out; + if (old->mnt_ns == mp.parent->mnt_ns) + return -EINVAL; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ - if (!may_use_mount(p)) - goto out; + if (!may_use_mount(mp.parent)) + return -EINVAL; } - if (!path_mounted(old_path)) - goto out; - - if (d_is_dir(new_path->dentry) != - d_is_dir(old_path->dentry)) - goto out; - if (beneath) { - err = can_move_mount_beneath(old_path, new_path, mp.mp); - if (err) - goto out; + struct mount *over = real_mount(new_path->mnt); - err = -EINVAL; - p = p->mnt_parent; + if (mp.parent != over->mnt_parent) + over = mp.parent->overmount; + err = can_move_mount_beneath(old, over, mp.mp); + if (err) + return err; } /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. 
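 * (the move would have to propagate a copy of the tree to every peer of the destination, and unbindable mounts must never be replicated by propagation)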
*/ - if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) - goto out; - err = -ELOOP; + if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old)) + return -EINVAL; if (!check_for_nsfs_mounts(old)) - goto out; - if (mount_is_ancestor(old, p)) - goto out; + return -ELOOP; + if (mount_is_ancestor(old, mp.parent)) + return -ELOOP; - err = attach_recursive_mnt(old, p, mp.mp); -out: - unlock_mount(&mp); - return err; + return attach_recursive_mnt(old, &mp); } -static int do_move_mount_old(struct path *path, const char *old_name) +static int do_move_mount_old(const struct path *path, const char *old_name) { - struct path old_path; + struct path old_path __free(path_put) = {}; int err; if (!old_name || !*old_name) @@ -3617,18 +3601,19 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, 0); - path_put(&old_path); - return err; + return do_move_mount(&old_path, path, 0); } /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, - const struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp, + int mnt_flags) { - struct mount *parent = real_mount(path->mnt); + struct mount *parent = mp->parent; + + if (IS_ERR(parent)) + return PTR_ERR(parent); mnt_flags &= ~MNT_INTERNAL_FLAGS; @@ -3642,14 +3627,15 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, } /* Refuse the same filesystem on the same mount point */ - if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path)) + if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb && + parent->mnt.mnt_root == mp->mp->m_dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - return graft_tree(newmnt, parent, mp); + return graft_tree(newmnt, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); @@ -3658,41 +3644,32 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags * Create a new mount using a superblock configuration and request it * be added to the namespace tree. 
*/ -static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, +static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint, unsigned int mnt_flags) { - struct vfsmount *mnt; - struct pinned_mountpoint mp = {}; - struct super_block *sb = fc->root->d_sb; + struct super_block *sb; + struct vfsmount *mnt __free(mntput) = fc_mount(fc); int error; - error = security_sb_kern_mount(sb); - if (!error && mount_too_revealing(sb, &mnt_flags)) { - errorfcp(fc, "VFS", "Mount too revealing"); - error = -EPERM; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); - if (unlikely(error)) { - fc_drop_locked(fc); + sb = fc->root->d_sb; + error = security_sb_kern_mount(sb); + if (unlikely(error)) return error; - } - up_write(&sb->s_umount); - - mnt = vfs_create_mount(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); + if (unlikely(mount_too_revealing(sb, &mnt_flags))) { + errorfcp(fc, "VFS", "Mount too revealing"); + return -EPERM; + } mnt_warn_timestamp_expiry(mountpoint, mnt); - error = lock_mount(mountpoint, &mp); - if (!error) { - error = do_add_mount(real_mount(mnt), mp.mp, - mountpoint, mnt_flags); - unlock_mount(&mp); - } - if (error < 0) - mntput(mnt); + LOCK_MOUNT(mp, mountpoint); + error = do_add_mount(real_mount(mnt), &mp, mnt_flags); + if (!error) + retain_and_null_ptr(mnt); // consumed on success return error; } @@ -3700,8 +3677,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *fstype, int sb_flags, - int mnt_flags, const char *name, void *data) +static int do_new_mount(const struct path *path, const char *fstype, + int sb_flags, int mnt_flags, + const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; @@ -3738,27 +3716,46 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, fc->oldapi = true; if (subtype) - err = vfs_parse_fs_string(fc, "subtype", - subtype, strlen(subtype)); + err = vfs_parse_fs_string(fc, "subtype", subtype); if (!err && name) - err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + err = vfs_parse_fs_string(fc, "source", name); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; if (!err) - err = vfs_get_tree(fc); - if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } -int finish_automount(struct vfsmount *m, const struct path *path) +static void lock_mount_exact(const struct path *path, + struct pinned_mountpoint *mp) { struct dentry *dentry = path->dentry; - struct pinned_mountpoint mp = {}; + int err; + + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) + err = -ENOENT; + else if (path_overmounted(path)) + err = -EBUSY; + else + err = get_mountpoint(dentry, mp); + if (unlikely(err)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + mp->parent = ERR_PTR(err); + } else { + mp->parent = real_mount(path->mnt); + } +} + +int finish_automount(struct vfsmount *__m, const struct path *path) +{ + struct vfsmount *m __free(mntput) = __m; struct mount *mnt; int err; @@ -3769,43 +3766,21 @@ int finish_automount(struct vfsmount *m, const struct path *path) mnt = real_mount(m); - if (m->mnt_sb == path->mnt->mnt_sb && - m->mnt_root == dentry) { - err = -ELOOP; - goto discard; - } + if (m->mnt_root == path->dentry) + return -ELOOP; /* - * we don't want to use lock_mount() - in this case finding 
something + * we don't want to use LOCK_MOUNT() - in this case finding something * that overmounts our mountpoint means "quietly drop what we've * got", not "try to mount it on top". */ - inode_lock(dentry->d_inode); - namespace_lock(); - if (unlikely(cant_mount(dentry))) { - err = -ENOENT; - goto discard_locked; - } - if (path_overmounted(path)) { - err = 0; - goto discard_locked; - } - err = get_mountpoint(dentry, &mp); - if (err) - goto discard_locked; - - err = do_add_mount(mnt, mp.mp, path, - path->mnt->mnt_flags | MNT_SHRINKABLE); - unlock_mount(&mp); - if (unlikely(err)) - goto discard; - return 0; + LOCK_MOUNT_EXACT(mp, path); + if (mp.parent == ERR_PTR(-EBUSY)) + return 0; -discard_locked: - namespace_unlock(); - inode_unlock(dentry->d_inode); -discard: - mntput(m); + err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (likely(!err)) + retain_and_null_ptr(m); return err; } @@ -3816,9 +3791,8 @@ discard: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - read_sequnlock_excl(&mount_lock); } EXPORT_SYMBOL(mnt_set_expiry); @@ -3835,8 +3809,8 @@ void mark_mounts_for_expiry(struct list_head *mounts) if (list_empty(mounts)) return; - namespace_lock(); - lock_mount_hash(); + guard(namespace_excl)(); + guard(mount_writer)(); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -3858,8 +3832,6 @@ void mark_mounts_for_expiry(struct list_head *mounts) touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } - unlock_mount_hash(); - namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); @@ -3987,7 +3959,7 @@ static char *copy_mount_string(const void __user *data) * Therefore, if this magic number is present, it carries no information * and must be discarded.
*/ -int path_mount(const char *dev_name, struct path *path, +int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; @@ -4069,15 +4041,13 @@ int path_mount(const char *dev_name, struct path *path, int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { - struct path path; + struct path path __free(path_put) = {}; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; - ret = path_mount(dev_name, &path, type_page, flags, data_page); - path_put(&path); - return ret; + return path_mount(dev_name, &path, type_page, flags, data_page); } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) @@ -4138,7 +4108,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct vfsmount *rootmnt __free(mntput) = NULL; + struct vfsmount *pwdmnt __free(mntput) = NULL; struct mount *p, *q; struct mount *old; struct mount *new; @@ -4157,23 +4128,19 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, if (IS_ERR(new_ns)) return new_ns; - namespace_lock(); + guard(namespace_excl)(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { - namespace_unlock(); - ns_common_free(ns); - dec_mnt_namespaces(new_ns->ucounts); - mnt_ns_release(new_ns); + emptied_ns = new_ns; return ERR_CAST(new); } if (user_ns != ns->user_ns) { - lock_mount_hash(); + guard(mount_writer)(); lock_mnt_tree(new); - unlock_mount_hash(); } new_ns->root = new; @@ -4205,13 +4172,6 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } - namespace_unlock(); - - if (rootmnt) - mntput(rootmnt); - if (pwdmnt) - mntput(pwdmnt); - ns_tree_add_raw(new_ns); return new_ns; } @@ -4436,7 +4396,8 @@ err_unlock: return ret; } -static inline int vfs_move_mount(struct path *from_path, struct path *to_path, +static inline int vfs_move_mount(const struct path *from_path, + const struct path *to_path, enum mnt_tree_flags_t mflags) { int ret; @@ -4542,7 +4503,7 @@ SYSCALL_DEFINE5(move_mount, /* * Return true if path is reachable from root * - * namespace_sem or mount_lock is held + * locks: mount_locked_reader || namespace_shared && is_mounted(mnt) */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) @@ -4556,11 +4517,8 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, bool path_is_under(const struct path *path1, const struct path *path2) { - bool res; - read_seqlock_excl(&mount_lock); - res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); - read_sequnlock_excl(&mount_lock); - return res; + guard(mount_locked_reader)(); + return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); } EXPORT_SYMBOL(path_is_under); @@ -4592,9 +4550,10 @@ EXPORT_SYMBOL(path_is_under); SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { - struct path new, old, root; + struct path new __free(path_put) = {}; + struct path old __free(path_put) = {}; + struct path root __free(path_put) = {}; struct mount 
*new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; - struct pinned_mountpoint old_mp = {}; int error; if (!may_mount()) @@ -4603,57 +4562,54 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) - goto out0; + return error; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) - goto out1; + return error; error = security_sb_pivotroot(&old, &new); if (error) - goto out2; + return error; get_fs_root(current->fs, &root); - error = lock_mount(&old, &old_mp); - if (error) - goto out3; - error = -EINVAL; + LOCK_MOUNT(old_mp, &old); + old_mnt = old_mp.parent; + if (IS_ERR(old_mnt)) + return PTR_ERR(old_mnt); + new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); - old_mnt = real_mount(old.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) - goto out4; + return -EINVAL; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) - goto out4; + return -EINVAL; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) - goto out4; - error = -ENOENT; + return -EINVAL; if (d_unlinked(new.dentry)) - goto out4; - error = -EBUSY; + return -ENOENT; if (new_mnt == root_mnt || old_mnt == root_mnt) - goto out4; /* loop, on the same file system */ - error = -EINVAL; + return -EBUSY; /* loop, on the same file system */ if (!path_mounted(&root)) - goto out4; /* not a mountpoint */ + return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) - goto out4; /* absolute root */ + return -EINVAL; /* absolute root */ if (!path_mounted(&new)) - goto out4; /* not a mountpoint */ + return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) - goto out4; /* absolute root */ + return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ - if (!is_path_reachable(old_mnt, old.dentry, &new)) - goto out4; + if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new)) + return -EINVAL; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) - goto out4; + return -EINVAL; lock_mount_hash(); umount_mnt(new_mnt); if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { @@ -4672,17 +4628,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); - error = 0; -out4: - unlock_mount(&old_mp); -out3: - path_put(&root); -out2: - path_put(&old); -out1: - path_put(&new); -out0: - return error; + return 0; } static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) @@ -4772,8 +4718,10 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) if (!mnt_allow_writers(kattr, m)) { err = mnt_hold_writers(m); - if (err) + if (err) { + m = next_mnt(m, mnt); break; + } } if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) @@ -4781,25 +4729,9 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) } if (err) { - struct mount *p; - - /* - * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will - * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all - * mounts and needs to take care to include the first mount. - */ - for (p = mnt; p; p = next_mnt(p, mnt)) { - /* If we had to hold writers unblock them. */ - if (p->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt_unhold_writers(p); - - /* - * We're done once the first mount we changed got - * MNT_WRITE_HOLD unset. 
- */ - if (p == m) - break; - } + /* undo all mnt_hold_writers() we'd done */ + for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt)) + mnt_unhold_writers(p); } return err; } @@ -4830,8 +4762,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) WRITE_ONCE(m->mnt.mnt_flags, flags); /* If we had to hold writers unblock them. */ - if (m->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt_unhold_writers(m); + mnt_unhold_writers(m); if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); @@ -4841,7 +4772,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) touch_mnt_namespace(mnt->mnt_ns); } -static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) +static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr) { struct mount *mnt = real_mount(path->mnt); int err = 0; @@ -5639,6 +5570,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) STATMOUNT_MNT_UIDMAP | \ STATMOUNT_MNT_GIDMAP) +/* locks: namespace_shared */ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { @@ -5885,7 +5817,7 @@ retry: if (ret) return ret; - scoped_guard(rwsem_read, &namespace_sem) + scoped_guard(namespace_shared) ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); if (!ret) @@ -5906,6 +5838,7 @@ struct klistmount { struct path root; }; +/* locks: namespace_shared */ static ssize_t do_listmount(struct klistmount *kls, bool reverse) { struct mnt_namespace *ns = kls->ns; @@ -6040,7 +5973,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, * We only need to guard against mount topology changes as * listmount() doesn't care about any mount properties. */ - scoped_guard(rwsem_read, &namespace_sem) + scoped_guard(namespace_shared) ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; @@ -6127,12 +6060,10 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!ns_ref_put(ns)) return; - namespace_lock(); + guard(namespace_excl)(); emptied_ns = ns; - lock_mount_hash(); + guard(mount_writer)(); umount_tree(ns->root, 0); - unlock_mount_hash(); - namespace_unlock(); } struct vfsmount *kern_mount(struct file_system_type *type) @@ -6181,25 +6112,18 @@ bool our_mnt(struct vfsmount *mnt) bool current_chrooted(void) { /* Does the current process have a non-standard root */ - struct path ns_root; - struct path fs_root; - bool chrooted; - - /* Find the namespace root */ - ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; - ns_root.dentry = ns_root.mnt->mnt_root; - path_get(&ns_root); - while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) - ; + struct path fs_root __free(path_put) = {}; + struct mount *root; get_fs_root(current->fs, &fs_root); - chrooted = !path_equal(&fs_root, &ns_root); + /* Find the namespace root */ + + guard(mount_locked_reader)(); - path_put(&fs_root); - path_put(&ns_root); + root = topmost_overmount(current->nsproxy->mnt_ns->root); - return chrooted; + return fs_root.mnt != &root->mnt || !path_mounted(&fs_root); } static bool mnt_already_visible(struct mnt_namespace *ns, @@ -6208,9 +6132,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns, { int new_flags = *new_mnt_flags; struct mount *mnt, *n; - bool visible = false; - down_read(&namespace_sem); + guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; @@ -6257,13 +6180,10 @@ static bool mnt_already_visible(struct mnt_namespace *ns, /* Preserve the 
locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); - visible = true; - goto found; + return true; next: ; } -found: - up_read(&namespace_sem); - return visible; + return false; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 5d6edafbed20..0e4c67373e4f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -676,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, struct pnfs_layout_segment *lseg; struct xdr_buf buf; struct xdr_stream xdr; - struct page *scratch; + struct folio *scratch; int status, i; uint32_t count; __be32 *p; @@ -689,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, return ERR_PTR(-ENOMEM); status = -ENOMEM; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4); @@ -744,7 +744,7 @@ process_extents: } out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: dprintk("%s returns %d\n", __func__, status); switch (status) { diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 44306ac22353..ab76120705e2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -541,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct pnfs_block_dev *top; struct xdr_stream xdr; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; int nr_volumes, ret, i; __be32 *p; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p) @@ -582,7 +582,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, out_free_volumes: kfree(volumes); out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: return node; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86bdc7d23fb9..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, @@ -153,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, ret = svc_bind(serv, net); if (ret < 0) { printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto err_bind; + goto err; } ret = 0; @@ -166,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); - goto err_socks; + goto err; } return 0; -err_socks: - svc_rpcb_cleanup(serv, net); -err_bind: +err: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %x\n", ret, net->ns.inum); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d81217923936..46d9c65d50f8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -829,17 +829,17 @@ static int 
nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, struct address_space *mapping = desc->file->f_mapping; struct folio *new, *folio = *arrays; struct xdr_stream stream; - struct page *scratch; + struct folio *scratch; struct xdr_buf buf; u64 cookie; int status; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (scratch == NULL) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); do { status = nfs_readdir_entry_decode(desc, entry, &stream); @@ -891,7 +891,7 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); - put_page(scratch); + folio_put(scratch); return status; } @@ -2198,8 +2198,6 @@ no_open: else dput(dentry); } - if (IS_ERR(res)) - return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); @@ -2260,7 +2258,7 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, umode_t mode) { - + struct dentry *res = NULL; /* Same as look+open from lookup_open(), but with different O_TRUNC * handling. */ @@ -2275,21 +2273,15 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, if (error) return error; return finish_open(file, dentry, NULL); - } else if (d_in_lookup(dentry)) { + } + if (d_in_lookup(dentry)) { /* The only flags nfs_lookup considers are * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and * we want those to be zero so the lookup isn't skipped. */ - struct dentry *res = nfs_lookup(dir, dentry, 0); - - d_lookup_done(dentry); - if (unlikely(res)) { - if (IS_ERR(res)) - return PTR_ERR(res); - return finish_no_open(file, res); - } + res = nfs_lookup(dir, dentry, 0); } - return finish_no_open(file, NULL); + return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open_v23); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8059ece82468..d020aab40c64 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -161,6 +161,8 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; + trace_nfs_file_read(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_read(iocb, to, false); @@ -361,6 +363,8 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, if (pnfs_ld_read_whole_page(file_inode(file))) return true; + if (folio_test_dropbehind(folio)) + return false; /* Open for reading too? 
*/ if (file->f_mode & FMODE_READ) return true; @@ -380,22 +384,23 @@ static int nfs_write_begin(const struct kiocb *iocb, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { - fgf_t fgp = FGP_WRITEBEGIN; struct folio *folio; struct file *file = iocb->ki_filp; int once_thru = 0; int ret; + trace_nfs_write_begin(file_inode(file), pos, len); + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); - fgp |= fgf_set_order(len); start: - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - return PTR_ERR(folio); + folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out; + } *foliop = folio; ret = nfs_flush_incompatible(file, folio); @@ -405,11 +410,14 @@ start: } else if (!once_thru && nfs_want_read_modify_write(file, folio, pos, len)) { once_thru = 1; + folio_clear_dropbehind(folio); ret = nfs_read_folio(file, folio); folio_put(folio); if (!ret) goto start; } +out: + trace_nfs_write_begin_done(file_inode(file), pos, len, ret); return ret; } @@ -423,6 +431,7 @@ static int nfs_write_end(const struct kiocb *iocb, unsigned offset = offset_in_folio(folio, pos); int status; + trace_nfs_write_end(file_inode(file), pos, len); dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); @@ -451,13 +460,16 @@ static int nfs_write_end(const struct kiocb *iocb, folio_unlock(folio); folio_put(folio); - if (status < 0) + if (status < 0) { + trace_nfs_write_end_done(file_inode(file), pos, len, status); return status; + } NFS_I(mapping->host)->write_io += copied; if (nfs_ctx_key_to_expire(ctx, mapping->host)) nfs_wb_all(mapping->host); + trace_nfs_write_end_done(file_inode(file), pos, len, copied); return copied; } @@ -690,6 +702,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) errseq_t since; int error; + trace_nfs_file_write(iocb, from); + result = nfs_key_timeout_notify(file, inode); if (result) return result; @@ -949,5 +963,6 @@ const struct file_operations nfs_file_operations = { .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, + .fop_flags = FOP_DONTCACHE, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d39a1f58e18d..5c4551117c58 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -646,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; __be32 *p; uint32_t nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */ @@ -724,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, fl->fh_array[i]->size); } - __free_page(scratch); + folio_put(scratch); return 0; out_err: - __free_page(scratch); + folio_put(scratch); return -EIO; } diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 
29d9234d5c08..df79aeb68db4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -73,18 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_file_layout_dsaddr *dsaddr = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4); @@ -186,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, } } - __free_page(scratch); + folio_put(scratch); return dsaddr; out_err_drain_dsaddrs: @@ -204,7 +204,7 @@ out_err_free_deviceid: out_err_free_stripe_indices: kfree(stripe_indices); out_err_free_scratch: - __free_page(scratch); + folio_put(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 9edb5f9b0c4e..df01d2876b68 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -47,7 +47,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror); + struct nfs4_ff_layout_ds_stripe *dss_info); static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) @@ -164,31 +164,32 @@ decode_name(struct xdr_stream *xdr, u32 *id) } static struct nfsd_file * -ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, +ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id, struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, fmode_t mode) { #if IS_ENABLED(CONFIG_NFS_LOCALIO) struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode); + return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode); #else return NULL; #endif } -static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, - const struct nfs4_ff_layout_mirror *m2) +static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1, + const struct nfs4_ff_layout_ds_stripe *dss2) { int i, j; - if (m1->fh_versions_cnt != m2->fh_versions_cnt) + if (dss1->fh_versions_cnt != dss2->fh_versions_cnt) return false; - for (i = 0; i < m1->fh_versions_cnt; i++) { + + for (i = 0; i < dss1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; j++) { - if (nfs_compare_fh(&m1->fh_versions[i], - &m2->fh_versions[j]) == 0) { + for (j = 0; j < dss2->fh_versions_cnt; j++) { + if (nfs_compare_fh(&dss1->fh_versions[i], + &dss2->fh_versions[j]) == 0) { found_fh = true; break; } @@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return true; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < 
m1->dss_count; dss_id++) + if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id])) + return false; + + return true; +} + +static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (memcmp(&m1->dss[dss_id].devid, + &m2->dss[dss_id].devid, + sizeof(m1->dss[dss_id].devid)) != 0) + return false; + + return true; +} + static struct nfs4_ff_layout_mirror * ff_layout_add_mirror(struct pnfs_layout_hdr *lo, struct nfs4_ff_layout_mirror *mirror) @@ -209,7 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) + if (!ff_mirror_match_devid(mirror, pos)) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; @@ -240,29 +273,37 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) { struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; mirror = kzalloc(sizeof(*mirror), gfp_flags); if (mirror != NULL) { spin_lock_init(&mirror->lock); refcount_set(&mirror->ref, 1); INIT_LIST_HEAD(&mirror->mirrors); - nfs_localio_file_init(&mirror->nfl); + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) + nfs_localio_file_init(&mirror->dss[dss_id].nfl); } return mirror; } static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { - const struct cred *cred; + const struct cred *cred; + u32 dss_id; ff_layout_remove_mirror(mirror); - kfree(mirror->fh_versions); - nfs_close_local_fh(&mirror->nfl); - cred = rcu_access_pointer(mirror->ro_cred); - put_cred(cred); - cred = rcu_access_pointer(mirror->rw_cred); - put_cred(cred); - nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + kfree(mirror->dss[dss_id].fh_versions); + cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred); + put_cred(cred); + cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred); + put_cred(cred); + nfs_close_local_fh(&mirror->dss[dss_id].nfl); + nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds); + } + + kfree(mirror->dss); kfree(mirror); } @@ -366,14 +407,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo, free_me); } +static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror) +{ + u32 dss_id, sum = 0; + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) + sum += mirror->dss[dss_id].efficiency; + + return sum; +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) - if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) + if (ff_mirror_efficiency_sum(fls->mirror_array[i]) < + ff_mirror_efficiency_sum(fls->mirror_array[j])) swap(fls->mirror_array[i], fls->mirror_array[j]); } @@ -388,20 +439,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_segment *fls = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; u64 stripe_unit; u32 mirror_array_cnt; __be32 *p; int i, rc; + struct nfs4_ff_layout_ds_stripe *dss_info; dprintk("--> %s\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return ERR_PTR(-ENOMEM); xdr_init_decode_pages(&stream, &buf, 
lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO; @@ -427,23 +479,32 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array_cnt = mirror_array_cnt; fls->stripe_unit = stripe_unit; + u32 dss_count = 0; for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; struct cred *kcred; const struct cred __rcu *cred; kuid_t uid; kgid_t gid; - u32 ds_count, fh_count, id; - int j; + u32 fh_count, id; + int j, dss_id; rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; - ds_count = be32_to_cpup(p); - /* FIXME: allow for striping? */ - if (ds_count != 1) + // Ensure all mirrors have same stripe count. + if (dss_count == 0) + dss_count = be32_to_cpup(p); + else if (dss_count != be32_to_cpup(p)) + goto out_err_free; + + if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT || + dss_count == 0) + goto out_err_free; + + if (dss_count > 1 && stripe_unit == 0) goto out_err_free; fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); @@ -452,91 +513,105 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; } - fls->mirror_array[i]->ds_count = ds_count; + fls->mirror_array[i]->dss_count = dss_count; + fls->mirror_array[i]->dss = + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), + gfp_flags); - /* deviceid */ - rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); - if (rc) - goto out_err_free; + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + dss_info->mirror = fls->mirror_array[i]; - /* efficiency */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fls->mirror_array[i]->efficiency = be32_to_cpup(p); + /* deviceid */ + rc = decode_deviceid(&stream, &dss_info->devid); + if (rc) + goto out_err_free; - /* stateid */ - rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); - if (rc) - goto out_err_free; + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + dss_info->efficiency = be32_to_cpup(p); - /* fh */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fh_count = be32_to_cpup(p); + /* stateid */ + rc = decode_pnfs_stateid(&stream, &dss_info->stateid); + if (rc) + goto out_err_free; - fls->mirror_array[i]->fh_versions = - kcalloc(fh_count, sizeof(struct nfs_fh), - gfp_flags); - if (fls->mirror_array[i]->fh_versions == NULL) { - rc = -ENOMEM; - goto out_err_free; - } + /* fh */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); - for (j = 0; j < fh_count; j++) { - rc = decode_nfs_fh(&stream, - &fls->mirror_array[i]->fh_versions[j]); + dss_info->fh_versions = + kcalloc(fh_count, sizeof(struct nfs_fh), + gfp_flags); + if (dss_info->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &dss_info->fh_versions[j]); + if (rc) + goto out_err_free; + } + + dss_info->fh_versions_cnt = fh_count; + + /* user */ + rc = decode_name(&stream, &id); if (rc) goto out_err_free; - } - fls->mirror_array[i]->fh_versions_cnt = fh_count; + uid = make_kuid(&init_user_ns, id); - /* user */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + /* group */ + rc = decode_name(&stream, &id); + if (rc) + goto out_err_free; - uid = make_kuid(&init_user_ns, id); + gid = make_kgid(&init_user_ns, id); 
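As an aside on the striping math used throughout this patch: every caller maps a file offset to a data-server stripe with nfs4_ff_layout_calc_dss_id(stripe_unit, dss_count, offset), whose body is not part of this excerpt. A minimal sketch of the round-robin mapping it is assumed to implement (helper name hypothetical):

	#include <linux/math64.h>

	static u32 example_calc_dss_id(u64 stripe_unit, u32 dss_count, u64 offset)
	{
		u64 stripe_no;
		u32 dss_id;

		if (dss_count <= 1 || stripe_unit == 0)
			return 0;
		/* index of the stripe-unit-sized chunk holding @offset */
		stripe_no = div64_u64(offset, stripe_unit);
		/* chunks are dealt out round-robin across the stripes */
		div_u64_rem(stripe_no, dss_count, &dss_id);
		return dss_id;
	}

This is consistent with the decode above, which rejects layouts where dss_count > 1 but stripe_unit == 0, and with ff_layout_pg_test() below, which refuses to coalesce requests that fall into different stripe-unit chunks.
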
- /* group */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + if (gfp_flags & __GFP_FS) + kcred = prepare_kernel_cred(&init_task); + else { + unsigned int nofs_flags = memalloc_nofs_save(); - gid = make_kgid(&init_user_ns, id); + kcred = prepare_kernel_cred(&init_task); + memalloc_nofs_restore(nofs_flags); + } + rc = -ENOMEM; + if (!kcred) + goto out_err_free; + kcred->fsuid = uid; + kcred->fsgid = gid; + cred = RCU_INITIALIZER(kcred); - if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(&init_task); - else { - unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(&init_task); - memalloc_nofs_restore(nofs_flags); + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(dss_info->ro_cred, cred); + else + rcu_assign_pointer(dss_info->rw_cred, cred); } - rc = -ENOMEM; - if (!kcred) - goto out_err_free; - kcred->fsuid = uid; - kcred->fsgid = gid; - cred = RCU_INITIALIZER(kcred); - - if (lgr->range.iomode == IOMODE_READ) - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - else - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { - /* swap cred ptrs so free_mirror will clean up old */ - if (lgr->range.iomode == IOMODE_READ) { - cred = xchg(&mirror->ro_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - } else { - cred = xchg(&mirror->rw_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->dss[dss_id].ro_cred, + dss_info->ro_cred); + rcu_assign_pointer(dss_info->ro_cred, cred); + } else { + cred = xchg(&mirror->dss[dss_id].rw_cred, + dss_info->rw_cred); + rcu_assign_pointer(dss_info->rw_cred, cred); + } } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; @@ -564,7 +639,7 @@ out_sort_mirrors: ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: - __free_page(scratch); + folio_put(scratch); return ret; out_err_free: _ff_layout_free_lseg(fls); @@ -593,6 +668,26 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) _ff_layout_free_lseg(fls); } +static u32 calc_commit_idx(struct pnfs_layout_segment *lseg, + u32 mirror_idx, u32 dss_id) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id; +} + +static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + +static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + static void nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { @@ -617,6 +712,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) static bool nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, struct nfs4_ff_layoutstat *layoutstat, ktime_t now) { @@ -624,8 +720,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); - if (!mirror->start_time) - mirror->start_time = now; + if 
(!mirror->dss[dss_id].start_time) + mirror->dss[dss_id].start_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) @@ -675,13 +771,16 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, static void nfs4_ff_layout_stat_io_start_read(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, dss_id, &mirror->dss[dss_id].read_stat, now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].read_stat, requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -692,11 +791,12 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, static void nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed) { spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); @@ -706,13 +806,20 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, static void nfs4_ff_layout_stat_io_start_write(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, + dss_id, + &mirror->dss[dss_id].write_stat, + now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].write_stat, + requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -723,6 +830,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, static void nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed, enum nfs3_stable_how committed) @@ -731,25 +839,25 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, requested = completed = 0; spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } static void -ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_unavailable(devid); } static void -ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_available(devid); @@ -758,6 +866,7 @@ ff_layout_mark_ds_reachable(struct 
pnfs_layout_segment *lseg, u32 idx) static struct nfs4_pnfs_ds * ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id, bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); @@ -768,12 +877,16 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); + *dss_id = nfs4_ff_layout_calc_dss_id( + fls->stripe_unit, + fls->mirror_array[idx]->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false); if (IS_ERR(ds)) continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) { + nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) { // reinitialize the error state in case if this is the last iteration ds = ERR_PTR(-EINVAL); continue; @@ -788,42 +901,52 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, static struct nfs4_pnfs_ds * ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, false); } static struct nfs4_pnfs_ds * ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, true); } static struct nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { struct nfs4_pnfs_ds *ds; - ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); if (!IS_ERR(ds)) return ds; - return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); } static struct nfs4_pnfs_ds * ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, - u32 *best_idx) + u32 *best_idx, + u32 offset, + u32 *dss_id) { struct pnfs_layout_segment *lseg = pgio->pg_lseg; struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, - best_idx); + best_idx, offset, dss_id); if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; - return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); + return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx, + offset, dss_id); } static void @@ -842,6 +965,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, } } +static bool +ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls) +{ + return fls->mirror_array[0]->dss_count > 1; +} + +/* + * ff_layout_pg_test(). Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
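A worked example of the bound stated above, assuming a stripe_unit of 1 MiB and a layout segment starting at offset 0:

	req_offset(req)  = 1.5 MiB  -> chunk 1
	req_offset(prev) = 1.25 MiB -> chunk 1   (same chunk, so coalescing is allowed)
	stripe_offset    = 1.5 MiB % 1 MiB = 512 KiB
	returned size    = min(1 MiB - 512 KiB, size) = at most 512 KiB

so the I/O is clipped at the 2 MiB chunk boundary; had prev sat at 0.5 MiB (chunk 0), p_stripe != r_stripe and the function would return 0 instead.
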
+ */ +static size_t +ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 p_stripe, r_stripe; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg))) + return size; + + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + if (p_stripe != r_stripe) + return 0; + } + + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); +} + static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) @@ -849,7 +1022,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - u32 ds_idx; + u32 ds_idx, dss_id; if (NFS_SERVER(pgio->pg_inode)->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) @@ -870,7 +1043,8 @@ retry: /* Reset wb_nio, since getting layout segment was successful */ req->wb_nio = 0; - ds = ff_layout_get_ds_for_read(pgio, &ds_idx); + ds = ff_layout_get_ds_for_read(pgio, &ds_idx, + req_offset(req), &dss_id); if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -882,7 +1056,7 @@ retry: mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgm = &pgio->pg_mirrors[0]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize; pgio->pg_mirror_idx = ds_idx; return; @@ -919,7 +1093,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; struct nfs4_pnfs_ds *ds; - u32 i; + u32 i, dss_id; retry: pnfs_generic_pg_check_layout(pgio, req); @@ -944,7 +1118,12 @@ retry: for (i = 0; i < pgio->pg_mirror_count; i++) { mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit, + mirror->dss_count, + req_offset(req)); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, + dss_id, true); if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -954,7 +1133,7 @@ retry: goto retry; } pgm = &pgio->pg_mirrors[i]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize; } if (NFS_SERVER(pgio->pg_inode)->flags & @@ -1020,14 +1199,14 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) static const struct nfs_pageio_ops ff_layout_pg_read_ops = { .pg_init = ff_layout_pg_init_read, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_readpages, .pg_cleanup = pnfs_generic_pg_cleanup, }; static const struct nfs_pageio_ops ff_layout_pg_write_ops = { .pg_init = ff_layout_pg_init_write, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = 
pnfs_generic_pg_writepages, .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, .pg_cleanup = pnfs_generic_pg_cleanup, @@ -1075,9 +1254,11 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + u32 dss_id = 0; struct nfs4_pnfs_ds *ds; - ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx); + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx, + hdr->args.offset, &dss_id); if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); else @@ -1114,11 +1295,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; switch (op_status) { @@ -1215,9 +1396,9 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, u32 op_status, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); switch (op_status) { case NFS_OK: @@ -1281,12 +1462,12 @@ static int ff_layout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { int vers = clp->cl_nfs_mod->rpc_vers->number; if (task->tk_status >= 0) { - ff_layout_mark_ds_reachable(lseg, idx); + ff_layout_mark_ds_reachable(lseg, idx, dss_id); return 0; } @@ -1297,10 +1478,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: return ff_layout_async_handle_error_v3(task, op_status, clp, - lseg, idx); + lseg, idx, dss_id); case 4: return ff_layout_async_handle_error_v4(task, op_status, state, - clp, lseg, idx); + clp, lseg, idx, dss_id); default: /* should never happen */ WARN_ON_ONCE(1); @@ -1309,7 +1490,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - u32 idx, u64 offset, u64 length, + u32 idx, u32 dss_id, u64 offset, u64 length, u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; @@ -1347,7 +1528,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, offset, length, status, opnum, + mirror, dss_id, offset, length, status, opnum, nfs_io_gfp_mask()); switch (status) { @@ -1356,7 +1537,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case NFS4ERR_PERM: break; case NFS4ERR_NXIO: - ff_layout_mark_ds_unreachable(lseg, idx); + ff_layout_mark_ds_unreachable(lseg, idx, dss_id); /* * Don't return the layout if this is a read and we still * have layouts to try @@ -1376,10 +1557,16 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + 
flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_READ, task->tk_status); @@ -1389,7 +1576,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, err = ff_layout_async_handle_error(task, hdr->res.op_status, hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_read(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1445,23 +1633,47 @@ ff_layout_set_layoutcommit(struct inode *inode, static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_read(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_read( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - hdr->res.count); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_read( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1549,11 +1761,17 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); loff_t end_offs = 0; int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_WRITE, task->tk_status); @@ -1563,7 +1781,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task, err = ff_layout_async_handle_error(task, hdr->res.op_status, hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_write(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1601,9 +1820,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { int err; + u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index); + u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index); if (task->tk_status < 0) { - ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, + ff_layout_io_track_ds_error(data->lseg, idx, dss_id, data->args.offset, data->args.count, &data->res.op_status, OP_COMMIT, 
task->tk_status); @@ -1611,8 +1832,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } err = ff_layout_async_handle_error(task, data->res.op_status, - NULL, data->ds_clp, data->lseg, - data->ds_commit_index); + NULL, data->ds_clp, data->lseg, idx, + dss_id); trace_nfs4_pnfs_commit_ds(data, err); switch (err) { @@ -1631,30 +1852,54 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); - return 0; } static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_write(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_write( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_write( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count, + hdr->res.verf->committed); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1737,10 +1982,16 @@ static void ff_layout_write_release(void *data) static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + u32 idx, dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) return; + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_start_write(cdata->inode, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, 0, task->tk_start); } @@ -1749,6 +2000,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, { struct nfs_page *req; __u64 count = 0; + u32 idx, dss_id; if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) return; @@ -1757,8 +2009,12 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, list_for_each_entry(req, &cdata->pages, wb_list) count += req->wb_bytes; } + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, count, count, NFS_FILE_SYNC); set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } @@ -1872,6 +2128,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) u32 idx = hdr->pgio_mirror_idx; int vers; struct nfs_fh *fh; + u32 dss_id; bool ds_fatal_error = false; dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", @@ -1879,22 +2136,26 @@ ff_layout_read_pagelist(struct 
nfs_pgio_header *hdr) hdr->args.pgbase, (size_t)hdr->args.count, offset); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false); if (IS_ERR(ds)) { ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -1902,11 +2163,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->pgio_done_cb = ff_layout_read_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(mirror); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1916,7 +2177,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->mds_offset = offset; /* Start IO accounting for local read */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ); + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ); if (localio) { hdr->task.tk_start = ktime_get(); ff_layout_read_record_layoutstats_start(&hdr->task, hdr); @@ -1953,25 +2215,30 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) int vers; struct nfs_fh *fh; u32 idx = hdr->pgio_mirror_idx; + u32 dss_id; bool ds_fatal_error = false; mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); if (IS_ERR(ds)) { ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, @@ -1981,12 +2248,12 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->pgio_done_cb = ff_layout_write_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(mirror); + hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across 
DSes, @@ -1995,7 +2262,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->args.offset = offset; /* Start IO accounting for local write */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { hdr->task.tk_start = ktime_get(); @@ -2019,20 +2286,15 @@ out_failed: return PNFS_NOT_ATTEMPTED; } -static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) -{ - return i; -} - static struct nfs_fh * -select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id) { struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); /* FIXME: Assume that there is only one NFS version available * for the DS. */ - return &flseg->mirror_array[i]->fh_versions[0]; + return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0]; } static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) @@ -2043,7 +2305,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct nfsd_file *localio; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; - u32 idx; + u32 idx, dss_id; int vers, ret; struct nfs_fh *fh; @@ -2051,22 +2313,23 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))) goto out_err; - idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); + dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); if (IS_ERR(ds)) goto out_err; ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - data->inode); + data->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), @@ -2075,12 +2338,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) data->cred = ds_cred; refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; - fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + fh = select_ds_fh_from_commit(lseg, idx, dss_id); if (fh) data->args.fh = fh; /* Start IO accounting for local commit */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { data->task.tk_start = ktime_get(); @@ -2144,25 +2407,28 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) struct nfs4_pnfs_ds *ds; struct nfs_client *ds_clp; struct rpc_clnt *clnt; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { mirror = flseg->mirror_array[idx]; - mirror_ds = mirror->mirror_ds; - if (IS_ERR_OR_NULL(mirror_ds)) - continue; - ds = mirror->mirror_ds->ds; - if (!ds) - continue; - ds_clp = ds->ds_clp; - if (!ds_clp) - continue; - clnt = ds_clp->cl_rpcclient; - if (!clnt) - continue; - if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) - 
continue; - rpc_clnt_disconnect(clnt); + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + mirror_ds = mirror->dss[dss_id].mirror_ds; + if (IS_ERR_OR_NULL(mirror_ds)) + continue; + ds = mirror->dss[dss_id].mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, + ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } } } @@ -2184,8 +2450,9 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); struct inode *inode = lseg->pls_layout->plh_inode; struct pnfs_commit_array *array, *new; + u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count; - new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, + new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); @@ -2549,11 +2816,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_ds_stripe *dss_info) { struct nfs4_pnfs_ds_addr *da; - struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; - struct nfs_fh *fh = &mirror->fh_versions[0]; + struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds; + struct nfs_fh *fh = &dss_info->fh_versions[0]; __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); @@ -2565,13 +2832,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4 + fh->size); xdr_encode_opaque(p, fh->data, fh->size); /* ff_io_latency4 read */ - spin_lock(&mirror->lock); - ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + spin_lock(&dss_info->mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->read_stat.io_stat); /* ff_io_latency4 write */ - ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); - spin_unlock(&mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->write_stat.io_stat); + spin_unlock(&dss_info->mirror->lock); /* nfstime4 */ - ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + ff_layout_encode_nfstime(xdr, + ktime_sub(ktime_get(), + dss_info->start_time)); /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); @@ -2595,7 +2866,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, static void ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) { - struct nfs4_ff_layout_mirror *mirror = opaque->data; + struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data; + struct nfs4_ff_layout_mirror *mirror = dss_info->mirror; ff_layout_put_mirror(mirror); } @@ -2612,37 +2884,47 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds_stripe *dss_info; struct nfs4_deviceid_node *dev; - int i = 0; + int i = 0, dss_id; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (i >= dev_limit) - break; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) - continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, - &mirror->flags) && - type != NFS4_FF_OP_LAYOUTRETURN) - continue; - /* mirror refcount put in cleanup_layoutstats */ - if (!refcount_inc_not_zero(&mirror->ref)) - continue; - dev = &mirror->mirror_ds->id_node; - memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); - 
devinfo->offset = 0; - devinfo->length = NFS4_MAX_UINT64; - spin_lock(&mirror->lock); - devinfo->read_count = mirror->read_stat.io_stat.ops_completed; - devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; - devinfo->write_count = mirror->write_stat.io_stat.ops_completed; - devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; - spin_unlock(&mirror->lock); - devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->ld_private.ops = &layoutstat_ops; - devinfo->ld_private.data = mirror; - - devinfo++; - i++; + for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) { + dss_info = &mirror->dss[dss_id]; + if (i >= dev_limit) + break; + if (IS_ERR_OR_NULL(dss_info->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!refcount_inc_not_zero(&mirror->ref)) + continue; + dev = &dss_info->mirror_ds->id_node; + memcpy(&devinfo->dev_id, + &dev->deviceid, + NFS4_DEVICEID4_SIZE); + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + spin_lock(&mirror->lock); + devinfo->read_count = + dss_info->read_stat.io_stat.ops_completed; + devinfo->read_bytes = + dss_info->read_stat.io_stat.bytes_completed; + devinfo->write_count = + dss_info->write_stat.io_stat.ops_completed; + devinfo->write_bytes = + dss_info->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = &mirror->dss[dss_id]; + + devinfo++; + i++; + } } return i; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 095df09017a5..17a008c8e97c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -21,6 +21,8 @@ * due to network error etc. 
*/ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096 + /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) #define FF_LAYOUTSTATS_MAXDEV 4 @@ -71,12 +73,12 @@ struct nfs4_ff_layoutstat { struct nfs4_ff_busy_timer busy_timer; }; -struct nfs4_ff_layout_mirror { - struct pnfs_layout_hdr *layout; - struct list_head mirrors; - u32 ds_count; - u32 efficiency; +struct nfs4_ff_layout_mirror; + +struct nfs4_ff_layout_ds_stripe { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; + u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; @@ -84,12 +86,19 @@ struct nfs4_ff_layout_mirror { const struct cred __rcu *ro_cred; const struct cred __rcu *rw_cred; struct nfs_file_localio nfl; - refcount_t ref; - spinlock_t lock; - unsigned long flags; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; +}; + +struct nfs4_ff_layout_mirror { + struct pnfs_layout_hdr *layout; + struct list_head mirrors; + u32 dss_count; + struct nfs4_ff_layout_ds_stripe *dss; + refcount_t ref; + spinlock_t lock; + unsigned long flags; u32 report_interval; }; @@ -150,12 +159,12 @@ FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) } static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); if (mirror != NULL) { - struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds; + struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds; if (!IS_ERR_OR_NULL(mirror_ds)) return &mirror_ds->id_node; @@ -182,9 +191,22 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) } static inline int -nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id) +{ + return mirror->dss[dss_id].mirror_ds->ds_versions[0].version; +} + +static inline u32 +nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset) { - return mirror->mirror_ds->ds_versions[0].version; + u64 tmp = offset; + + if (dss_count == 1 || stripe_unit == 0) + return 0; + + do_div(tmp, stripe_unit); + + return do_div(tmp, dss_count); } struct nfs4_ff_layout_ds * @@ -193,9 +215,9 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags); + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags); void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); @@ -204,23 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id); void nfs4_ff_layout_select_ds_stateid(const struct 
nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid); + u32 dss_id, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return); struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, - struct inode *inode); + struct inode *inode, + u32 dss_id); const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred); + const struct cred *mdscred, + u32 dss_id); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 30365ec782bb..c55ea8fa3bfa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -44,7 +44,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; @@ -56,7 +56,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, int i, ret = -ENOMEM; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; @@ -70,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4); @@ -163,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, kfree(da); } - __free_page(scratch); + folio_put(scratch); return new_ds; out_err_drain_dsaddrs: @@ -177,7 +177,7 @@ out_err_drain_dsaddrs: kfree(ds_versions); out_scratch: - __free_page(scratch); + folio_put(scratch); out_err: kfree(new_ds); @@ -250,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; if (status == 0) return 0; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) + if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds)) return -EINVAL; dserr = kmalloc(sizeof(*dserr), gfp_flags); @@ -271,8 +271,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, dserr->length = length; dserr->status = status; dserr->opnum = opnum; - nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); - memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid); + memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); @@ -282,14 +282,14 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, } static const struct cred * -ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) +ff_layout_get_mirror_cred(struct 
nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id) { const struct cred *cred, __rcu **pcred; if (iomode == IOMODE_READ) - pcred = &mirror->ro_cred; + pcred = &mirror->dss[dss_id].ro_cred; else - pcred = &mirror->rw_cred; + pcred = &mirror->dss[dss_id].rw_cred; rcu_read_lock(); do { @@ -304,43 +304,45 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id) { /* FIXME: For now assume there is only 1 version available for the DS */ - return &mirror->fh_versions[0]; + return &mirror->dss[dss_id].fh_versions[0]; } void nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid) + u32 dss_id, + nfs4_stateid *stateid) { - if (nfs4_ff_layout_ds_version(mirror) == 4) - nfs4_stateid_copy(stateid, &mirror->stateid); + if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4) + nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid); } static bool ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id) { if (mirror == NULL) goto outerr; - if (mirror->mirror_ds == NULL) { + if (mirror->dss[dss_id].mirror_ds == NULL) { struct nfs4_deviceid_node *node; struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), - &mirror->devid, lo->plh_lc_cred, + &mirror->dss[dss_id].devid, lo->plh_lc_cred, GFP_KERNEL); if (node) mirror_ds = FF_LAYOUT_MIRROR_DS(node); /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) && mirror_ds != ERR_PTR(-ENODEV)) nfs4_put_deviceid_node(node); } - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) goto outerr; return true; @@ -352,6 +354,7 @@ outerr: * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on * @mirror: layout mirror describing the DS to use + * @dss_id: DS stripe id to select stripe to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -368,6 +371,7 @@ outerr: struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return) { struct nfs4_pnfs_ds *ds; @@ -376,10 +380,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, unsigned int max_payload; int status = -EAGAIN; - if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror)) + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id)) goto noconnect; - ds = mirror->mirror_ds->ds; + ds = mirror->dss[dss_id].mirror_ds->ds; if (READ_ONCE(ds->ds_clp)) goto out; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -388,10 +392,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. 
*/ - status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node, + status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node, dataserver_timeo, dataserver_retrans, - mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version); + mirror->dss[dss_id].mirror_ds->ds_versions[0].version, + mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (!status) { @@ -404,15 +408,15 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); - if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) - mirror->mirror_ds->ds_versions[0].rsize = max_payload; - if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) - mirror->mirror_ds->ds_versions[0].wsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload; goto out; } noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, + mirror, dss_id, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); ff_layout_send_layouterror(lseg); @@ -426,12 +430,13 @@ out: const struct cred * ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred) + const struct cred *mdscred, + u32 dss_id) { const struct cred *cred; - if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { - cred = ff_layout_get_mirror_cred(mirror, range->iomode); + if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) { + cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id); if (!cred) cred = get_cred(mdscred); } else { @@ -445,15 +450,17 @@ ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, * @mirror: pointer to the mirror * @ds_clp: nfs_client for the DS * @inode: pointer to inode + * @dss_id: DS stripe id * * Find or create a DS rpc client with th MDS server rpc client auth flavor * in the nfs_client cl_ds_clients list. 
*/ struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, - struct nfs_client *ds_clp, struct inode *inode) + struct nfs_client *ds_clp, struct inode *inode, + u32 dss_id) { - switch (mirror->mirror_ds->ds_versions[0].version) { + switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ return ds_clp->cl_rpcclient; @@ -559,16 +566,18 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror) { - if (!mirror->mirror_ds) + if (!mirror) + continue; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (!mirror->dss[dss_id].mirror_ds) return true; - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) continue; - devid = &mirror->mirror_ds->id_node; + devid = &mirror->dss[dss_id].mirror_ds->id_node; if (!nfs4_test_deviceid_unavailable(devid)) return true; } @@ -581,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || IS_ERR(mirror->mirror_ds)) - return false; - if (!mirror->mirror_ds) - continue; - devid = &mirror->mirror_ds->id_node; - if (nfs4_test_deviceid_unavailable(devid)) + if (!mirror) return false; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) + return false; + if (!mirror->dss[dss_id].mirror_ds) + continue; + devid = &mirror->dss[dss_id].mirror_ds->id_node; + if (nfs4_test_deviceid_unavailable(devid)) + return false; + } } return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 9e94d18448ff..b4679b7161b0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1269,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, int ret; data->context[NFS_MAX_CONTEXT_LEN] = '\0'; - ret = vfs_parse_fs_string(fc, "context", - data->context, strlen(data->context)); + ret = vfs_parse_fs_string(fc, "context", data->context); if (ret < 0) return ret; #else diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9bdaf7f38bed..18b57c7c2f97 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1073,6 +1073,21 @@ out_no_revalidate: if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; stat->btime = NFS_I(inode)->btime; + + /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN + * - NFS doesn't have DIO alignment constraints, avoid getting + * these DIO attrs from remote and just respond with most + * accommodating limits (so client will issue supported DIO). + * - this is unintuitive, but the most coarse-grained + * dio_offset_align is the most accommodating. 
+ */ + if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) && + S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = 4; /* 4-byte alignment */ + stat->dio_offset_align = PAGE_SIZE; + stat->dio_read_offset_align = stat->dio_offset_align; + } out: trace_nfs_getattr_exit(inode, err); return err; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c0a44f389f8f..2ecd38e1d17a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -456,6 +456,16 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ +struct nfs_local_dio { + u32 mem_align; + u32 offset_align; + loff_t middle_offset; + loff_t end_offset; + ssize_t start_len; /* Length for misaligned first extent */ + ssize_t middle_len; /* Length for DIO-aligned middle extent */ + ssize_t end_len; /* Length for misaligned last extent */ +}; + extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 97abf62f109d..2c0455e91571 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -30,6 +30,8 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#define NFSLOCAL_MAX_IOS 3 + struct nfs_local_kiocb { struct kiocb kiocb; struct bio_vec *bvec; @@ -37,6 +39,14 @@ struct nfs_local_kiocb { struct work_struct work; void (*aio_complete_work)(struct work_struct *); struct nfsd_file *localio; + /* Begin mostly DIO-specific members */ + size_t end_len; + short int end_iter_index; + short int n_iters; + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; + loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; + struct iov_iter iters[NFSLOCAL_MAX_IOS]; + /* End mostly DIO-specific members */ }; struct nfs_local_fsync_ctx { @@ -49,11 +59,6 @@ struct nfs_local_fsync_ctx { static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); -static bool localio_O_DIRECT_semantics __read_mostly = false; -module_param(localio_O_DIRECT_semantics, bool, 0644); -MODULE_PARM_DESC(localio_O_DIRECT_semantics, - "LOCALIO will use O_DIRECT semantics to filesystem."); - static inline bool nfs_client_is_local(const struct nfs_client *clp) { return !!rcu_access_pointer(clp->cl_uuid.net); @@ -231,13 +236,13 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfsd_file __rcu **pnf, const fmode_t mode) { + int status = 0; struct nfsd_file *localio; localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, cred, fh, nfl, pnf, mode); if (IS_ERR(localio)) { - int status = PTR_ERR(localio); - trace_nfs_local_open_fh(fh, mode, status); + status = PTR_ERR(localio); switch (status) { case -ENOMEM: case -ENXIO: @@ -247,6 +252,7 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, nfs_local_probe(clp); } } + trace_nfs_local_open_fh(fh, mode, status); return localio; } @@ -281,23 +287,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, } EXPORT_SYMBOL_GPL(nfs_local_open_fh); -static struct bio_vec * -nfs_bvec_alloc_and_import_pagevec(struct page **pagevec, - unsigned int npages, gfp_t flags) -{ - struct bio_vec *bvec, *p; - - bvec = kmalloc_array(npages, sizeof(*bvec), flags); - if (bvec != NULL) { - for (p = bvec; npages > 0; p++, pagevec++, npages--) { - p->bv_page = *pagevec; - p->bv_len = PAGE_SIZE; - p->bv_offset = 0; - } - } - return bvec; -} - static void nfs_local_iocb_free(struct 
nfs_local_kiocb *iocb) { @@ -311,40 +300,191 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, { struct nfs_local_kiocb *iocb; - iocb = kmalloc(sizeof(*iocb), flags); + iocb = kzalloc(sizeof(*iocb), flags); if (iocb == NULL) return NULL; - iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec, - hdr->page_array.npages, flags); + + iocb->bvec = kmalloc_array(hdr->page_array.npages, + sizeof(struct bio_vec), flags); if (iocb->bvec == NULL) { kfree(iocb); return NULL; } - if (localio_O_DIRECT_semantics && - test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { - iocb->kiocb.ki_filp = file; - iocb->kiocb.ki_flags = IOCB_DIRECT; - } else - init_sync_kiocb(&iocb->kiocb, file); + init_sync_kiocb(&iocb->kiocb, file); - iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; iocb->aio_complete_work = NULL; + iocb->end_iter_index = -1; + return iocb; } -static void -nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) +static bool +nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw, + size_t len, struct nfs_local_dio *local_dio) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + loff_t offset = hdr->args.offset; + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + loff_t start_end, orig_end, middle_end; + + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, &nf_dio_read_offset_align); + if (rw == ITER_DEST) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align)) + return false; + if (unlikely(nf_dio_offset_align > PAGE_SIZE)) + return false; + if (unlikely(len < nf_dio_offset_align)) + return false; + + local_dio->mem_align = nf_dio_mem_align; + local_dio->offset_align = nf_dio_offset_align; + + start_end = round_up(offset, nf_dio_offset_align); + orig_end = offset + len; + middle_end = round_down(orig_end, nf_dio_offset_align); + + local_dio->middle_offset = start_end; + local_dio->end_offset = middle_end; + + local_dio->start_len = start_end - offset; + local_dio->middle_len = middle_end - start_end; + local_dio->end_len = orig_end - middle_end; + + if (rw == ITER_DEST) + trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio); + else + trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio); + return true; +} + +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if (size & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + +/* + * Setup as many as 3 iov_iter based on extents described by @local_dio. + * Returns the number of iov_iter that were setup. + */ +static int +nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, + unsigned int nvecs, size_t len, + struct nfs_local_dio *local_dio) +{ + int n_iters = 0; + struct iov_iter *iters = iocb->iters; + + /* Setup misaligned start? */ + if (local_dio->start_len) { + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + iters[n_iters].count = local_dio->start_len; + iocb->offset[n_iters] = iocb->hdr->args.offset; + iocb->iter_is_dio_aligned[n_iters] = false; + ++n_iters; + } + + /* Setup misaligned end? 
+ * If so, the end is purposely setup to be issued using buffered IO + * before the middle (which will use DIO, if DIO-aligned, with AIO). + * This creates problems if/when the end results in a partial write. + * So must save index and length of end to handle this corner case. + */ + if (local_dio->end_len) { + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + iocb->offset[n_iters] = local_dio->end_offset; + iov_iter_advance(&iters[n_iters], + local_dio->start_len + local_dio->middle_len); + iocb->iter_is_dio_aligned[n_iters] = false; + /* Save index and length of end */ + iocb->end_iter_index = n_iters; + iocb->end_len = local_dio->end_len; + ++n_iters; + } + + /* Setup DIO-aligned middle to be issued last, to allow for + * DIO with AIO completion (see nfs_local_call_{read,write}). + */ + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + if (local_dio->start_len) + iov_iter_advance(&iters[n_iters], local_dio->start_len); + iters[n_iters].count -= local_dio->end_len; + iocb->offset[n_iters] = local_dio->middle_offset; + + iocb->iter_is_dio_aligned[n_iters] = + nfs_iov_iter_aligned_bvec(&iters[n_iters], + local_dio->mem_align-1, local_dio->offset_align-1); + + if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { + trace_nfs_local_dio_misaligned(iocb->hdr->inode, + iocb->hdr->args.offset, len, local_dio); + return 0; /* no DIO-aligned IO possible */ + } + ++n_iters; + + iocb->n_iters = n_iters; + return n_iters; +} + +static noinline_for_stack void +nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) { struct nfs_pgio_header *hdr = iocb->hdr; + struct page **pagevec = hdr->page_array.pagevec; + unsigned long v, total; + unsigned int base; + size_t len; + + v = 0; + total = hdr->args.count; + base = hdr->args.pgbase; + while (total && v < hdr->page_array.npages) { + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&iocb->bvec[v], *pagevec, len, base); + total -= len; + ++pagevec; + ++v; + base = 0; + } + len = hdr->args.count - total; + + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { + struct nfs_local_dio local_dio; + + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) + return; /* is DIO-aligned */ + } - iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages, - hdr->args.count + hdr->args.pgbase); - if (hdr->args.pgbase != 0) - iov_iter_advance(i, hdr->args.pgbase); + /* Use buffered IO */ + iocb->offset[0] = hdr->args.offset; + iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); + iocb->n_iters = 1; } static void @@ -367,10 +507,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr, static void nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) { + /* Must handle partial completions */ if (status >= 0) { - hdr->res.count = status; - hdr->res.op_status = NFS4_OK; - hdr->task.tk_status = 0; + hdr->res.count += status; + /* @hdr was initialized to 0 (zeroed during allocation) */ + if (hdr->task.tk_status == 0) + hdr->res.op_status = NFS4_OK; } else { hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; @@ -378,12 +520,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) } static void +nfs_local_iocb_release(struct nfs_local_kiocb *iocb) +{ + nfs_local_file_put(iocb->localio); + nfs_local_iocb_free(iocb); +} + +static void nfs_local_pgio_release(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - nfs_local_file_put(iocb->localio); - nfs_local_iocb_free(iocb); + 
nfs_local_iocb_release(iocb); nfs_local_hdr_release(hdr, hdr->task.tk_ops); } @@ -405,7 +553,10 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; - nfs_local_pgio_done(hdr, status); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } /* * Must clear replen otherwise NFSv3 data corruption will occur @@ -434,6 +585,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); + nfs_local_pgio_done(iocb->hdr, ret); nfs_local_read_done(iocb, ret); nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } @@ -444,14 +596,25 @@ static void nfs_local_call_read(struct work_struct *work) container_of(work, struct nfs_local_kiocb, work); struct file *filp = iocb->kiocb.ki_filp; const struct cred *save_cred; - struct iov_iter iter; ssize_t status; save_cred = override_creds(filp->f_cred); - nfs_local_iter_init(&iter, iocb, READ); + for (int i = 0; i < iocb->n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } - status = filp->f_op->read_iter(&iocb->kiocb, &iter); + iocb->kiocb.ki_pos = iocb->offset[i]; + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { + nfs_local_pgio_done(iocb->hdr, status); + if (iocb->hdr->task.tk_status) + break; + } + } revert_creds(save_cred); @@ -462,33 +625,17 @@ static void nfs_local_call_read(struct work_struct *work) } static int -nfs_do_local_read(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, +nfs_local_do_read(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) { - struct nfs_local_kiocb *iocb; - struct file *file = nfs_to->nfsd_file_file(localio); - - /* Don't support filesystems without read_iter */ - if (!file->f_op->read_iter) - return -EAGAIN; + struct nfs_pgio_header *hdr = iocb->hdr; dprintk("%s: vfs_read count=%u pos=%llu\n", __func__, hdr->args.count, hdr->args.offset); - iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL); - if (iocb == NULL) - return -ENOMEM; - iocb->localio = localio; - nfs_local_pgio_init(hdr, call_ops); hdr->res.eof = false; - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; - iocb->aio_complete_work = nfs_local_read_aio_complete_work; - } - INIT_WORK(&iocb->work, nfs_local_call_read); queue_work(nfslocaliod_workqueue, &iocb->work); @@ -529,7 +676,7 @@ nfs_set_local_verifier(struct inode *inode, } /* Factored out from fs/nfsd/vfs.h:fh_getattr() */ -static int __vfs_getattr(struct path *p, struct kstat *stat, int version) +static int __vfs_getattr(const struct path *p, struct kstat *stat, int version) { u32 request_mask = STATX_BASIC_STATS; @@ -597,7 +744,13 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. 
*/ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + /* Handle short writes as if they are ENOSPC */ + status = hdr->res.count; if (status > 0 && status < hdr->args.count) { hdr->mds_offset += status; hdr->args.offset += status; @@ -605,11 +758,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) hdr->args.count -= status; nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); status = -ENOSPC; + /* record -ENOSPC in terms of nfs_local_pgio_done */ + nfs_local_pgio_done(hdr, status); } - if (status < 0) + if (hdr->task.tk_status < 0) nfs_reset_boot_verifier(inode); - - nfs_local_pgio_done(hdr, status); } static void nfs_local_write_aio_complete_work(struct work_struct *work) @@ -626,6 +779,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); + nfs_local_pgio_done(iocb->hdr, ret); nfs_local_write_done(iocb, ret); nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } @@ -637,16 +791,53 @@ static void nfs_local_call_write(struct work_struct *work) struct file *filp = iocb->kiocb.ki_filp; unsigned long old_flags = current->flags; const struct cred *save_cred; - struct iov_iter iter; ssize_t status; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; save_cred = override_creds(filp->f_cred); - nfs_local_iter_init(&iter, iocb, WRITE); - file_start_write(filp); - status = filp->f_op->write_iter(&iocb->kiocb, &iter); + for (int i = 0; i < iocb->n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } +retry: + iocb->kiocb.ki_pos = iocb->offset[i]; + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) { + /* partial write */ + if (i == iocb->end_iter_index) { + /* Must not account partial end, otherwise, due + * to end being issued before middle: the partial + * write accounting in nfs_local_write_done() + * would incorrectly advance hdr->args.offset + */ + status = 0; + } else { + /* Partial write at start or buffered middle, + * exit early. + */ + nfs_local_pgio_done(iocb->hdr, status); + break; + } + } else if (unlikely(status == -ENOTBLK && + (iocb->kiocb.ki_flags & IOCB_DIRECT))) { + /* VFS will return -ENOTBLK if DIO WRITE fails to + * invalidate the page cache. Retry using buffered IO. + */ + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + iocb->kiocb.ki_complete = NULL; + iocb->aio_complete_work = NULL; + goto retry; + } + nfs_local_pgio_done(iocb->hdr, status); + if (iocb->hdr->task.tk_status) + break; + } + } file_end_write(filp); revert_creds(save_cred); @@ -660,26 +851,15 @@ static void nfs_local_call_write(struct work_struct *work) } static int -nfs_do_local_write(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, +nfs_local_do_write(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) { - struct nfs_local_kiocb *iocb; - struct file *file = nfs_to->nfsd_file_file(localio); - - /* Don't support filesystems without write_iter */ - if (!file->f_op->write_iter) - return -EAGAIN; + struct nfs_pgio_header *hdr = iocb->hdr; dprintk("%s: vfs_write count=%u pos=%llu %s\n", __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ? 
"unstable" : "stable"); - iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO); - if (iocb == NULL) - return -ENOMEM; - iocb->localio = localio; - switch (hdr->args.stable) { default: break; @@ -694,43 +874,74 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable); - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; - iocb->aio_complete_work = nfs_local_write_aio_complete_work; - } - INIT_WORK(&iocb->work, nfs_local_call_write); queue_work(nfslocaliod_workqueue, &iocb->work); return 0; } +static struct nfs_local_kiocb * +nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio) +{ + struct file *file = nfs_to->nfsd_file_file(localio); + struct nfs_local_kiocb *iocb; + gfp_t gfp_mask; + int rw; + + if (hdr->rw_mode & FMODE_READ) { + if (!file->f_op->read_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_KERNEL; + rw = ITER_DEST; + } else { + if (!file->f_op->write_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_NOIO; + rw = ITER_SOURCE; + } + + iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask); + if (iocb == NULL) + return ERR_PTR(-ENOMEM); + iocb->hdr = hdr; + iocb->localio = localio; + + nfs_local_iters_init(iocb, rw); + + return iocb; +} + int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { + struct nfs_local_kiocb *iocb; int status = 0; if (!hdr->args.count) return 0; + iocb = nfs_local_iocb_init(hdr, localio); + if (IS_ERR(iocb)) + return PTR_ERR(iocb); + switch (hdr->rw_mode) { case FMODE_READ: - status = nfs_do_local_read(hdr, localio, call_ops); + status = nfs_local_do_read(iocb, call_ops); break; case FMODE_WRITE: - status = nfs_do_local_write(hdr, localio, call_ops); + status = nfs_local_do_write(iocb, call_ops); break; default: dprintk("%s: invalid mode: %d\n", __func__, hdr->rw_mode); - status = -EINVAL; + status = -EOPNOTSUPP; } if (status != 0) { if (status == -EAGAIN) nfs_localio_disable_client(clp); - nfs_local_file_put(localio); + nfs_local_iocb_release(iocb); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index f9a3a1fbf44c..5a4d193da1a9 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -290,7 +290,8 @@ int nfs_do_submount(struct fs_context *fc) nfs_errorf(fc, "NFS: Couldn't determine submount pathname"); ret = PTR_ERR(p); } else { - ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(p, buffer + 4096 - p)); if (!ret) ret = vfs_get_tree(fc); } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 6e75c6c2d234..9eff09158518 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -23,8 +23,8 @@ #include <linux/nfs2.h> #include <linux/nfs_fs.h> #include <linux/nfs_common.h> -#include "nfstrace.h" #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 4ae01c10b7e2..e17d72908412 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -23,8 +23,8 @@ #include <linux/nfsacl.h> #include <linux/nfs_common.h> -#include "nfstrace.h" #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6a0b5871ba3b..d537fb0c230e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1514,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void 
*buf, ret = -ENOMEM; - res.scratch = alloc_page(GFP_KERNEL); + res.scratch = folio_alloc(GFP_KERNEL, 0); if (!res.scratch) goto out; @@ -1552,7 +1552,7 @@ out_free_pages: } kfree(pages); out_free_scratch: - __free_page(res.scratch); + folio_put(res.scratch); out: return ret; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 4cc915d5741d..e10d83ba835e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1781,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_page(xdr, res->scratch); + xdr_set_scratch_folio(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index c9a0d1e420c6..7f43e890d356 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -456,4 +456,5 @@ const struct file_operations nfs4_file_operations = { #else .llseek = nfs_file_llseek, #endif + .fop_flags = FOP_DONTCACHE, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ce61253efd45..f58098417142 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -391,7 +391,9 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = htonl(attrs); /* bitmap */ *p++ = htonl(12); /* attribute buffer length */ *p++ = htonl(NF4DIR); + spin_lock(&dentry->d_lock); p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); + spin_unlock(&dentry->d_lock); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -6160,7 +6162,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, } /* for decoding across pages */ - res.acl_scratch = alloc_page(GFP_KERNEL); + res.acl_scratch = folio_alloc(GFP_KERNEL, 0); if (!res.acl_scratch) goto out_free; @@ -6196,7 +6198,7 @@ out_free: while (--i >= 0) __free_page(pages[i]); if (res.acl_scratch) - __free_page(res.acl_scratch); + folio_put(res.acl_scratch); kfree(pages); return ret; } @@ -7872,10 +7874,10 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, return err; do { err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE) break; ssleep(1); - } while (err == -NFS4ERR_DELAY); + } while (err == -NFS4ERR_DELAY || err == -NFSERR_GRACE); return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } @@ -9442,7 +9444,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; - if (rcvd->max_resp_sz < sent->max_resp_sz) + if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7612e977e80b..01179f7de322 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2744,6 +2744,9 @@ out_error: case -ENETUNREACH: nfs_mark_client_ready(clp, -EIO); break; + case -EINVAL: + nfs_mark_client_ready(clp, status); + break; default: ssleep(1); break; diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index b29a26923ce0..5ec9c83f1ef0 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -149,21 +149,9 @@ static int do_nfs4_mount(struct nfs_server *server, struct fs_context *root_fc; struct vfsmount *root_mnt; struct dentry *dentry; - size_t len; + char *source; int ret; - struct fs_parameter param = { - .key = "source", - .type = fs_value_is_string, - .dirfd = -1, - }; - - struct fs_parameter param_fsc = { - .key 
= "fsc", - .type = fs_value_is_string, - .dirfd = -1, - }; - if (IS_ERR(server)) return PTR_ERR(server); @@ -181,15 +169,7 @@ static int do_nfs4_mount(struct nfs_server *server, root_ctx->server = server; if (ctx->fscache_uniq) { - len = strlen(ctx->fscache_uniq); - param_fsc.size = len; - param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL); - if (param_fsc.string == NULL) { - put_fs_context(root_fc); - return -ENOMEM; - } - ret = vfs_parse_fs_param(root_fc, ¶m_fsc); - kfree(param_fsc.string); + ret = vfs_parse_fs_string(root_fc, "fsc", ctx->fscache_uniq); if (ret < 0) { put_fs_context(root_fc); return ret; @@ -197,20 +177,18 @@ static int do_nfs4_mount(struct nfs_server *server, } /* We leave export_path unset as it's not used to find the root. */ - len = strlen(hostname) + 5; - param.string = kmalloc(len, GFP_KERNEL); - if (param.string == NULL) { - put_fs_context(root_fc); - return -ENOMEM; - } - /* Does hostname needs to be enclosed in brackets? */ if (strchr(hostname, ':')) - param.size = snprintf(param.string, len, "[%s]:/", hostname); + source = kasprintf(GFP_KERNEL, "[%s]:/", hostname); else - param.size = snprintf(param.string, len, "%s:/", hostname); - ret = vfs_parse_fs_param(root_fc, ¶m); - kfree(param.string); + source = kasprintf(GFP_KERNEL, "%s:/", hostname); + + if (!source) { + put_fs_context(root_fc); + return -ENOMEM; + } + ret = vfs_parse_fs_string(root_fc, "source", source); + kfree(source); if (ret < 0) { put_fs_context(root_fc); return ret; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 49ff98571fa5..1d0e6c10f921 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4930,7 +4930,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, } /* - * The prefered block size for layout directed io + * The preferred block size for layout directed io */ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -6585,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; if (res->acl_scratch != NULL) - xdr_set_scratch_page(xdr, res->acl_scratch); + xdr_set_scratch_folio(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 627115179795..6ce55e8e6b67 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -45,6 +45,23 @@ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ { BIT(NFS_INO_ODIRECT), "ODIRECT" }) +#define nfs_show_wb_flags(v) \ + __print_flags(v, "|", \ + { BIT(PG_BUSY), "BUSY" }, \ + { BIT(PG_MAPPED), "MAPPED" }, \ + { BIT(PG_FOLIO), "FOLIO" }, \ + { BIT(PG_CLEAN), "CLEAN" }, \ + { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \ + { BIT(PG_INODE_REF), "INODE_REF" }, \ + { BIT(PG_HEADLOCK), "HEADLOCK" }, \ + { BIT(PG_TEARDOWN), "TEARDOWN" }, \ + { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \ + { BIT(PG_UPTODATE), "UPTODATE" }, \ + { BIT(PG_WB_END), "WB_END" }, \ + { BIT(PG_REMOVE), "REMOVE" }, \ + { BIT(PG_CONTENDED1), "CONTENDED1" }, \ + { BIT(PG_CONTENDED2), "CONTENDED2" }) + DECLARE_EVENT_CLASS(nfs_inode_event, TP_PROTO( const struct inode *inode @@ -967,7 +984,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = offset, + __entry->offset = offset; __entry->count = count; ), @@ -1017,8 +1034,8 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); 
__entry->version = inode_peek_iversion_raw(inode); - __entry->offset = offset, - __entry->count = count, + __entry->offset = offset; + __entry->count = count; __entry->ret = ret; ), @@ -1051,6 +1068,73 @@ DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); +DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_update_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_begin); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_end); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writepages); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done); + +DECLARE_EVENT_CLASS(nfs_kiocb_event, + TP_PROTO( + const struct kiocb *iocb, + const struct iov_iter *iter + ), + + TP_ARGS(iocb, iter), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = file_inode(iocb->ki_filp); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->flags = iocb->ki_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->count, + __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS) + ) +); + +#define DEFINE_NFS_KIOCB_EVENT(name) \ + DEFINE_EVENT(nfs_kiocb_event, name, \ + TP_PROTO( \ + const struct kiocb *iocb, \ + const struct iov_iter *iter \ + ), \ + TP_ARGS(iocb, iter)) + +DEFINE_NFS_KIOCB_EVENT(nfs_file_read); +DEFINE_NFS_KIOCB_EVENT(nfs_file_write); + TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, @@ -1398,6 +1482,55 @@ TRACE_EVENT(nfs_writeback_done, ) ); +DECLARE_EVENT_CLASS(nfs_page_class, + TP_PROTO( + const struct nfs_page *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(const struct nfs_page *__private, req) + __field(loff_t, offset) + __field(unsigned int, count) + __field(unsigned long, flags) + ), + + TP_fast_assign( + const struct inode *inode = folio_inode(req->wb_folio); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->req = req; + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; + __entry->flags = req->wb_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->req, __entry->offset, __entry->count, + nfs_show_wb_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_PAGE_EVENT(name) \ + DEFINE_EVENT(nfs_page_class, name, \ + TP_PROTO( \ + const struct nfs_page *req \ + ), \ + TP_ARGS(req)) + +DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup); 
+DEFINE_NFS_PAGE_EVENT(nfs_do_writepage); + DECLARE_EVENT_CLASS(nfs_page_error_class, TP_PROTO( const struct inode *inode, @@ -1599,6 +1732,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +DECLARE_EVENT_CLASS(nfs_local_dio_class, + TP_PROTO( + const struct inode *inode, + loff_t offset, + ssize_t count, + const struct nfs_local_dio *local_dio + ), + TP_ARGS(inode, offset, count, local_dio), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, offset) + __field(ssize_t, count) + __field(u32, mem_align) + __field(u32, offset_align) + __field(loff_t, start) + __field(ssize_t, start_len) + __field(loff_t, middle) + __field(ssize_t, middle_len) + __field(loff_t, end) + __field(ssize_t, end_len) + ), + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->offset = offset; + __entry->count = count; + __entry->mem_align = local_dio->mem_align; + __entry->offset_align = local_dio->offset_align; + __entry->start = offset; + __entry->start_len = local_dio->start_len; + __entry->middle = local_dio->middle_offset; + __entry->middle_len = local_dio->middle_len; + __entry->end = local_dio->end_offset; + __entry->end_len = local_dio->end_len; + ), + TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd " + "mem_align=%u offset_align=%u " + "start=%llu+%zd middle=%llu+%zd end=%llu+%zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, __entry->count, + __entry->mem_align, __entry->offset_align, + __entry->start, __entry->start_len, + __entry->middle, __entry->middle_len, + __entry->end, __entry->end_len) +) + +#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \ +DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \ + TP_PROTO(const struct inode *inode, \ + loff_t offset, \ + ssize_t count, \ + const struct nfs_local_dio *local_dio),\ + TP_ARGS(inode, offset, count, local_dio)) + +DEFINE_NFS_LOCAL_DIO_EVENT(read); +DEFINE_NFS_LOCAL_DIO_EVENT(write); +DEFINE_NFS_LOCAL_DIO_EVENT(misaligned); + +#endif /* CONFIG_NFS_LOCALIO */ + TRACE_EVENT(nfs_fh_to_dentry, TP_PROTO( const struct super_block *sb, @@ -1713,10 +1916,10 @@ TRACE_EVENT(nfs_local_open_fh, ), TP_printk( - "error=%d fhandle=0x%08x mode=%s", - __entry->error, + "fhandle=0x%08x mode=%s result=%d", __entry->fhandle, - show_fs_fmode_flags(__entry->fmode) + show_fs_fmode_flags(__entry->fmode), + __entry->error ) ); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 647c53d1418a..0fb6905736d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -296,7 +296,7 @@ static void nfs_folio_end_writeback(struct folio *folio) { struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); - folio_end_writeback(folio); + folio_end_writeback_no_dropbehind(folio); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) { nfss->write_congested = 0; @@ -593,6 +593,7 @@ static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_do_writepage(req); nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); @@ -656,12 +657,14 @@ int nfs_writepages(struct address_space *mapping, 
struct writeback_control *wbc) int priority = 0; int err; + trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start); + /* Wait with writeback until write congestion eases */ if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) { err = wait_event_killable(nfss->write_congestion_wait, nfss->write_congested == 0); if (err) - return err; + goto out_err; } nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); @@ -692,10 +695,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); - if (err < 0) - goto out_err; - return 0; + if (err > 0) + err = 0; out_err: + trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err); return err; } @@ -745,6 +748,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->i_private_lock); + + folio_end_dropbehind(folio); } nfs_page_group_unlock(req); @@ -926,7 +931,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, - hdr->pgio_mirror_idx); + hdr->ds_commit_idx); goto next; } remove_req: @@ -1017,11 +1022,12 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, unsigned int end; int error; + trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes); end = offset + bytes; req = nfs_lock_and_join_requests(folio); if (IS_ERR_OR_NULL(req)) - return req; + goto out; rqend = req->wb_offset + req->wb_bytes; /* @@ -1043,6 +1049,9 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, else req->wb_bytes = rqend - req->wb_offset; req->wb_nio = 0; +out: + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, + PTR_ERR_OR_ZERO(req)); return req; out_flushme: /* @@ -1053,6 +1062,7 @@ out_flushme: nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); error = nfs_wb_folio(folio->mapping->host, folio); + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error); return (error < 0) ? 
ERR_PTR(error) : NULL; } @@ -1090,6 +1100,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, req = nfs_setup_write_request(ctx, folio, offset, count); if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_writepage_setup(req); /* Update file length */ nfs_grow_file(folio, offset, count); nfs_mark_uptodate(req); @@ -1290,6 +1301,8 @@ int nfs_update_folio(struct file *file, struct folio *folio, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + trace_nfs_update_folio(inode, offset, count); + dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, (long long)(folio_pos(folio) + offset)); @@ -1309,6 +1322,7 @@ int nfs_update_folio(struct file *file, struct folio *folio, if (status < 0) nfs_set_pageerror(mapping); out: + trace_nfs_update_folio_done(inode, offset, count, status); dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; @@ -1806,7 +1820,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } - dprintk_cont(", error = %d\n", status); + dprintk(", error = %d\n", status); goto next; } @@ -1816,11 +1830,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a match */ if (folio) nfs_inode_remove_request(req); - dprintk_cont(" OK\n"); + dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk_cont(" mismatch\n"); + dprintk(" mismatch\n"); nfs_mark_request_dirty(req); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cadfc2bae60e..caa695c06efb 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -402,7 +402,7 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct path *path, int *flags, unsigned char *uuid) +static int check_export(const struct path *path, int *flags, unsigned char *uuid) { struct inode *inode = d_inode(path->dentry); @@ -1181,7 +1181,7 @@ denied: * use exp_get_by_name() or exp_find(). 
*/ struct svc_export * -rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +rqst_exp_get_by_name(struct svc_rqst *rqstp, const struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index b9c0adb3ce09..cb36e6cce829 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -111,7 +111,7 @@ int nfsd_export_init(struct net *); void nfsd_export_shutdown(struct net *); void nfsd_export_flush(struct net *); struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, - struct path *); + const struct path *); struct svc_export * rqst_exp_parent(struct svc_rqst *, struct path *); struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 85ca663c052c..e010d90aeb27 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, refcount_set(&nf->nf_ref, 1); nf->nf_may = need; nf->nf_mark = NULL; + nf->nf_dio_mem_align = 0; + nf->nf_dio_offset_align = 0; + nf->nf_dio_read_offset_align = 0; return nf; } @@ -1070,6 +1073,35 @@ nfsd_file_is_cached(struct inode *inode) } static __be32 +nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + struct kstat stat; + __be32 status; + + /* Currently only need to get DIO alignment info for regular files */ + if (!S_ISREG(inode->i_mode)) + return nfs_ok; + + status = fh_getattr(fhp, &stat); + if (status != nfs_ok) + return status; + + trace_nfsd_file_get_dio_attrs(inode, &stat); + + if (stat.result_mask & STATX_DIOALIGN) { + nf->nf_dio_mem_align = stat.dio_mem_align; + nf->nf_dio_offset_align = stat.dio_offset_align; + } + if (stat.result_mask & STATX_DIO_READ_ALIGN) + nf->nf_dio_read_offset_align = stat.dio_read_offset_align; + else + nf->nf_dio_read_offset_align = nf->nf_dio_offset_align; + + return nfs_ok; +} + +static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, struct svc_cred *cred, struct auth_domain *client, @@ -1187,6 +1219,8 @@ open_file: } status = nfserrno(ret); trace_nfsd_file_open(nf, status); + if (status == nfs_ok) + status = nfsd_file_get_dio_attrs(fhp, nf); } } else status = nfserr_jukebox; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 722b26c71e45..237a05c74211 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -54,6 +54,10 @@ struct nfsd_file { struct list_head nf_gc; struct rcu_head nf_rcu; ktime_t nf_birthtime; + + u32 nf_dio_mem_align; + u32 nf_dio_offset_align; + u32 nf_dio_read_offset_align; }; int nfsd_file_cache_init(void); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..9e0a37cd29d8 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -117,6 +117,16 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, return localio; } +static void nfsd_file_dio_alignment(struct nfsd_file *nf, + u32 *nf_dio_mem_align, + u32 *nf_dio_offset_align, + u32 *nf_dio_read_offset_align) +{ + *nf_dio_mem_align = nf->nf_dio_mem_align; + *nf_dio_offset_align = nf->nf_dio_offset_align; + *nf_dio_read_offset_align = nf->nf_dio_read_offset_align; +} + static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_try_get = nfsd_net_try_get, .nfsd_net_put = nfsd_net_put, @@ -124,6 +134,7 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_file_put_local = nfsd_file_put_local, 
.nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, + .nfsd_file_dio_alignment = nfsd_file_dio_alignment, }; void nfsd_localio_ops_init(void) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..2b79129703d5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1103,89 +1103,48 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) * populating the filesystem. */ -/* Basically copying rpc_get_inode. */ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); - if (!inode) - return NULL; - /* Following advice from simple_fill_super documentation: */ - inode->i_ino = iunique(sb, NFSD_MaxReserved); - inode->i_mode = mode; - simple_inode_init_ts(inode); - switch (mode & S_IFMT) { - case S_IFDIR: - inode->i_fop = &simple_dir_operations; - inode->i_op = &simple_dir_inode_operations; - inc_nlink(inode); - break; - case S_IFLNK: - inode->i_op = &simple_symlink_inode_operations; - break; - default: - break; + if (inode) { + /* Following advice from simple_fill_super documentation: */ + inode->i_ino = iunique(sb, NFSD_MaxReserved); + inode->i_mode = mode; + simple_inode_init_ts(inode); } return inode; } -static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl) +static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name) { + struct inode *dir = parent->d_inode; + struct dentry *dentry; struct inode *inode; - inode = nfsd_get_inode(dir->i_sb, mode); + inode = nfsd_get_inode(parent->d_sb, S_IFDIR | 0600); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + iput(inode); + return dentry; + } + inode->i_fop = &simple_dir_operations; + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); if (ncl) { inode->i_private = ncl; kref_get(&ncl->cl_ref); } - d_add(dentry, inode); + d_instantiate(dentry, inode); inc_nlink(dir); fsnotify_mkdir(dir, dentry); - return 0; -} - -static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name) -{ - struct inode *dir = parent->d_inode; - struct dentry *dentry; - int ret = -ENOMEM; - - inode_lock(dir); - dentry = d_alloc_name(parent, name); - if (!dentry) - goto out_err; - ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl); - if (ret) - goto out_err; -out: inode_unlock(dir); return dentry; -out_err: - dput(dentry); - dentry = ERR_PTR(ret); - goto out; } #if IS_ENABLED(CONFIG_SUNRPC_GSS) -static int __nfsd_symlink(struct inode *dir, struct dentry *dentry, - umode_t mode, const char *content) -{ - struct inode *inode; - - inode = nfsd_get_inode(dir->i_sb, mode); - if (!inode) - return -ENOMEM; - - inode->i_link = (char *)content; - inode->i_size = strlen(content); - - d_add(dentry, inode); - inc_nlink(dir); - fsnotify_create(dir, dentry); - return 0; -} - /* * @content is assumed to be a NUL-terminated string that lives * longer than the symlink itself. 
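[Editor's aside — illustrative sketch, not part of the patch] The nfsd_mkdir() rewrite in the hunk above shows the simple_start_creating() pattern this series converts nfsctl to: allocate and fill the inode first, let the helper lock the parent and hand back a hashed negative dentry, then instantiate and unlock. A minimal self-contained variant, where demo_create_file() and its fops parameter are hypothetical names:

static struct dentry *demo_create_file(struct dentry *parent, const char *name,
				       const struct file_operations *fops)
{
	struct inode *inode = nfsd_get_inode(parent->d_sb, S_IFREG | 0600);
	struct dentry *dentry;

	if (!inode)
		return ERR_PTR(-ENOMEM);

	dentry = simple_start_creating(parent, name);	/* locks the parent dir */
	if (IS_ERR(dentry)) {
		iput(inode);	/* nothing was instantiated; drop the inode */
		return dentry;
	}

	inode->i_fop = fops;
	d_instantiate(dentry, inode);
	fsnotify_create(d_inode(parent), dentry);
	inode_unlock(d_inode(parent));	/* taken by simple_start_creating() */
	return dentry;
}

Allocating the inode before taking the lock keeps the failure handling to a single iput(), mirroring the ordering in the hunks above.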
@@ -1194,17 +1153,25 @@ static void _nfsd_symlink(struct dentry *parent, const char *name, const char *content) { struct inode *dir = parent->d_inode; + struct inode *inode; struct dentry *dentry; - int ret; - inode_lock(dir); - dentry = d_alloc_name(parent, name); - if (!dentry) - goto out; - ret = __nfsd_symlink(d_inode(parent), dentry, S_IFLNK | 0777, content); - if (ret) - dput(dentry); -out: + inode = nfsd_get_inode(dir->i_sb, S_IFLNK | 0777); + if (!inode) + return; + + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + iput(inode); + return; + } + + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)content; + inode->i_size = strlen(content); + + d_instantiate(dentry, inode); + fsnotify_create(dir, dentry); inode_unlock(dir); } #else @@ -1240,40 +1207,34 @@ struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ -static int nfsdfs_create_files(struct dentry *root, +static int nfsdfs_create_files(struct dentry *root, const struct tree_descr *files, struct nfsdfs_client *ncl, struct dentry **fdentries) { struct inode *dir = d_inode(root); - struct inode *inode; struct dentry *dentry; - int i; - inode_lock(dir); - for (i = 0; files->name && files->name[0]; i++, files++) { - dentry = d_alloc_name(root, files->name); - if (!dentry) - goto out; - inode = nfsd_get_inode(d_inode(root)->i_sb, - S_IFREG | files->mode); - if (!inode) { - dput(dentry); - goto out; + for (int i = 0; files->name && files->name[0]; i++, files++) { + struct inode *inode = nfsd_get_inode(root->d_sb, + S_IFREG | files->mode); + if (!inode) + return -ENOMEM; + dentry = simple_start_creating(root, files->name); + if (IS_ERR(dentry)) { + iput(inode); + return PTR_ERR(dentry); } kref_get(&ncl->cl_ref); inode->i_fop = files->ops; inode->i_private = ncl; - d_add(dentry, inode); + d_instantiate(dentry, inode); fsnotify_create(dir, dentry); if (fdentries) fdentries[i] = dentry; + inode_unlock(dir); } - inode_unlock(dir); return 0; -out: - inode_unlock(dir); - return -ENOMEM; } /* on success, returns positive number unique to that client. */ @@ -1993,7 +1954,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. 
*/ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a664fdf1161e..6e2c8e2aab10 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc, ) ); +TRACE_EVENT(nfsd_file_get_dio_attrs, + TP_PROTO( + const struct inode *inode, + const struct kstat *stat + ), + TP_ARGS(inode, stat), + TP_STRUCT__entry( + __field(const void *, inode) + __field(unsigned long, mask) + __field(u32, mem_align) + __field(u32, offset_align) + __field(u32, read_offset_align) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->mask = stat->result_mask; + __entry->mem_align = stat->dio_mem_align; + __entry->offset_align = stat->dio_offset_align; + __entry->read_offset_align = stat->dio_read_offset_align; + ), + TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u", + __entry->inode, show_statx_mask(__entry->mask), + __entry->mem_align, __entry->offset_align, + __entry->read_offset_align + ) +); + TRACE_EVENT(nfsd_file_acquire, TP_PROTO( const struct svc_rqst *rqstp, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index eff04959606f..fde3e0c11dba 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -185,6 +185,10 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; + struct inode *inode = d_inode(p.dentry); + + if (S_ISREG(inode->i_mode)) + request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN); if (fh->fh_maxsize == NFS4_FHSIZE) request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index b78308975082..39e60218df7c 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -441,7 +441,9 @@ struct fanotify_perm_event { size_t count; u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ + unsigned short watchdog_cnt; /* already scanned by watchdog? 
*/ int fd; /* fd we passed to userspace for this event */ + pid_t recv_pid; /* pid of task receiving the event */ union { struct fanotify_response_info_header hdr; struct fanotify_response_info_audit_rule audit_rule; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b192ee068a7a..1dadda82cae5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -50,6 +50,7 @@ /* configurable via /proc/sys/fs/fanotify/ */ static int fanotify_max_queued_events __read_mostly; +static int perm_group_timeout __read_mostly; #ifdef CONFIG_SYSCTL @@ -85,6 +86,14 @@ static const struct ctl_table fanotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, + { + .procname = "watchdog_timeout", + .data = &perm_group_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, }; static void __init fanotify_sysctls_init(void) @@ -95,6 +104,91 @@ static void __init fanotify_sysctls_init(void) #define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ +static LIST_HEAD(perm_group_list); +static DEFINE_SPINLOCK(perm_group_lock); +static void perm_group_watchdog(struct work_struct *work); +static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog); + +static void perm_group_watchdog_schedule(void) +{ + schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout)); +} + +static void perm_group_watchdog(struct work_struct *work) +{ + struct fsnotify_group *group; + struct fanotify_perm_event *event; + struct task_struct *task; + pid_t failed_pid = 0; + + guard(spinlock)(&perm_group_lock); + if (list_empty(&perm_group_list)) + return; + + list_for_each_entry(group, &perm_group_list, + fanotify_data.perm_grp_list) { + /* + * Ok to test without lock, racing with an addition is + * fine, will deal with it next round + */ + if (list_empty(&group->fanotify_data.access_list)) + continue; + + spin_lock(&group->notification_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (likely(event->watchdog_cnt == 0)) { + event->watchdog_cnt = 1; + } else if (event->watchdog_cnt == 1) { + /* Report on event only once */ + event->watchdog_cnt = 2; + + /* Do not report same pid repeatedly */ + if (event->recv_pid == failed_pid) + continue; + + failed_pid = event->recv_pid; + rcu_read_lock(); + task = find_task_by_pid_ns(event->recv_pid, + &init_pid_ns); + pr_warn_ratelimited( + "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n", + event->recv_pid, + task ? task->comm : NULL, + perm_group_timeout); + rcu_read_unlock(); + } + } + spin_unlock(&group->notification_lock); + } + perm_group_watchdog_schedule(); +} + +static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group) +{ + if (!list_empty(&group->fanotify_data.perm_grp_list)) { + /* Perm event watchdog can no longer scan this group. */ + spin_lock(&perm_group_lock); + list_del_init(&group->fanotify_data.perm_grp_list); + spin_unlock(&perm_group_lock); + } +} + +static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group) +{ + if (!perm_group_timeout) + return; + + spin_lock(&perm_group_lock); + if (list_empty(&group->fanotify_data.perm_grp_list)) { + /* Add to perm_group_list for monitoring by watchdog. 
*/ + if (list_empty(&perm_group_list)) + perm_group_watchdog_schedule(); + list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list); + } + spin_unlock(&perm_group_lock); +} + /* * All flags that may be specified in parameter event_f_flags of fanotify_init. * @@ -953,6 +1047,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, spin_lock(&group->notification_lock); list_add_tail(&event->fse.list, &group->fanotify_data.access_list); + FANOTIFY_PERM(event)->recv_pid = current->pid; spin_unlock(&group->notification_lock); } } @@ -1012,6 +1107,8 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ fsnotify_group_stop_queueing(group); + fanotify_perm_watchdog_group_remove(group); + /* * Process all permission events on access_list and notification queue * and simulate reply from userspace. @@ -1465,6 +1562,10 @@ out: fsnotify_group_unlock(group); fsnotify_put_mark(fsn_mark); + + if (!ret && (mask & FANOTIFY_PERM_EVENTS)) + fanotify_perm_watchdog_group_add(group); + return ret; } @@ -1625,6 +1726,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); + INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list); switch (class) { case FAN_CLASS_NOTIF: group->priority = FSNOTIFY_PRIO_NORMAL; @@ -1999,7 +2101,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, user_ns = path.mnt->mnt_sb->s_user_ns; obj = path.mnt->mnt_sb; } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { + ret = -EINVAL; mntns = mnt_ns_from_dentry(path.dentry); + if (!mntns) + goto path_put_and_out; user_ns = mntns->user_ns; obj = mntns; } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index cd7d11b0eb08..7c326ec2e8a8 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -10,7 +10,7 @@ * Copyright 2006 Hewlett-Packard Development Company, L.P. 
* * Copyright (C) 2009 Eric Paris <Red Hat Inc> - * inotify was largely rewriten to make use of the fsnotify infrastructure + * inotify was largely rewritten to make use of the fsnotify infrastructure */ #include <linux/dcache.h> /* d_unlinked */ diff --git a/fs/nsfs.c b/fs/nsfs.c index e7fd8a790aaa..648dc59bef7f 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -571,7 +571,7 @@ static int nsfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +static struct file *nsfs_export_open(const struct path *path, unsigned int oflags) { return file_open_root(path, "", oflags, 0); } diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 04107b950717..65d05e6a0566 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -1371,6 +1371,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) mark_buffer_dirty(bh); unlock_buffer(bh); /* err = sync_dirty_buffer(bh); */ + put_bh(bh); b0 = 0; bits -= op; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index c1ece707b195..4c90ec2fa2ea 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -49,6 +49,30 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) return 0; } +static int ntfs_ioctl_get_volume_label(struct ntfs_sb_info *sbi, u8 __user *buf) +{ + if (copy_to_user(buf, sbi->volume.label, FSLABEL_MAX)) + return -EFAULT; + + return 0; +} + +static int ntfs_ioctl_set_volume_label(struct ntfs_sb_info *sbi, u8 __user *buf) +{ + u8 user[FSLABEL_MAX] = {0}; + int len; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(user, buf, FSLABEL_MAX)) + return -EFAULT; + + len = strnlen(user, FSLABEL_MAX); + + return ntfs_set_label(sbi, user, len); +} + /* * ntfs_ioctl - file_operations::unlocked_ioctl */ @@ -64,6 +88,10 @@ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) switch (cmd) { case FITRIM: return ntfs_ioctl_fitrim(sbi, arg); + case FS_IOC_GETFSLABEL: + return ntfs_ioctl_get_volume_label(sbi, (u8 __user *)arg); + case FS_IOC_SETFSLABEL: + return ntfs_ioctl_set_volume_label(sbi, (u8 __user *)arg); } return -ENOTTY; /* Inappropriate ioctl for device. */ } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 1bf2a6593dec..6d1bf890929d 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1508,6 +1508,16 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size); } + /* + * Index blocks exist, but $BITMAP has zero valid bits. + * This implies an on-disk corruption and must be rejected. + */ + if (in->name == I30_NAME && + unlikely(bmp_size_v == 0 && indx->alloc_run.count)) { + err = -EINVAL; + goto out1; + } + bit = bmp_size << 3; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 48b4f73a93ee..3959f23c487a 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -471,6 +471,7 @@ end_enum: fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) { /* Records in $Extend are not a files or general directories. */ inode->i_op = &ntfs_file_inode_operations; + mode = S_IFREG; } else { err = -EINVAL; goto out; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 1296e6fcc779..630128716ea7 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -280,7 +280,7 @@ struct ntfs_sb_info { __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY. u8 major_ver; u8 minor_ver; - char label[256]; + char label[FSLABEL_MAX]; bool real_dirty; // Real fs state. 
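/*
 * [Editor's note — not part of the patch] Sizing label[] as FSLABEL_MAX
 * lines up with the fixed-size copy_to_user()/copy_from_user() in the new
 * FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL handlers added to fs/ntfs3/file.c
 * above, so the ioctl paths need no separate length bookkeeping.
 */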
} volume; diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index 6e86d66197ef..88550085f745 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -9,6 +9,7 @@ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/log2.h> +#include <linux/overflow.h> #include "debug.h" #include "ntfs.h" @@ -982,14 +983,18 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, if (!dlcn) return -EINVAL; - lcn = prev_lcn + dlcn; + + if (check_add_overflow(prev_lcn, dlcn, &lcn)) + return -EINVAL; prev_lcn = lcn; } else { /* The size of 'dlcn' can't be > 8. */ return -EINVAL; } - next_vcn = vcn64 + len; + if (check_add_overflow(vcn64, len, &next_vcn)) + return -EINVAL; + /* Check boundary. */ if (next_vcn > evcn + 1) return -EINVAL; @@ -1153,7 +1158,8 @@ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn) return -EINVAL; run_buf += size_size + offset_size; - vcn64 += len; + if (check_add_overflow(vcn64, len, &vcn64)) + return -EINVAL; #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (vcn64 > 0x100000000ull) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 821cb7874685..162711cc5b20 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6928,8 +6928,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end, out: if (ret != 0) { - if (folios) - ocfs2_unlock_and_free_folios(folios, numfolios); + ocfs2_unlock_and_free_folios(folios, numfolios); numfolios = 0; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 86bb1a03bcc1..4145e06d2c08 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1477,7 +1477,6 @@ way_up_top: goto send_response; } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { spin_unlock(&res->spinlock); - // mlog(0, "node %u is the master\n", res->owner); response = DLM_MASTER_RESP_NO; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1493,7 +1492,6 @@ way_up_top: BUG(); } - // mlog(0, "lockres is in progress...\n"); spin_lock(&dlm->master_lock); found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { @@ -1503,8 +1501,6 @@ way_up_top: set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->type == DLM_MLE_BLOCK) { - // mlog(0, "this node is waiting for " - // "lockres to be mastered\n"); response = DLM_MASTER_RESP_NO; } else if (tmpmle->type == DLM_MLE_MIGRATION) { mlog(0, "node %u is master, but trying to migrate to " @@ -1531,8 +1527,6 @@ way_up_top: } else response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "this node is attempting to " - // "master lockres\n"); response = DLM_MASTER_RESP_MAYBE; } if (set_maybe) @@ -1559,7 +1553,6 @@ way_up_top: found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { /* this lockid has never been seen on this node yet */ - // mlog(0, "no mle found\n"); if (!mle) { spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -1573,8 +1566,6 @@ way_up_top: goto way_up_top; } - // mlog(0, "this is second time thru, already allocated, " - // "add the block.\n"); dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); __dlm_insert_mle(dlm, mle); @@ -1897,8 +1888,6 @@ ok: spin_unlock(&res->spinlock); } - // mlog(0, "woo! 
got an assert_master from node %u!\n", - // assert->node_idx); if (mle) { int extra_ref = 0; int nn = -1; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 00f52812dbb0..843ee02bd85f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -464,7 +464,6 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { - // mlog(0, "nothing to recover! sleeping now!\n"); spin_unlock(&dlm->spinlock); /* return to main thread loop and sleep. */ return 0; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 6c4f78f473fb..fcc89856ab95 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1495,6 +1495,14 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } + if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) { + rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->i_suballoc_slot)); + goto bail; + } + rc = 0; bail: diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index db14c92302a1..b6864602814c 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -358,13 +358,11 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, i); + int len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; @@ -651,12 +649,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, - OCFS2_INVALID_SLOT); + int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), + type, OCFS2_INVALID_SLOT); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index cbe2f8ed8897..86f2631e6360 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -364,7 +364,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, int *vict_bit, struct buffer_head **ret_bh) { - int ret, i, bits_per_unit = 0; + int ret, i, len, bits_per_unit = 0; u64 blkno; char namebuf[40]; @@ -375,9 +375,9 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, struct ocfs2_dinode *ac_dinode; struct ocfs2_group_desc *bg; - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); - ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno); + if (ret) { ret = -ENOENT; goto out; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e8e94599e907..ae0e44e5f2ad 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -614,7 +614,7 @@ struct ocfs2_super_block { __le16 s_reserved0; __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. * s_uuid_hash serves as seed[3]. 
*/ -/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ +/*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 765105f1ff8a..be0a5758bd40 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1011,6 +1011,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) printk(KERN_ERR "ocfs2: Could not determine" " locking version\n"); user_cluster_disconnect(conn); + lc = NULL; goto out; } wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 53a945da873b..d53a6cc866be 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -127,14 +127,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, char namebuf[40]; struct inode *inode = NULL; u64 blkno; - int status = 0; + int len, status = 0; - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, slot); + len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, len, &blkno); if (status < 0) { goto bail; } diff --git a/fs/open.c b/fs/open.c index 9655158c3885..3d64372ecc67 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1022,8 +1022,8 @@ cleanup_all: put_file_access(f); cleanup_file: path_put(&f->f_path); - f->f_path.mnt = NULL; - f->f_path.dentry = NULL; + f->__f_path.mnt = NULL; + f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } @@ -1050,7 +1050,7 @@ int finish_open(struct file *file, struct dentry *dentry, { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - file->f_path.dentry = dentry; + file->__f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -1059,19 +1059,21 @@ EXPORT_SYMBOL(finish_open); * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer - * @dentry: dentry or NULL (as returned from ->lookup()) + * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup()) * - * This can be used to set the result of a successful lookup in ->atomic_open(). + * This can be used to set the result of a lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * - * Returns "0" which must be the return value of ->atomic_open() after having - * called this function. + * Returns 0 or -E..., which must be the return value of ->atomic_open() after + * having called this function. 
*/ int finish_no_open(struct file *file, struct dentry *dentry) { - file->f_path.dentry = dentry; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); @@ -1091,7 +1093,7 @@ int vfs_open(const struct path *path, struct file *file) { int ret; - file->f_path = *path; + file->__f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 82395fe2b956..bec5475de094 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -38,8 +38,7 @@ static int orangefs_create(struct mnt_idmap *idmap, new_op->upcall.req.create.parent_refn = parent->refn; - fill_default_sys_attrs(new_op->upcall.req.create.attributes, - ORANGEFS_TYPE_METAFILE, mode); + fill_default_sys_attrs(new_op->upcall.req.create.attributes, mode); strscpy(new_op->upcall.req.create.d_name, dentry->d_name.name); @@ -240,9 +239,7 @@ static int orangefs_symlink(struct mnt_idmap *idmap, new_op->upcall.req.sym.parent_refn = parent->refn; - fill_default_sys_attrs(new_op->upcall.req.sym.attributes, - ORANGEFS_TYPE_SYMLINK, - mode); + fill_default_sys_attrs(new_op->upcall.req.sym.attributes, mode); strscpy(new_op->upcall.req.sym.entry_name, dentry->d_name.name); strscpy(new_op->upcall.req.sym.target, symname); @@ -316,8 +313,7 @@ static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, new_op->upcall.req.mkdir.parent_refn = parent->refn; - fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes, - ORANGEFS_TYPE_DIRECTORY, mode); + fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes, mode); strscpy(new_op->upcall.req.mkdir.d_name, dentry->d_name.name); diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 1c375fb65018..79267b3419f2 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -440,14 +440,13 @@ static ssize_t orangefs_debug_write(struct file *file, count = ORANGEFS_MAX_DEBUG_STRING_LEN; } - buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!buf) - goto out; - - if (copy_from_user(buf, ubuf, count - 1)) { + buf = memdup_user_nul(ubuf, count - 1); + if (IS_ERR(buf)) { gossip_debug(GOSSIP_DEBUGFS_DEBUG, - "%s: copy_from_user failed!\n", + "%s: memdup_user_nul failed!\n", __func__); + rc = PTR_ERR(buf); + buf = NULL; goto out; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3e153c2f6b82..29c6da43e396 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -462,7 +462,7 @@ int service_operation(struct orangefs_kernel_op_s *op, ((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? 
\ ORANGEFS_OP_INTERRUPTIBLE : 0) -#define fill_default_sys_attrs(sys_attr, type, mode) \ +#define fill_default_sys_attrs(sys_attr, mode) \ do { \ sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \ sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \ diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 74ef75586f38..eee3c5ed1bbb 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -54,7 +54,9 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags) static unsigned int xattr_key(const char *key) { unsigned int i = 0; - while (key) + if (!key) + return 0; + while (*key) i += *key++; return i % 16; } @@ -175,8 +177,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, cx->length = -1; cx->timeout = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; - hash_add(orangefs_inode->xattr_cache, &cx->node, - xattr_key(cx->key)); + hlist_add_head( &cx->node, + &orangefs_inode->xattr_cache[xattr_key(cx->key)]); } } goto out_release_op; @@ -229,8 +231,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, memcpy(cx->val, buffer, length); cx->length = length; cx->timeout = jiffies + HZ; - hash_add(orangefs_inode->xattr_cache, &cx->node, - xattr_key(cx->key)); + hlist_add_head(&cx->node, + &orangefs_inode->xattr_cache[xattr_key(cx->key)]); } } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 27396fe63f6d..aac7e34f56c1 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -242,7 +242,7 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) return 0; } -static int ovl_sync_file(struct path *path) +static int ovl_sync_file(const struct path *path) { struct file *new_file; int err; @@ -670,7 +670,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; - if (inode->i_flags & OVL_COPY_I_FLAGS_MASK && + if (inode->i_flags & OVL_FATTR_I_FLAGS_MASK && (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) { /* * Copy the fileattr inode flags that are the source of already diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index dbd63a74df4b..a5e9ddf3023b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -187,6 +187,13 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, /* mkdir is special... */ newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode); err = PTR_ERR_OR_ZERO(newdentry); + /* expect to inherit casefolding from workdir/upperdir */ + if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) { + pr_warn_ratelimited("wrong inherited casefold (%pd2)\n", + newdentry); + dput(newdentry); + err = -EINVAL; + } break; case S_IFCHR: @@ -205,12 +212,32 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, err = -EPERM; } } - if (!err && WARN_ON(!newdentry->d_inode)) { + if (err) + goto out; + + if (WARN_ON(!newdentry->d_inode)) { /* * Not quite sure if non-instantiated dentry is legal or not. * VFS doesn't seem to care so check and warn here. */ err = -EIO; + } else if (d_unhashed(newdentry)) { + struct dentry *d; + /* + * Some filesystems (i.e. casefolded) may return an unhashed + * negative dentry from the ovl_lookup_upper() call before + * ovl_create_real(). + * In that case, lookup again after making the newdentry + * positive, so ovl_create_upper() always returns a hashed + * positive dentry. + */ + d = ovl_lookup_upper(ofs, newdentry->d_name.name, parent, + newdentry->d_name.len); + dput(newdentry); + if (IS_ERR_OR_NULL(d)) + err = d ? 
PTR_ERR(d) : -ENOENT; + else + return d; } out: if (err) { diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index f5b8877d5fe2..fc52c796061d 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -120,7 +120,7 @@ static bool ovl_is_real_file(const struct file *realfile, } static struct file *ovl_real_file_path(const struct file *file, - struct path *realpath) + const struct path *realpath) { struct ovl_file *of = file->private_data; struct file *realfile = of->realfile; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index ecb9f2019395..aaa4cf579561 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -1277,6 +1277,7 @@ struct inode *ovl_get_inode(struct super_block *sb, } ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); ovl_inode_init(inode, oip, ino, fsid); + WARN_ON_ONCE(!!IS_CASEFOLDED(inode) != ofs->casefold); if (upperdentry && ovl_is_impuredir(sb, upperdentry)) ovl_set_flag(OVL_IMPURE, inode); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 76d6248b625e..e93bcc5727bc 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -239,13 +239,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, char val; /* - * We allow filesystems that are case-folding capable but deny composing - * ovl stack from case-folded directories. If someone has enabled case - * folding on a directory on underlying layer, the warranty of the ovl - * stack is voided. + * We allow filesystems that are case-folding capable as long as the + * layers are consistently enabled in the stack, enabled for every dir + * or disabled in all dirs. If someone has modified case folding on a + * directory on underlying layer, the warranty of the ovl stack is + * voided. */ - if (ovl_dentry_casefolded(base)) { - warn = "case folded parent"; + if (ofs->casefold != ovl_dentry_casefolded(base)) { + warn = "parent wrong casefold"; err = -ESTALE; goto out_warn; } @@ -259,8 +260,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto out_err; } - if (ovl_dentry_casefolded(this)) { - warn = "case folded child"; + if (ofs->casefold != ovl_dentry_casefolded(this)) { + warn = "child wrong casefold"; err = -EREMOTE; goto out_warn; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4f84abaa0d68..c8fd5951fc5e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -562,11 +562,11 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); -int ovl_ensure_verity_loaded(struct path *path); +int ovl_ensure_verity_loaded(const struct path *path); int ovl_validate_verity(struct ovl_fs *ofs, - struct path *metapath, - struct path *datapath); -int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, + const struct path *metapath, + const struct path *datapath); +int ovl_get_verity_digest(struct ovl_fs *ofs, const struct path *src, struct ovl_metacopy *metacopy); int ovl_sync_status(struct ovl_fs *ofs); @@ -820,10 +820,12 @@ struct inode *ovl_get_inode(struct super_block *sb, struct ovl_inode_params *oip); void ovl_copyattr(struct inode *to); +/* vfs fileattr flags read from overlay.protattr xattr to ovl inode */ +#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE) +/* vfs fileattr flags copied from real to ovl inode */ +#define OVL_FATTR_I_FLAGS_MASK (OVL_PROT_I_FLAGS_MASK | S_SYNC | S_NOATIME) /* vfs 
inode flags copied from real to ovl inode */ -#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE) -/* vfs inode flags read from overlay.protattr xattr to ovl inode */ -#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE) +#define OVL_COPY_I_FLAGS_MASK (OVL_FATTR_I_FLAGS_MASK | S_CASEFOLD) /* * fileattr flags copied from lower to upper inode on copy up. diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 4c1bae935ced..1d4828dbcf7a 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -91,6 +91,7 @@ struct ovl_fs { struct mutex whiteout_lock; /* r/o snapshot of upperdir sb's only taken on volatile mounts */ errseq_t errseq; + bool casefold; }; /* Number of lower layers, not including data-only layers */ diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index f4e7fff909ac..63b7346c5ee1 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -276,17 +276,26 @@ static int ovl_mount_dir(const char *name, struct path *path) static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, enum ovl_opt layer, const char *name, bool upper) { + bool is_casefolded = ovl_dentry_casefolded(path->dentry); struct ovl_fs_context *ctx = fc->fs_private; + struct ovl_fs *ofs = fc->s_fs_info; if (!d_is_dir(path->dentry)) return invalfc(fc, "%s is not a directory", name); /* * Allow filesystems that are case-folding capable but deny composing - * ovl stack from case-folded directories. + * ovl stack from inconsistent case-folded directories. */ - if (ovl_dentry_casefolded(path->dentry)) - return invalfc(fc, "case-insensitive directory on %s not supported", name); + if (!ctx->casefold_set) { + ofs->casefold = is_casefolded; + ctx->casefold_set = true; + } + + if (ofs->casefold != is_casefolded) { + return invalfc(fc, "case-%ssensitive directory on %s is inconsistent", + is_casefolded ? 
"in" : "", name); + } if (ovl_dentry_weird(path->dentry)) return invalfc(fc, "filesystem on %s not supported", name); diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h index c96d93982021..ffd53cdd8482 100644 --- a/fs/overlayfs/params.h +++ b/fs/overlayfs/params.h @@ -33,6 +33,7 @@ struct ovl_fs_context { struct ovl_opt_set set; struct ovl_fs_context_layer *lower; char *lowerdir_all; /* user provided lowerdir string */ + bool casefold_set; }; int ovl_init_fs_context(struct fs_context *fc); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 15cb06fa0c9a..1e9792cc557b 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -27,6 +27,8 @@ struct ovl_cache_entry { bool is_upper; bool is_whiteout; bool check_xwhiteout; + const char *c_name; + int c_len; char name[]; }; @@ -45,6 +47,7 @@ struct ovl_readdir_data { struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; + struct unicode_map *map; int count; int err; bool is_upper; @@ -66,6 +69,31 @@ static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) return rb_entry(n, struct ovl_cache_entry, node); } +static int ovl_casefold(struct ovl_readdir_data *rdd, const char *str, int len, + char **dst) +{ + const struct qstr qstr = { .name = str, .len = len }; + char *cf_name; + int cf_len; + + if (!IS_ENABLED(CONFIG_UNICODE) || !rdd->map || is_dot_dotdot(str, len)) + return 0; + + cf_name = kmalloc(NAME_MAX, GFP_KERNEL); + if (!cf_name) { + rdd->err = -ENOMEM; + return -ENOMEM; + } + + cf_len = utf8_casefold(rdd->map, &qstr, cf_name, NAME_MAX); + if (cf_len > 0) + *dst = cf_name; + else + kfree(cf_name); + + return cf_len; +} + static bool ovl_cache_entry_find_link(const char *name, int len, struct rb_node ***link, struct rb_node **parent) @@ -79,10 +107,10 @@ static bool ovl_cache_entry_find_link(const char *name, int len, *parent = *newp; tmp = ovl_cache_entry_from_node(*newp); - cmp = strncmp(name, tmp->name, len); + cmp = strncmp(name, tmp->c_name, len); if (cmp > 0) newp = &tmp->node.rb_right; - else if (cmp < 0 || len < tmp->len) + else if (cmp < 0 || len < tmp->c_len) newp = &tmp->node.rb_left; else found = true; @@ -101,10 +129,10 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, while (node) { struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); - cmp = strncmp(name, p->name, len); + cmp = strncmp(name, p->c_name, len); if (cmp > 0) node = p->node.rb_right; - else if (cmp < 0 || len < p->len) + else if (cmp < 0 || len < p->c_len) node = p->node.rb_left; else return p; @@ -145,6 +173,7 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, + const char *c_name, int c_len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; @@ -167,6 +196,14 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, /* Defer check for overlay.whiteout to ovl_iterate() */ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG; + if (c_name && c_name != name) { + p->c_name = c_name; + p->c_len = c_len; + } else { + p->c_name = p->name; + p->c_len = len; + } + if (d_type == DT_CHR) { p->next_maybe_whiteout = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p; @@ -174,48 +211,62 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, return p; } -static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, - const char *name, int len, u64 ino, +/* 
Return 0 for found, 1 for added, <0 for error */ +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, + const char *c_name, int c_len, + u64 ino, unsigned int d_type) { struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; - if (ovl_cache_entry_find_link(name, len, &newp, &parent)) - return true; + if (ovl_cache_entry_find_link(c_name, c_len, &newp, &parent)) + return 0; - p = ovl_cache_entry_new(rdd, name, len, ino, d_type); + p = ovl_cache_entry_new(rdd, name, len, c_name, c_len, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; - return false; + return -ENOMEM; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); - return true; + return 1; } -static bool ovl_fill_lowest(struct ovl_readdir_data *rdd, +/* Return 0 for found, 1 for added, <0 for error */ +static int ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, + const char *c_name, int c_len, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; - p = ovl_cache_entry_find(rdd->root, name, namelen); + p = ovl_cache_entry_find(rdd->root, c_name, c_len); if (p) { list_move_tail(&p->l_node, &rdd->middle); + return 0; } else { - p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd, name, namelen, c_name, c_len, + ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else list_add_tail(&p->l_node, &rdd->middle); } - return rdd->err == 0; + return rdd->err ?: 1; +} + +static void ovl_cache_entry_free(struct ovl_cache_entry *p) +{ + if (p->c_name != p->name) + kfree(p->c_name); + kfree(p); } void ovl_cache_free(struct list_head *list) @@ -224,7 +275,7 @@ void ovl_cache_free(struct list_head *list) struct ovl_cache_entry *n; list_for_each_entry_safe(p, n, list, l_node) - kfree(p); + ovl_cache_entry_free(p); INIT_LIST_HEAD(list); } @@ -260,12 +311,39 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); + struct ovl_fs *ofs = OVL_FS(rdd->dentry->d_sb); + const char *c_name = NULL; + char *cf_name = NULL; + int c_len = 0, ret; + + if (ofs->casefold) + c_len = ovl_casefold(rdd, name, namelen, &cf_name); + + if (rdd->err) + return false; + + if (c_len <= 0) { + c_name = name; + c_len = namelen; + } else { + c_name = cf_name; + } rdd->count++; if (!rdd->is_lowest) - return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + ret = ovl_cache_entry_add_rb(rdd, name, namelen, c_name, c_len, ino, d_type); else - return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); + ret = ovl_fill_lowest(rdd, name, namelen, c_name, c_len, offset, ino, d_type); + + /* + * If ret == 1, that means that c_name is being used as part of struct + * ovl_cache_entry and will be freed at ovl_cache_free(). Otherwise, + * c_name was found in the rb-tree so we can free it here. 
+ */ + if (ret != 1 && c_name != name) + kfree(c_name); + + return ret >= 0; } static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) @@ -357,12 +435,18 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, .list = list, .root = root, .is_lowest = false, + .map = NULL, }; int idx, next; const struct ovl_layer *layer; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath, &layer); + + if (ofs->casefold) + rdd.map = sb_encoding(realpath.dentry->d_sb); + rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && ovl_dentry_has_xwhiteouts(dentry); @@ -555,7 +639,7 @@ static bool ovl_fill_plain(struct dir_context *ctx, const char *name, container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; - p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd, name, namelen, NULL, 0, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return false; @@ -595,7 +679,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list, } if (p->ino == p->real_ino) { list_del(&p->l_node); - kfree(p); + ovl_cache_entry_free(p); } else { struct rb_node **newp = &root->rb_node; struct rb_node *parent = NULL; @@ -1023,7 +1107,7 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) del_entry: list_del(&p->l_node); - kfree(p); + ovl_cache_entry_free(p); } return err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index bd3d7ba8fb95..43ee4c7296a7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -161,6 +161,16 @@ static const struct dentry_operations ovl_dentry_operations = { .d_weak_revalidate = ovl_dentry_weak_revalidate, }; +#if IS_ENABLED(CONFIG_UNICODE) +static const struct dentry_operations ovl_dentry_ci_operations = { + .d_real = ovl_d_real, + .d_revalidate = ovl_dentry_revalidate, + .d_weak_revalidate = ovl_dentry_weak_revalidate, + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, +}; +#endif + static struct kmem_cache *ovl_inode_cachep; static struct inode *ovl_alloc_inode(struct super_block *sb) @@ -394,7 +404,7 @@ static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, return err; } -static int ovl_lower_dir(const char *name, struct path *path, +static int ovl_lower_dir(const char *name, const struct path *path, struct ovl_fs *ofs, int *stack_depth) { int fh_type; @@ -991,6 +1001,25 @@ static int ovl_get_data_fsid(struct ovl_fs *ofs) return ofs->numfs; } +/* + * Set the ovl sb encoding as the same one used by the first layer + */ +static int ovl_set_encoding(struct super_block *sb, struct super_block *fs_sb) +{ + if (!sb_has_encoding(fs_sb)) + return 0; + +#if IS_ENABLED(CONFIG_UNICODE) + if (sb_has_strict_encoding(fs_sb)) { + pr_err("strict encoding not supported\n"); + return -EINVAL; + } + + sb->s_encoding = fs_sb->s_encoding; + sb->s_encoding_flags = fs_sb->s_encoding_flags; +#endif + return 0; +} static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, struct ovl_fs_context *ctx, struct ovl_layer *layers) @@ -1024,6 +1053,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, if (ovl_upper_mnt(ofs)) { ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb; ofs->fs[0].is_lower = false; + + if (ofs->casefold) { + err = ovl_set_encoding(sb, ofs->fs[0].sb); + if (err) + return err; + } } nr_merged_lower = ctx->nr - ctx->nr_data; @@ -1083,6 +1118,19 @@ static int 
ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, l->name = NULL; ofs->numlayer++; ofs->fs[fsid].is_lower = true; + + if (ofs->casefold) { + if (!ovl_upper_mnt(ofs) && !sb_has_encoding(sb)) { + err = ovl_set_encoding(sb, ofs->fs[fsid].sb); + if (err) + return err; + } + + if (!sb_same_encoding(sb, mnt->mnt_sb)) { + pr_err("all layers must have the same encoding\n"); + return -EINVAL; + } + } } /* @@ -1300,6 +1348,7 @@ static struct dentry *ovl_get_root(struct super_block *sb, ovl_dentry_set_flag(OVL_E_CONNECTED, root); ovl_set_upperdata(d_inode(root)); ovl_inode_init(d_inode(root), &oip, ino, fsid); + WARN_ON(!!IS_CASEFOLDED(d_inode(root)) != ofs->casefold); ovl_dentry_init_flags(root, upperdentry, oe, DCACHE_OP_WEAK_REVALIDATE); /* root keeps a reference of upperdentry */ dget(upperdentry); @@ -1307,6 +1356,19 @@ static struct dentry *ovl_get_root(struct super_block *sb, return root; } +static void ovl_set_d_op(struct super_block *sb) +{ +#if IS_ENABLED(CONFIG_UNICODE) + struct ovl_fs *ofs = sb->s_fs_info; + + if (ofs->casefold) { + set_default_d_op(sb, &ovl_dentry_ci_operations); + return; + } +#endif + set_default_d_op(sb, &ovl_dentry_operations); +} + int ovl_fill_super(struct super_block *sb, struct fs_context *fc) { struct ovl_fs *ofs = sb->s_fs_info; @@ -1322,7 +1384,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (WARN_ON(fc->user_ns != current_user_ns())) goto out_err; - set_default_d_op(sb, &ovl_dentry_operations); + ovl_set_d_op(sb); err = -ENOMEM; if (!ofs->creator_cred) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 41033bac96cb..f76672f2e686 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -210,11 +210,11 @@ bool ovl_dentry_weird(struct dentry *dentry) return true; /* - * Allow filesystems that are case-folding capable but deny composing - * ovl stack from case-folded directories. 
+ * Exceptionally for layers with casefold, we accept that they have + * their own hash and compare operations */ if (sb_has_encoding(dentry->d_sb)) - return IS_CASEFOLDED(d_inode(dentry)); + return false; return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE); } @@ -1381,7 +1381,7 @@ err_free: } /* Call with mounter creds as it may open the file */ -int ovl_ensure_verity_loaded(struct path *datapath) +int ovl_ensure_verity_loaded(const struct path *datapath) { struct inode *inode = d_inode(datapath->dentry); struct file *filp; @@ -1401,8 +1401,8 @@ int ovl_ensure_verity_loaded(struct path *datapath) } int ovl_validate_verity(struct ovl_fs *ofs, - struct path *metapath, - struct path *datapath) + const struct path *metapath, + const struct path *datapath) { struct ovl_metacopy metacopy_data; u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE]; @@ -1455,7 +1455,7 @@ int ovl_validate_verity(struct ovl_fs *ofs, return 0; } -int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, +int ovl_get_verity_digest(struct ovl_fs *ofs, const struct path *src, struct ovl_metacopy *metacopy) { int err, digest_size; diff --git a/fs/pidfs.c b/fs/pidfs.c index 44a95cd27377..0ef5b47d796a 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -850,7 +850,7 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +static struct file *pidfs_export_open(const struct path *path, unsigned int oflags) { /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise diff --git a/fs/pnode.c b/fs/pnode.c index 6f7d02f3fa98..5d91c3e58d2a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -29,6 +29,7 @@ static inline struct mount *next_slave(struct mount *p) return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave); } +/* locks: namespace_shared && is_mounted(mnt) */ static struct mount *get_peer_under_root(struct mount *mnt, struct mnt_namespace *ns, const struct path *root) @@ -50,7 +51,7 @@ static struct mount *get_peer_under_root(struct mount *mnt, * Get ID of closest dominating peer group having a representative * under the given root. 
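The pnode hunks that follow replace comment-only locking contracts ("Caller must hold namespace_sem") with terse "locks:" annotations, and propagate_mnt() further down swaps an explicit read_seqlock_excl()/read_sequnlock_excl() pair for scoped_guard(mount_locked_reader), which cannot leak the lock on an early exit. A rough illustration of that guard pattern built on linux/cleanup.h; the names here are stand-ins, not the mount code's own definitions:

#include <linux/cleanup.h>
#include <linux/seqlock.h>

static DEFINE_SEQLOCK(demo_lock);

/* A zero-argument guard class: take the lock on scope entry, drop it on
 * every exit path, including early returns. */
DEFINE_LOCK_GUARD_0(demo_locked_reader,
                    read_seqlock_excl(&demo_lock),
                    read_sequnlock_excl(&demo_lock))

static void demo_update(int bail_early)
{
        scoped_guard(demo_locked_reader) {
                if (bail_early)
                        return;         /* unlock happens automatically */
                /* ... mutate state covered by demo_lock ... */
        }
}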
* - * Caller must hold namespace_sem + * locks: namespace_shared */ int get_dominating_id(struct mount *mnt, const struct path *root) { @@ -70,19 +71,6 @@ static inline bool will_be_unmounted(struct mount *m) return m->mnt.mnt_flags & MNT_UMOUNT; } -static struct mount *propagation_source(struct mount *mnt) -{ - do { - struct mount *m; - for (m = next_peer(mnt); m != mnt; m = next_peer(m)) { - if (!will_be_unmounted(m)) - return m; - } - mnt = mnt->mnt_master; - } while (mnt && will_be_unmounted(mnt)); - return mnt; -} - static void transfer_propagation(struct mount *mnt, struct mount *to) { struct hlist_node *p = NULL, *n; @@ -111,11 +99,10 @@ void change_mnt_propagation(struct mount *mnt, int type) return; } if (IS_MNT_SHARED(mnt)) { - if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list)) - m = propagation_source(mnt); if (list_empty(&mnt->mnt_share)) { mnt_release_group_id(mnt); } else { + m = next_peer(mnt); list_del_init(&mnt->mnt_share); mnt->mnt_group_id = 0; } @@ -136,6 +123,57 @@ void change_mnt_propagation(struct mount *mnt, int type) } } +static struct mount *trace_transfers(struct mount *m) +{ + while (1) { + struct mount *next = next_peer(m); + + if (next != m) { + list_del_init(&m->mnt_share); + m->mnt_group_id = 0; + m->mnt_master = next; + } else { + if (IS_MNT_SHARED(m)) + mnt_release_group_id(m); + next = m->mnt_master; + } + hlist_del_init(&m->mnt_slave); + CLEAR_MNT_SHARED(m); + SET_MNT_MARK(m); + + if (!next || !will_be_unmounted(next)) + return next; + if (IS_MNT_MARKED(next)) + return next->mnt_master; + m = next; + } +} + +static void set_destinations(struct mount *m, struct mount *master) +{ + struct mount *next; + + while ((next = m->mnt_master) != master) { + m->mnt_master = master; + m = next; + } +} + +void bulk_make_private(struct list_head *set) +{ + struct mount *m; + + list_for_each_entry(m, set, mnt_list) + if (!IS_MNT_MARKED(m)) + set_destinations(m, trace_transfers(m)); + + list_for_each_entry(m, set, mnt_list) { + transfer_propagation(m, m->mnt_master); + m->mnt_master = NULL; + CLEAR_MNT_MARK(m); + } +} + static struct mount *__propagation_next(struct mount *m, struct mount *origin) { @@ -304,9 +342,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, err = PTR_ERR(this); break; } - read_seqlock_excl(&mount_lock); - mnt_set_mountpoint(n, dest_mp, this); - read_sequnlock_excl(&mount_lock); + scoped_guard(mount_locked_reader) + mnt_set_mountpoint(n, dest_mp, this); if (n->mnt_master) SET_MNT_MARK(n->mnt_master); copy = this; diff --git a/fs/pnode.h b/fs/pnode.h index 00ab153e3e9d..b029db225f33 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -42,6 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2) } void change_mnt_propagation(struct mount *, int); +void bulk_make_private(struct list_head *); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); void propagate_umount(struct list_head *); diff --git a/fs/proc/base.c b/fs/proc/base.c index b997ceef9135..6299878e3d97 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3947,7 +3947,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) tid = task_pid_nr_ns(task, ns); if (!tid) continue; /* The task has just exited. 
*/ - len = snprintf(name, sizeof(name), "%u", tid); + len = snprintf(name, sizeof(name), "%d", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index afa15a214538..6c4a6ee1fa2b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -162,6 +162,9 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; /* SLAB cache for dquot structures */ static struct kmem_cache *dquot_cachep; +/* workqueue for work quota_release_work*/ +static struct workqueue_struct *quota_unbound_wq; + void register_quota_format(struct quota_format_type *fmt) { spin_lock(&dq_list_lock); @@ -881,7 +884,7 @@ void dqput(struct dquot *dquot) put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); - queue_delayed_work(system_dfl_wq, "a_release_work, 1); + queue_delayed_work(quota_unbound_wq, "a_release_work, 1); } EXPORT_SYMBOL(dqput); @@ -3041,6 +3044,11 @@ static int __init dquot_init(void) shrinker_register(dqcache_shrinker); + quota_unbound_wq = alloc_workqueue("quota_events_unbound", + WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE); + if (!quota_unbound_wq) + panic("Cannot create quota_unbound_wq\n"); + return 0; } fs_initcall(dquot_init); diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 9f05f94e265a..a4c02199fef4 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -15,6 +15,7 @@ config CIFS select CRYPTO_GCM select CRYPTO_ECB select CRYPTO_AES + select CRYPTO_LIB_ARC4 select KEYS select DNS_RESOLVER select ASN1 diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index b69daeb1301b..b36f9f9340f0 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -36,9 +36,8 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, * fully cached or it may be in the process of * being deleted due to a lease break. */ - if (!cfid->time || !cfid->has_lease) { + if (!is_valid_cached_dir(cfid)) return NULL; - } kref_get(&cfid->refcount); return cfid; } @@ -194,7 +193,7 @@ replay_again: * Otherwise, it is either a new entry or laundromat worker removed it * from @cfids->entries. Caller will put last reference if the latter. 
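The quota change above moves deferred dquot release off a shared system workqueue onto a dedicated one. The key detail is the flag pair: WQ_UNBOUND lets the batchy release work run on any CPU, and WQ_MEM_RECLAIM guarantees a rescuer thread so the release path cannot stall when the system is short on memory. A minimal sketch of the same pattern with hypothetical names:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static void demo_release_fn(struct work_struct *work)
{
        /* ... drop deferred references here ... */
}
static DECLARE_DELAYED_WORK(demo_release_work, demo_release_fn);

static int __init demo_init(void)
{
        demo_wq = alloc_workqueue("demo_unbound",
                                  WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
        if (!demo_wq)
                return -ENOMEM;
        /* ~1 jiffy delay, as in dqput() above */
        queue_delayed_work(demo_wq, &demo_release_work, 1);
        return 0;
}
module_init(demo_init);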
*/ - if (cfid->has_lease && cfid->time) { + if (is_valid_cached_dir(cfid)) { cfid->last_access_time = jiffies; spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; @@ -233,7 +232,7 @@ replay_again: list_for_each_entry(parent_cfid, &cfids->entries, entry) { if (parent_cfid->dentry == dentry->d_parent) { cifs_dbg(FYI, "found a parent cached file handle\n"); - if (parent_cfid->has_lease && parent_cfid->time) { + if (is_valid_cached_dir(parent_cfid)) { lease_flags |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; memcpy(pfid->parent_lease_key, @@ -417,12 +416,18 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, if (cfids == NULL) return -EOPNOTSUPP; + if (!dentry) + return -ENOENT; + spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { - if (dentry && cfid->dentry == dentry) { + if (cfid->dentry == dentry) { + if (!is_valid_cached_dir(cfid)) + break; cifs_dbg(FYI, "found a cached file handle by dentry\n"); kref_get(&cfid->refcount); *ret_cfid = cfid; + cfid->last_access_time = jiffies; spin_unlock(&cfids->cfid_list_lock); return 0; } @@ -522,10 +527,9 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) spin_unlock(&cifs_sb->tlink_tree_lock); goto done; } - spin_lock(&cfid->fid_lock); + tmp_list->dentry = cfid->dentry; cfid->dentry = NULL; - spin_unlock(&cfid->fid_lock); list_add_tail(&tmp_list->entry, &entry); } @@ -608,14 +612,9 @@ static void cached_dir_put_work(struct work_struct *work) { struct cached_fid *cfid = container_of(work, struct cached_fid, put_work); - struct dentry *dentry; - - spin_lock(&cfid->fid_lock); - dentry = cfid->dentry; + dput(cfid->dentry); cfid->dentry = NULL; - spin_unlock(&cfid->fid_lock); - dput(dentry); queue_work(serverclose_wq, &cfid->close_work); } @@ -673,7 +672,6 @@ static struct cached_fid *init_cached_dir(const char *path) INIT_LIST_HEAD(&cfid->entry); INIT_LIST_HEAD(&cfid->dirents.entries); mutex_init(&cfid->dirents.de_mutex); - spin_lock_init(&cfid->fid_lock); kref_init(&cfid->refcount); return cfid; } @@ -697,6 +695,21 @@ static void free_cached_dir(struct cached_fid *cfid) kfree(dirent); } + /* adjust tcon-level counters and reset per-dir accounting */ + if (cfid->cfids) { + if (cfid->dirents.entries_count) + atomic_long_sub((long)cfid->dirents.entries_count, + &cfid->cfids->total_dirents_entries); + if (cfid->dirents.bytes_used) { + atomic64_sub((long long)cfid->dirents.bytes_used, + &cfid->cfids->total_dirents_bytes); + atomic64_sub((long long)cfid->dirents.bytes_used, + &cifs_dircache_bytes_used); + } + } + cfid->dirents.entries_count = 0; + cfid->dirents.bytes_used = 0; + kfree(cfid->path); cfid->path = NULL; kfree(cfid); @@ -725,7 +738,6 @@ static void cfids_laundromat_worker(struct work_struct *work) { struct cached_fids *cfids; struct cached_fid *cfid, *q; - struct dentry *dentry; LIST_HEAD(entry); cfids = container_of(work, struct cached_fids, laundromat_work.work); @@ -752,12 +764,9 @@ static void cfids_laundromat_worker(struct work_struct *work) list_for_each_entry_safe(cfid, q, &entry, entry) { list_del(&cfid->entry); - spin_lock(&cfid->fid_lock); - dentry = cfid->dentry; + dput(cfid->dentry); cfid->dentry = NULL; - spin_unlock(&cfid->fid_lock); - dput(dentry); if (cfid->is_open) { spin_lock(&cifs_tcp_ses_lock); ++cfid->tcon->tc_count; @@ -792,6 +801,9 @@ struct cached_fids *init_cached_dirs(void) queue_delayed_work(cfid_put_wq, &cfids->laundromat_work, dir_cache_timeout * HZ); + atomic_long_set(&cfids->total_dirents_entries, 0); + atomic64_set(&cfids->total_dirents_bytes, 0); + return cfids; } 
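The accounting added through cached_dir.c follows a two-tier pattern: per-directory counters are plain integers updated under the directory's mutex, and they roll up into per-tcon and module-wide atomics so the totals can be read lock-free (e.g. from procfs). On teardown, the whole per-directory total is subtracted in one step rather than entry by entry. A compressed sketch of the invariant, with hypothetical names:

#include <linux/atomic.h>

static atomic64_t demo_global_bytes = ATOMIC64_INIT(0);

struct demo_dir_cache {
        unsigned long entries_count;    /* plain: guarded by the dir mutex */
        unsigned long bytes_used;
        atomic_long_t *tcon_entries;    /* shared roll-up counters */
        atomic64_t *tcon_bytes;
};

static void demo_charge(struct demo_dir_cache *dc, size_t entry_cost)
{
        dc->entries_count++;
        dc->bytes_used += entry_cost;
        atomic_long_inc(dc->tcon_entries);
        atomic64_add(entry_cost, dc->tcon_bytes);
        atomic64_add(entry_cost, &demo_global_bytes);
}

/* Teardown subtracts the directory's totals wholesale, keeping the
 * shared counters consistent without walking each entry. */
static void demo_uncharge_all(struct demo_dir_cache *dc)
{
        atomic_long_sub(dc->entries_count, dc->tcon_entries);
        atomic64_sub(dc->bytes_used, dc->tcon_bytes);
        atomic64_sub(dc->bytes_used, &demo_global_bytes);
        dc->entries_count = 0;
        dc->bytes_used = 0;
}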
diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index 46b5a2fdf15b..31339dc32719 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -27,6 +27,9 @@ struct cached_dirents { struct mutex de_mutex; loff_t pos; /* Expected ctx->pos */ struct list_head entries; + /* accounting for cached entries in this directory */ + unsigned long entries_count; + unsigned long bytes_used; }; struct cached_fid { @@ -41,7 +44,6 @@ struct cached_fid { unsigned long last_access_time; /* jiffies of when last accessed */ struct kref refcount; struct cifs_fid fid; - spinlock_t fid_lock; struct cifs_tcon *tcon; struct dentry *dentry; struct work_struct put_work; @@ -62,8 +64,20 @@ struct cached_fids { struct list_head dying; struct work_struct invalidation_work; struct delayed_work laundromat_work; + /* aggregate accounting for all cached dirents under this tcon */ + atomic_long_t total_dirents_entries; + atomic64_t total_dirents_bytes; }; +/* Module-wide directory cache accounting (defined in cifsfs.c) */ +extern atomic64_t cifs_dircache_bytes_used; /* bytes across all mounts */ + +static inline bool +is_valid_cached_dir(struct cached_fid *cfid) +{ + return cfid->time && cfid->has_lease; +} + extern struct cached_fids *init_cached_dirs(void); extern void free_cached_dirs(struct cached_fids *cfids); extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 35c4d27d2cc0..1fb71d2d31b5 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -240,14 +240,18 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsFileInfo *cfile; + struct inode *inode; + struct cifsInodeInfo *cinode; + char lease[4]; + int n; seq_puts(m, "# Version:1\n"); seq_puts(m, "# Format:\n"); seq_puts(m, "# <tree id> <ses id> <persistent fid> <flags> <count> <pid> <uid>"); #ifdef CONFIG_CIFS_DEBUG2 - seq_printf(m, " <filename> <mid>\n"); + seq_puts(m, " <filename> <lease> <mid>\n"); #else - seq_printf(m, " <filename>\n"); + seq_puts(m, " <filename> <lease>\n"); #endif /* CIFS_DEBUG2 */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { @@ -267,11 +271,30 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) cfile->pid, from_kuid(&init_user_ns, cfile->uid), cfile->dentry); + + /* Append lease/oplock caching state as RHW letters */ + inode = d_inode(cfile->dentry); + n = 0; + if (inode) { + cinode = CIFS_I(inode); + if (CIFS_CACHE_READ(cinode)) + lease[n++] = 'R'; + if (CIFS_CACHE_HANDLE(cinode)) + lease[n++] = 'H'; + if (CIFS_CACHE_WRITE(cinode)) + lease[n++] = 'W'; + } + lease[n] = '\0'; + seq_puts(m, " "); + if (n) + seq_printf(m, "%s", lease); + else + seq_puts(m, "NONE"); + #ifdef CONFIG_CIFS_DEBUG2 - seq_printf(m, " %llu\n", cfile->fid.mid); -#else + seq_printf(m, " %llu", cfile->fid.mid); +#endif /* CONFIG_CIFS_DEBUG2 */ seq_printf(m, "\n"); -#endif /* CIFS_DEBUG2 */ } spin_unlock(&tcon->open_file_lock); } @@ -308,7 +331,10 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) if (!cfids) continue; spin_lock(&cfids->cfid_list_lock); /* check lock ordering */ - seq_printf(m, "Num entries: %d\n", cfids->num_entries); + seq_printf(m, "Num entries: %d, cached_dirents: %lu entries, %llu bytes\n", + cfids->num_entries, + (unsigned long)atomic_long_read(&cfids->total_dirents_entries), + (unsigned long long)atomic64_read(&cfids->total_dirents_bytes)); 
list_for_each_entry(cfid, &cfids->entries, entry) { seq_printf(m, "0x%x 0x%llx 0x%llx %s", tcon->tid, @@ -319,6 +345,9 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) seq_printf(m, "\tvalid file info"); if (cfid->dirents.is_valid) seq_printf(m, ", valid dirents"); + if (!list_empty(&cfid->dirents.entries)) + seq_printf(m, ", dirents: %lu entries, %lu bytes", + cfid->dirents.entries_count, cfid->dirents.bytes_used); seq_printf(m, "\n"); } spin_unlock(&cfids->cfid_list_lock); diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 3cc686246908..7b7c8c38fdd0 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -22,8 +22,8 @@ #include <linux/highmem.h> #include <linux/fips.h> #include <linux/iov_iter.h> -#include "../common/arc4.h" #include <crypto/aead.h> +#include <crypto/arc4.h> static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, void *priv, void *priv2) @@ -725,9 +725,9 @@ calc_seckey(struct cifs_ses *ses) return -ENOMEM; } - cifs_arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); - cifs_arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key, - CIFS_CPHTXT_SIZE); + arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); + arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key, + CIFS_CPHTXT_SIZE); /* make secondary_key/nonce as session key */ memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index dcb39d1b5958..1775c2b7528f 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -121,6 +121,46 @@ unsigned int dir_cache_timeout = 30; module_param(dir_cache_timeout, uint, 0644); MODULE_PARM_DESC(dir_cache_timeout, "Number of seconds to cache directory contents for which we have a lease. Default: 30 " "Range: 1 to 65000 seconds, 0 to disable caching dir contents"); +/* Module-wide total cached dirents (in bytes) across all tcons */ +atomic64_t cifs_dircache_bytes_used = ATOMIC64_INIT(0); + +/* + * Write-only module parameter to drop all cached directory entries across + * all CIFS mounts. Echo a non-zero value to trigger. 
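The drop_dir_cache parameter defined just below is an instance of the write-only module-parameter idiom: module_param_call() with a set handler, no get handler, and mode 0200 so the sysfs node cannot be read. A minimal standalone version under hypothetical names:

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kstrtox.h>

static void demo_drop_caches(void)
{
        /* ... walk module state and invalidate cached entries ... */
}

/* set() only; NULL get() plus mode 0200 makes the parameter write-only */
static int demo_set_drop(const char *val, const struct kernel_param *kp)
{
        bool bv;
        int rc = kstrtobool(val, &bv);

        if (rc)
                return rc;
        if (bv)
                demo_drop_caches();
        return 0;
}

module_param_call(demo_drop, demo_set_drop, NULL, NULL, 0200);
MODULE_PARM_DESC(demo_drop, "Write 1 to drop the demo cache");

/* usage: echo 1 > /sys/module/<module>/parameters/demo_drop */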
+ */ +static void cifs_drop_all_dir_caches(void) +{ + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) + invalidate_all_cached_dirs(tcon); + } + } + spin_unlock(&cifs_tcp_ses_lock); +} + +static int cifs_param_set_drop_dir_cache(const char *val, const struct kernel_param *kp) +{ + bool bv; + int rc = kstrtobool(val, &bv); + + if (rc) + return rc; + if (bv) + cifs_drop_all_dir_caches(); + return 0; +} + +module_param_call(drop_dir_cache, cifs_param_set_drop_dir_cache, NULL, NULL, 0200); +MODULE_PARM_DESC(drop_dir_cache, "Write 1 to drop all cached directory entries across all CIFS mounts"); + #ifdef CONFIG_CIFS_STATS2 unsigned int slow_rsp_threshold = 1; module_param(slow_rsp_threshold, uint, 0644); diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 5223edf6d11a..fc67a6441c96 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -322,13 +322,14 @@ retry_open: list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) { if (parent_cfid->dentry == direntry->d_parent) { cifs_dbg(FYI, "found a parent cached file handle\n"); - if (parent_cfid->has_lease && parent_cfid->time) { + if (is_valid_cached_dir(parent_cfid)) { lease_flags |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; memcpy(fid->parent_lease_key, parent_cfid->fid.lease_key, SMB2_LEASE_KEY_SIZE); parent_cfid->dirents.is_valid = false; + parent_cfid->dirents.is_failed = true; } break; } @@ -484,8 +485,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, * in network traffic in the other paths. */ if (!(oflags & O_CREAT)) { - struct dentry *res; - /* * Check for hashed negative dentry. We have already revalidated * the dentry and it is fine. No need to perform another lookup. @@ -493,11 +492,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, if (!d_in_lookup(direntry)) return -ENOENT; - res = cifs_lookup(inode, direntry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - return finish_no_open(file, res); + return finish_no_open(file, cifs_lookup(inode, direntry, 0)); } xid = get_xid(); @@ -683,6 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, const char *full_path; void *page; int retry_count = 0; + struct cached_fid *cfid = NULL; xid = get_xid(); @@ -722,6 +718,28 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, cifs_dbg(FYI, "non-NULL inode in lookup\n"); } else { cifs_dbg(FYI, "NULL inode in lookup\n"); + + /* + * We can only rely on negative dentries having the same + * spelling as the cached dirent if case insensitivity is + * forced on mount. + * + * XXX: if servers correctly announce Case Sensitivity Search + * on GetInfo of FileFSAttributeInformation, then we can take + * correct action even if case insensitive is not forced on + * mount. 
+ */ + if (pTcon->nocase && !open_cached_dir_by_dentry(pTcon, direntry->d_parent, &cfid)) { + /* + * dentry is negative and parent is fully cached: + * we can assume file does not exist + */ + if (cfid->dirents.is_valid) { + close_cached_dir(cfid); + goto out; + } + close_cached_dir(cfid); + } } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, d_inode(direntry)); @@ -755,6 +773,8 @@ again: } newInode = ERR_PTR(rc); } + +out: free_dentry_path(page); cifs_put_tlink(tlink); free_xid(xid); @@ -765,7 +785,8 @@ static int cifs_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *direntry, unsigned int flags) { - struct inode *inode; + struct inode *inode = NULL; + struct cached_fid *cfid; int rc; if (flags & LOOKUP_RCU) @@ -812,6 +833,21 @@ cifs_d_revalidate(struct inode *dir, const struct qstr *name, return 1; } + } else { + struct cifs_sb_info *cifs_sb = CIFS_SB(dir->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + if (!open_cached_dir_by_dentry(tcon, direntry->d_parent, &cfid)) { + /* + * dentry is negative and parent is fully cached: + * we can assume file does not exist + */ + if (cfid->dirents.is_valid) { + close_cached_dir(cfid); + return 1; + } + close_cached_dir(cfid); + } } /* diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 072383899e81..e60927b2a7c8 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -773,16 +773,14 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, } - len = 0; value = strchr(key, '='); if (value) { if (value == key) continue; *value++ = 0; - len = strlen(value); } - ret = vfs_parse_fs_string(fc, key, value, len); + ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } @@ -1820,6 +1818,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } + /* + * Multichannel is not meaningful if max_channels is 1. + * Force multichannel to false to ensure consistent configuration. 
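Normalizing interdependent options once, after the whole option string has been parsed, avoids order-dependence between them: whichever of multichannel and max_channels appears last on the command line, the final state is consistent. Schematically, with a hypothetical context structure:

struct demo_mount_ctx {
        bool multichannel;
        unsigned int max_channels;
};

static int demo_validate_options(struct demo_mount_ctx *ctx)
{
        /* a single channel means multichannel is meaningless */
        if (ctx->multichannel && ctx->max_channels == 1)
                ctx->multichannel = false;
        return 0;
}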
+ */ + if (ctx->multichannel && ctx->max_channels == 1) + ctx->multichannel = false; + return 0; cifs_parse_mount_err: diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 7e9784080501..8bb544be401e 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2704,7 +2704,7 @@ cifs_dentry_needs_reval(struct dentry *dentry) return true; if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { - if (cfid->time && cifs_i->time > cfid->time) { + if (cifs_i->time > cfid->time) { close_cached_dir(cfid); return false; } diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 4e5460206397..f0ce26622a14 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -874,39 +874,42 @@ static void finished_cached_dirents_count(struct cached_dirents *cde, cde->is_valid = 1; } -static void add_cached_dirent(struct cached_dirents *cde, - struct dir_context *ctx, - const char *name, int namelen, - struct cifs_fattr *fattr, - struct file *file) +static bool add_cached_dirent(struct cached_dirents *cde, + struct dir_context *ctx, const char *name, + int namelen, struct cifs_fattr *fattr, + struct file *file) { struct cached_dirent *de; if (cde->file != file) - return; + return false; if (cde->is_valid || cde->is_failed) - return; + return false; if (ctx->pos != cde->pos) { cde->is_failed = 1; - return; + return false; } de = kzalloc(sizeof(*de), GFP_ATOMIC); if (de == NULL) { cde->is_failed = 1; - return; + return false; } de->namelen = namelen; de->name = kstrndup(name, namelen, GFP_ATOMIC); if (de->name == NULL) { kfree(de); cde->is_failed = 1; - return; + return false; } de->pos = ctx->pos; memcpy(&de->fattr, fattr, sizeof(struct cifs_fattr)); list_add_tail(&de->entry, &cde->entries); + /* update accounting */ + cde->entries_count++; + cde->bytes_used += sizeof(*de) + (size_t)namelen + 1; + return true; } static bool cifs_dir_emit(struct dir_context *ctx, @@ -915,7 +918,8 @@ static bool cifs_dir_emit(struct dir_context *ctx, struct cached_fid *cfid, struct file *file) { - bool rc; + size_t delta_bytes = 0; + bool rc, added = false; ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); rc = dir_emit(ctx, name, namelen, ino, fattr->cf_dtype); @@ -923,10 +927,20 @@ static bool cifs_dir_emit(struct dir_context *ctx, return rc; if (cfid) { + /* Cost of this entry */ + delta_bytes = sizeof(struct cached_dirent) + (size_t)namelen + 1; + mutex_lock(&cfid->dirents.de_mutex); - add_cached_dirent(&cfid->dirents, ctx, name, namelen, - fattr, file); + added = add_cached_dirent(&cfid->dirents, ctx, name, namelen, + fattr, file); mutex_unlock(&cfid->dirents.de_mutex); + + if (added) { + /* per-tcon then global for consistency with free path */ + atomic64_add((long long)delta_bytes, &cfid->cfids->total_dirents_bytes); + atomic_long_inc(&cfid->cfids->total_dirents_entries); + atomic64_add((long long)delta_bytes, &cifs_dircache_bytes_used); + } } return rc; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 4711a23c5b38..058050f744c0 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -954,11 +954,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid); if (!rc) { - if (cfid->has_lease) { - close_cached_dir(cfid); - return 0; - } close_cached_dir(cfid); + return 0; } utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); @@ -4219,7 +4216,7 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, static void 
*smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, struct aead_request **req, struct sg_table *sgt, - unsigned int *num_sgs, size_t *sensitive_size) + unsigned int *num_sgs) { unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); unsigned int iv_size = crypto_aead_ivsize(tfm); @@ -4236,9 +4233,8 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); len += array_size(*num_sgs, sizeof(struct scatterlist)); - *sensitive_size = len; - p = kvzalloc(len, GFP_NOFS); + p = kzalloc(len, GFP_NOFS); if (!p) return ERR_PTR(-ENOMEM); @@ -4252,16 +4248,14 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl, - size_t *sensitive_size) + struct aead_request **req, struct scatterlist **sgl) { struct sg_table sgtable = {}; unsigned int skip, num_sgs, i, j; ssize_t rc; void *p; - p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, - &num_sgs, sensitive_size); + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, &num_sgs); if (IS_ERR(p)) return ERR_CAST(p); @@ -4350,7 +4344,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, DECLARE_CRYPTO_WAIT(wait); unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; - size_t sensitive_size; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4376,8 +4369,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg, - &sensitive_size); + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); if (IS_ERR(creq)) return PTR_ERR(creq); @@ -4407,7 +4399,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kvfree_sensitive(creq, sensitive_size); + kfree_sensitive(creq); return rc; } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 1c63d2c9cc9c..42e2d4ea344d 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -240,8 +240,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, */ if (smb2_command != SMB2_TREE_DISCONNECT) { spin_unlock(&tcon->tc_lock); - cifs_dbg(FYI, "can not send cmd %d while umounting\n", - smb2_command); + cifs_tcon_dbg(FYI, "can not send cmd %d while umounting\n", + smb2_command); return -ENODEV; } } @@ -296,9 +296,9 @@ again: return 0; } spin_unlock(&ses->chan_lock); - cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d", - tcon->ses->chans_need_reconnect, - tcon->need_reconnect); + cifs_tcon_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d\n", + tcon->ses->chans_need_reconnect, + tcon->need_reconnect); mutex_lock(&ses->session_mutex); /* @@ -392,11 +392,11 @@ skip_sess_setup: rc = cifs_tree_connect(0, tcon); - cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); + cifs_tcon_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) { /* If sess reconnected but tcon didn't, something strange ... 
*/ mutex_unlock(&ses->session_mutex); - cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc); + cifs_tcon_dbg(VFS, "reconnect tcon failed rc = %d\n", rc); goto out; } @@ -442,8 +442,8 @@ skip_sess_setup: from_reconnect); goto skip_add_channels; } else if (rc) - cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", - __func__, rc); + cifs_tcon_dbg(FYI, "%s: failed to query server interfaces: %d\n", + __func__, rc); if (ses->chan_max > ses->chan_count && ses->iface_count && diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index a61ba7f3fb86..051cd9dbba13 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -22,6 +22,7 @@ #include <linux/mempool.h> #include <linux/sched/signal.h> #include <linux/task_io_accounting_ops.h> +#include <linux/task_work.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -173,9 +174,16 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, * send a packet. In most cases if we fail to send * after the retries we will kill the socket and * reconnect which may clear the network problem. + * + * Even if regular signals are masked, EINTR might be + * propagated from sk_stream_wait_memory() to here when + * TIF_NOTIFY_SIGNAL is used for task work. For example, + * certain io_uring completions will use that. Treat + * having EINTR with pending task work the same as EAGAIN + * to avoid unnecessary reconnects. */ rc = sock_sendmsg(ssocket, smb_msg); - if (rc == -EAGAIN) { + if (rc == -EAGAIN || unlikely(rc == -EINTR && task_work_pending(current))) { retries++; if (retries >= 14 || (!server->noblocksnd && (retries > 2))) { @@ -323,8 +331,7 @@ int __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, break; total_len += sent; } - -} + } unmask: sigprocmask(SIG_SETMASK, &oldmask, NULL); diff --git a/fs/smb/common/Makefile b/fs/smb/common/Makefile index c66dbbc1469c..9e0730a385fb 100644 --- a/fs/smb/common/Makefile +++ b/fs/smb/common/Makefile @@ -3,5 +3,4 @@ # Makefile for Linux filesystem routines that are shared by client and server. 
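With CRYPTO_LIB_ARC4 selected by both the client (above) and the server (below), the private cifs_arc4 copy deleted next becomes redundant; callers switch to the shared library's arc4_setkey()/arc4_crypt() on a heap-allocated struct arc4_ctx, freed with kfree_sensitive() so the key schedule is zeroed. The usage pattern, reduced to a sketch:

#include <crypto/arc4.h>
#include <linux/slab.h>

static int demo_rc4_once(const u8 *key, unsigned int keylen,
                         u8 *out, const u8 *in, unsigned int len)
{
        struct arc4_ctx *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;
        arc4_setkey(ctx, key, keylen);
        arc4_crypt(ctx, out, in, len);
        kfree_sensitive(ctx);   /* zeroes the cipher state before freeing */
        return 0;
}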
# -obj-$(CONFIG_SMBFS) += cifs_arc4.o obj-$(CONFIG_SMBFS) += cifs_md4.o diff --git a/fs/smb/common/arc4.h b/fs/smb/common/arc4.h deleted file mode 100644 index 12e71ec033a1..000000000000 --- a/fs/smb/common/arc4.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Common values for ARC4 Cipher Algorithm - */ - -#ifndef _CRYPTO_ARC4_H -#define _CRYPTO_ARC4_H - -#include <linux/types.h> - -#define ARC4_MIN_KEY_SIZE 1 -#define ARC4_MAX_KEY_SIZE 256 -#define ARC4_BLOCK_SIZE 1 - -struct arc4_ctx { - u32 S[256]; - u32 x, y; -}; - -int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len); -void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len); - -#endif /* _CRYPTO_ARC4_H */ diff --git a/fs/smb/common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c deleted file mode 100644 index df360ca47826..000000000000 --- a/fs/smb/common/cifs_arc4.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API - * - * ARC4 Cipher Algorithm - * - * Jon Oberheide <jon@oberheide.org> - */ - -#include <linux/module.h> -#include "arc4.h" - -MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); -MODULE_LICENSE("GPL"); - -int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) -{ - int i, j = 0, k = 0; - - ctx->x = 1; - ctx->y = 0; - - for (i = 0; i < 256; i++) - ctx->S[i] = i; - - for (i = 0; i < 256; i++) { - u32 a = ctx->S[i]; - - j = (j + in_key[k] + a) & 0xff; - ctx->S[i] = ctx->S[j]; - ctx->S[j] = a; - if (++k >= key_len) - k = 0; - } - - return 0; -} -EXPORT_SYMBOL_GPL(cifs_arc4_setkey); - -void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len) -{ - u32 *const S = ctx->S; - u32 x, y, a, b; - u32 ty, ta, tb; - - if (len == 0) - return; - - x = ctx->x; - y = ctx->y; - - a = S[x]; - y = (y + a) & 0xff; - b = S[y]; - - do { - S[y] = a; - a = (a + b) & 0xff; - S[x] = b; - x = (x + 1) & 0xff; - ta = S[x]; - ty = (y + ta) & 0xff; - tb = S[ty]; - *out++ = *in++ ^ S[a]; - if (--len == 0) - break; - y = ty; - a = ta; - b = tb; - } while (true); - - ctx->x = x; - ctx->y = y; -} -EXPORT_SYMBOL_GPL(cifs_arc4_crypt); diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index 4a23a5e7e8fe..098cac98d31e 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -10,6 +10,7 @@ config SMB_SERVER select CRYPTO_MD5 select CRYPTO_HMAC select CRYPTO_ECB + select CRYPTO_LIB_ARC4 select CRYPTO_LIB_DES select CRYPTO_LIB_SHA256 select CRYPTO_SHA256 diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index d99871c21451..b4020bb55a26 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -20,6 +20,7 @@ #include "glob.h" #include <linux/fips.h> +#include <crypto/arc4.h> #include <crypto/des.h> #include "server.h" @@ -29,7 +30,6 @@ #include "mgmt/user_config.h" #include "crypto_ctx.h" #include "transport_ipc.h" -#include "../common/arc4.h" /* * Fixed format data defining GSS header and fixed string @@ -365,10 +365,9 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, if (!ctx_arc4) return -ENOMEM; - cifs_arc4_setkey(ctx_arc4, sess->sess_key, - SMB2_NTLMV2_SESSKEY_SIZE); - cifs_arc4_crypt(ctx_arc4, sess->sess_key, - (char *)authblob + sess_key_off, sess_key_len); + arc4_setkey(ctx_arc4, sess->sess_key, SMB2_NTLMV2_SESSKEY_SIZE); + arc4_crypt(ctx_arc4, sess->sess_key, + (char *)authblob + sess_key_off, sess_key_len); kfree_sensitive(ctx_arc4); } diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 
91a934411134..b6b4f1286b9c 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -19,7 +19,7 @@ static DEFINE_MUTEX(init_lock); static struct ksmbd_conn_ops default_conn_ops; -LIST_HEAD(conn_list); +DEFINE_HASHTABLE(conn_list, CONN_HASH_BITS); DECLARE_RWSEM(conn_list_lock); /** @@ -33,7 +33,7 @@ DECLARE_RWSEM(conn_list_lock); void ksmbd_conn_free(struct ksmbd_conn *conn) { down_write(&conn_list_lock); - list_del(&conn->conns_list); + hash_del(&conn->hlist); up_write(&conn_list_lock); xa_destroy(&conn->sessions); @@ -77,7 +77,6 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) init_waitqueue_head(&conn->req_running_q); init_waitqueue_head(&conn->r_count_q); - INIT_LIST_HEAD(&conn->conns_list); INIT_LIST_HEAD(&conn->requests); INIT_LIST_HEAD(&conn->async_requests); spin_lock_init(&conn->request_lock); @@ -90,19 +89,17 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) init_rwsem(&conn->session_lock); - down_write(&conn_list_lock); - list_add(&conn->conns_list, &conn_list); - up_write(&conn_list_lock); return conn; } bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c) { struct ksmbd_conn *t; + int bkt; bool ret = false; down_read(&conn_list_lock); - list_for_each_entry(t, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, t, hlist) { if (memcmp(t->ClientGUID, c->ClientGUID, SMB2_CLIENT_GUID_SIZE)) continue; @@ -163,9 +160,10 @@ void ksmbd_conn_unlock(struct ksmbd_conn *conn) void ksmbd_all_conn_set_status(u64 sess_id, u32 status) { struct ksmbd_conn *conn; + int bkt; down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, conn, hlist) { if (conn->binding || xa_load(&conn->sessions, sess_id)) WRITE_ONCE(conn->status, status); } @@ -181,14 +179,14 @@ int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id) { struct ksmbd_conn *conn; int rc, retry_count = 0, max_timeout = 120; - int rcount = 1; + int rcount = 1, bkt; retry_idle: if (retry_count >= max_timeout) return -EIO; down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, conn, hlist) { if (conn->binding || xa_load(&conn->sessions, sess_id)) { if (conn == curr_conn) rcount = 2; @@ -480,10 +478,11 @@ static void stop_sessions(void) { struct ksmbd_conn *conn; struct ksmbd_transport *t; + int bkt; again: down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, conn, hlist) { t = conn->transport; ksmbd_conn_set_exiting(conn); if (t->ops->shutdown) { @@ -494,7 +493,7 @@ again: } up_read(&conn_list_lock); - if (!list_empty(&conn_list)) { + if (!hash_empty(conn_list)) { msleep(100); goto again; } diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 07b43634262a..7f9bcd9817b5 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -54,11 +54,12 @@ struct ksmbd_conn { u8 inet6_addr[16]; #endif }; + unsigned int inet_hash; char *request_buf; struct ksmbd_transport *transport; struct nls_table *local_nls; struct unicode_map *um; - struct list_head conns_list; + struct hlist_node hlist; struct rw_semaphore session_lock; /* smb session 1 per user */ struct xarray sessions; @@ -153,7 +154,8 @@ struct ksmbd_transport { #define KSMBD_TCP_SEND_TIMEOUT (5 * HZ) #define KSMBD_TCP_PEER_SOCKADDR(c) ((struct sockaddr *)&((c)->peer_addr)) -extern struct list_head conn_list; +#define CONN_HASH_BITS 12 +extern DECLARE_HASHTABLE(conn_list, CONN_HASH_BITS); extern struct rw_semaphore conn_list_lock; bool 
ksmbd_conn_alive(struct ksmbd_conn *conn); diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 3f07a612c05b..8ccd57fd904b 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -112,10 +112,11 @@ struct ksmbd_startup_request { __u32 smbd_max_io_size; /* smbd read write size */ __u32 max_connections; /* Number of maximum simultaneous connections */ __s8 bind_interfaces_only; - __s8 reserved[503]; /* Reserved room */ + __u32 max_ip_connections; /* Number of maximum connection per ip address */ + __s8 reserved[499]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; -}; +} __packed; #define KSMBD_STARTUP_CONFIG_INTERFACES(s) ((s)->____payload) diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index d3d5f99bdd34..c9b1108d6e96 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -19,7 +19,7 @@ #include "../transport_ipc.h" #include "../misc.h" -#define SHARE_HASH_BITS 3 +#define SHARE_HASH_BITS 12 static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS); static DECLARE_RWSEM(shares_table_lock); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 9dec4c2940bc..6fa025374f2f 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -18,7 +18,7 @@ static DEFINE_IDA(session_ida); -#define SESSION_HASH_BITS 3 +#define SESSION_HASH_BITS 12 static DEFINE_HASHTABLE(sessions_table, SESSION_HASH_BITS); static DECLARE_RWSEM(sessions_table_lock); @@ -104,29 +104,32 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) if (!entry) return -ENOMEM; - down_read(&sess->rpc_lock); entry->method = method; entry->id = id = ksmbd_ipc_id_alloc(); if (id < 0) goto free_entry; + + down_write(&sess->rpc_lock); old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP); - if (xa_is_err(old)) + if (xa_is_err(old)) { + up_write(&sess->rpc_lock); goto free_id; + } resp = ksmbd_rpc_open(sess, id); - if (!resp) - goto erase_xa; + if (!resp) { + xa_erase(&sess->rpc_handle_list, entry->id); + up_write(&sess->rpc_lock); + goto free_id; + } - up_read(&sess->rpc_lock); + up_write(&sess->rpc_lock); kvfree(resp); return id; -erase_xa: - xa_erase(&sess->rpc_handle_list, entry->id); free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); - up_read(&sess->rpc_lock); return -EINVAL; } @@ -144,9 +147,14 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; + int method; + down_read(&sess->rpc_lock); entry = xa_load(&sess->rpc_handle_list, id); - return entry ? entry->method : 0; + method = entry ? 
entry->method : 0; + up_read(&sess->rpc_lock); + + return method; } void ksmbd_session_destroy(struct ksmbd_session *sess) diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 995555febe7d..b8a7317be86b 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -43,6 +43,7 @@ struct ksmbd_server_config { unsigned int auth_mechs; unsigned int max_connections; unsigned int max_inflight_req; + unsigned int max_ip_connections; char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 0c069eff80b7..ab1d45fcebde 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5629,7 +5629,8 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, if (!work->tcon->posix_extensions) { pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); - rc = -EOPNOTSUPP; + path_put(&path); + return -EOPNOTSUPP; } else { info = (struct filesystem_posix_info *)(rsp->Buffer); info->OptimalTransferSize = cpu_to_le32(stfs.f_bsize); @@ -7361,7 +7362,7 @@ int smb2_lock(struct ksmbd_work *work) int nolock = 0; LIST_HEAD(lock_list); LIST_HEAD(rollback_list); - int prior_lock = 0; + int prior_lock = 0, bkt; WORK_BUFFERS(work, req, rsp); @@ -7471,7 +7472,7 @@ int smb2_lock(struct ksmbd_work *work) nolock = 1; /* check locks in connection list */ down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, conn, hlist) { spin_lock(&conn->llist_lock); list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) { if (file_inode(cmp_lock->fl->c.flc_file) != diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 2a3e2b0ce557..2aa1b29bea08 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -335,6 +335,9 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) if (req->max_connections) server_conf.max_connections = req->max_connections; + if (req->max_ip_connections) + server_conf.max_ip_connections = req->max_ip_connections; + ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); ret |= ksmbd_set_work_group(req->work_group); diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 9e644a0daf1c..b3077766d6ec 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -425,6 +425,11 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) conn = ksmbd_conn_alloc(); if (!conn) goto err; + + down_write(&conn_list_lock); + hash_add(conn_list, &conn->hlist, 0); + up_write(&conn_list_lock); + conn->transport = KSMBD_TRANS(t); KSMBD_TRANS(t)->conn = conn; KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops; diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 4337df97987d..7a1e3dcc2cde 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -86,13 +86,21 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) } #if IS_ENABLED(CONFIG_IPV6) - if (client_sk->sk->sk_family == AF_INET6) + if (client_sk->sk->sk_family == AF_INET6) { memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16); - else + conn->inet_hash = ipv6_addr_hash(&client_sk->sk->sk_v6_daddr); + } else { conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr; + conn->inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr); + } #else conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr; + conn->inet_hash = 
ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr); #endif + down_write(&conn_list_lock); + hash_add(conn_list, &conn->hlist, conn->inet_hash); + up_write(&conn_list_lock); + conn->transport = KSMBD_TRANS(t); KSMBD_TRANS(t)->conn = conn; KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops; @@ -170,17 +178,6 @@ static struct kvec *get_conn_iovec(struct tcp_transport *t, unsigned int nr_segs return new_iov; } -static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa) -{ - switch (sa->sa_family) { - case AF_INET: - return ntohs(((struct sockaddr_in *)sa)->sin_port); - case AF_INET6: - return ntohs(((struct sockaddr_in6 *)sa)->sin6_port); - } - return 0; -} - /** * ksmbd_tcp_new_connection() - create a new tcp session on mount * @client_sk: socket associated with new connection @@ -192,7 +189,6 @@ static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa) */ static int ksmbd_tcp_new_connection(struct socket *client_sk) { - struct sockaddr *csin; int rc = 0; struct tcp_transport *t; struct task_struct *handler; @@ -203,27 +199,26 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) return -ENOMEM; } - csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn); - if (kernel_getpeername(client_sk, csin) < 0) { - pr_err("client ip resolution failed\n"); - rc = -EINVAL; - goto out_error; - } - +#if IS_ENABLED(CONFIG_IPV6) + if (client_sk->sk->sk_family == AF_INET6) + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:%pI6c", + &KSMBD_TRANS(t)->conn->inet6_addr); + else + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:%pI4", + &KSMBD_TRANS(t)->conn->inet_addr); +#else handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, - "ksmbd:%u", - ksmbd_tcp_get_port(csin)); + KSMBD_TRANS(t)->conn, "ksmbd:%pI4", + &KSMBD_TRANS(t)->conn->inet_addr); +#endif if (IS_ERR(handler)) { pr_err("cannot start conn thread\n"); rc = PTR_ERR(handler); free_transport(t); } return rc; - -out_error: - free_transport(t); - return rc; } /** @@ -237,7 +232,8 @@ static int ksmbd_kthread_fn(void *p) struct socket *client_sk = NULL; struct interface *iface = (struct interface *)p; struct ksmbd_conn *conn; - int ret; + int ret, inet_hash; + unsigned int max_ip_conns; while (!kthread_should_stop()) { mutex_lock(&iface->sock_release_lock); @@ -255,34 +251,49 @@ static int ksmbd_kthread_fn(void *p) continue; } + if (!server_conf.max_ip_connections) + goto skip_max_ip_conns_limit; + /* * Limits repeated connections from clients with the same IP. 
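Switching conn_list from a plain list to a 12-bit hashtable (the connection.c/connection.h hunks earlier) pays off in this limit check: the per-IP scan walks only the bucket the client address hashes to, not every live connection. The core linux/hashtable.h idiom, reduced to IPv4 with demo names:

#include <linux/hashtable.h>

struct demo_conn {
        u32 addr;                       /* IPv4 address */
        struct hlist_node hlist;
};

#define DEMO_HASH_BITS 12
static DEFINE_HASHTABLE(demo_conns, DEMO_HASH_BITS);

static void demo_track(struct demo_conn *c)
{
        hash_add(demo_conns, &c->hlist, c->addr);
}

/* Buckets can collide, so entries in the bucket must still be compared. */
static unsigned int demo_count_addr(u32 addr)
{
        struct demo_conn *c;
        unsigned int n = 0;

        hash_for_each_possible(demo_conns, c, hlist, addr)
                if (c->addr == addr)
                        n++;
        return n;
}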
*/ +#if IS_ENABLED(CONFIG_IPV6) + if (client_sk->sk->sk_family == AF_INET6) + inet_hash = ipv6_addr_hash(&client_sk->sk->sk_v6_daddr); + else + inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr); +#else + inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr); +#endif + + max_ip_conns = 0; down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) + hash_for_each_possible(conn_list, conn, hlist, inet_hash) { #if IS_ENABLED(CONFIG_IPV6) if (client_sk->sk->sk_family == AF_INET6) { if (memcmp(&client_sk->sk->sk_v6_daddr, - &conn->inet6_addr, 16) == 0) { - ret = -EAGAIN; - break; - } + &conn->inet6_addr, 16) == 0) + max_ip_conns++; } else if (inet_sk(client_sk->sk)->inet_daddr == - conn->inet_addr) { - ret = -EAGAIN; - break; - } + conn->inet_addr) + max_ip_conns++; #else if (inet_sk(client_sk->sk)->inet_daddr == - conn->inet_addr) { + conn->inet_addr) + max_ip_conns++; +#endif + if (server_conf.max_ip_connections <= max_ip_conns) { + pr_info_ratelimited("Maximum IP connections exceeded (%u/%u)\n", + max_ip_conns, server_conf.max_ip_connections); ret = -EAGAIN; break; } -#endif + } up_read(&conn_list_lock); if (ret == -EAGAIN) continue; +skip_max_ip_conns_limit: if (server_conf.max_connections && atomic_inc_return(&active_num_conn) >= server_conf.max_connections) { pr_info_ratelimited("Limit the maximum number of connections(%u)\n", @@ -468,12 +479,13 @@ static int create_socket(struct interface *iface) struct socket *ksmbd_socket; bool ipv4 = false; - ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); + ret = sock_create_kern(current->nsproxy->net_ns, PF_INET6, SOCK_STREAM, + IPPROTO_TCP, &ksmbd_socket); if (ret) { if (ret != -EAFNOSUPPORT) pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret); - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, - &ksmbd_socket); + ret = sock_create_kern(current->nsproxy->net_ns, PF_INET, + SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); if (ret) { pr_err("Can't create socket for ipv4: %d\n", ret); goto out_clear; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 1cfa688904b2..891ed2dc2b73 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -20,6 +20,7 @@ #include <linux/sched/xacct.h> #include <linux/crc32c.h> #include <linux/namei.h> +#include <linux/splice.h> #include "glob.h" #include "oplock.h" @@ -72,7 +73,7 @@ static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf, { struct qstr last; struct filename *filename __free(putname) = NULL; - struct path *root_share_path = &share_conf->vfs_path; + const struct path *root_share_path = &share_conf->vfs_path; int err, type; struct dentry *d; @@ -1305,7 +1306,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath, caseless, true); } -void ksmbd_vfs_kern_path_unlock(struct path *path) +void ksmbd_vfs_kern_path_unlock(const struct path *path) { /* While lock is still held, ->d_parent is safe */ inode_unlock(d_inode(path->dentry->d_parent)); @@ -1829,8 +1830,19 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, if (src_off + len > src_file_size) return -E2BIG; - ret = vfs_copy_file_range(src_fp->filp, src_off, - dst_fp->filp, dst_off, len, 0); + /* + * vfs_copy_file_range does not allow overlapped copying + * within the same file. 
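The overlap test added below is the standard half-open interval check: [src_off, src_off+len) and [dst_off, dst_off+len) intersect exactly when each range starts before the other ends. As a self-contained predicate:

#include <linux/types.h>

static inline bool demo_ranges_overlap(loff_t src_off, loff_t dst_off,
                                       loff_t len)
{
        return dst_off + len > src_off && dst_off < src_off + len;
}

/* e.g. src_off=0, dst_off=4096, len=8192 overlaps, so the copy must take
 * the bounce path (do_splice_direct() below) instead of
 * vfs_copy_file_range(), which does not allow same-file overlap. */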
+ */ + if (file_inode(src_fp->filp) == file_inode(dst_fp->filp) && + dst_off + len > src_off && + dst_off < src_off + len) + ret = do_splice_direct(src_fp->filp, &src_off, + dst_fp->filp, &dst_off, + min_t(size_t, len, MAX_RW_COUNT), 0); + else + ret = vfs_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, len, 0); if (ret == -EOPNOTSUPP || ret == -EXDEV) ret = vfs_copy_file_range(src_fp->filp, src_off, dst_fp->filp, dst_off, len, @@ -1855,7 +1867,7 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct path *path) + const struct path *path) { struct posix_acl_state acl_state; struct posix_acl *acls; @@ -1908,7 +1920,7 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, } int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct path *path, struct inode *parent_inode) + const struct path *path, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index d47472f3e30b..df6421b4590b 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -123,7 +123,7 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); -void ksmbd_vfs_kern_path_unlock(struct path *path); +void ksmbd_vfs_kern_path_unlock(const struct path *path); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, @@ -164,8 +164,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct path *path); + const struct path *path); int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct path *path, + const struct path *path, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index ce7d661d5ad8..1582e0637a7e 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -307,7 +307,8 @@ static int fill_meta_index(struct inode *inode, int index, all_done: *index_block = cur_index_block; *index_offset = cur_offset; - *data_block = cur_data_block; + if (data_block) + *data_block = cur_data_block; /* * Scale cache index (cache slot entry) to index @@ -324,17 +325,15 @@ failed: * Get the on-disk location and compressed size of the datablock * specified by index. Fill_meta_index() does most of the work. */ -static int read_blocklist(struct inode *inode, int index, u64 *block) +static int read_blocklist_ptrs(struct inode *inode, int index, u64 *start, + int *offset, u64 *block) { - u64 start; long long blks; - int offset; __le32 size; - int res = fill_meta_index(inode, index, &start, &offset, block); + int res = fill_meta_index(inode, index, start, offset, block); - TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" - " 0x%x, block 0x%llx\n", res, index, start, offset, - *block); + TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset 0x%x, block 0x%llx\n", + res, index, *start, *offset, block ? *block : 0); if (res < 0) return res; @@ -346,22 +345,31 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) * extra block indexes needed. 
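The squashfs hunks below add SEEK_HOLE/SEEK_DATA support by scanning the on-disk block list for zero-length (hole) versus non-empty blocks. From userspace the feature is exercised purely through lseek(); a small portable tester using only the standard POSIX/Linux API, nothing squashfs-specific:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Print a file's data extents via SEEK_DATA/SEEK_HOLE, the lseek()
 * interface that squashfs_llseek() below implements. */
int main(int argc, char **argv)
{
        int fd;
        off_t pos = 0, end;

        if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                return 1;
        end = lseek(fd, 0, SEEK_END);
        while (pos < end) {
                off_t data = lseek(fd, pos, SEEK_DATA);
                off_t hole;

                if (data < 0)
                        break;          /* ENXIO: only a trailing hole left */
                hole = lseek(fd, data, SEEK_HOLE);
                printf("data: [%lld, %lld)\n",
                       (long long)data, (long long)hole);
                pos = hole;
        }
        close(fd);
        return 0;
}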
*/ if (res < index) { - blks = read_indexes(inode->i_sb, index - res, &start, &offset); + blks = read_indexes(inode->i_sb, index - res, start, offset); if (blks < 0) return (int) blks; - *block += blks; + if (block) + *block += blks; } /* * Read length of block specified by index. */ - res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, + res = squashfs_read_metadata(inode->i_sb, &size, start, offset, sizeof(size)); if (res < 0) return res; return squashfs_block_size(size); } +static inline int read_blocklist(struct inode *inode, int index, u64 *block) +{ + u64 start; + int offset; + + return read_blocklist_ptrs(inode, index, &start, &offset, block); +} + static bool squashfs_fill_page(struct folio *folio, struct squashfs_cache_entry *buffer, size_t offset, size_t avail) @@ -658,7 +666,114 @@ skip_pages: kfree(pages); } +static loff_t seek_hole_data(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start, index = offset >> msblk->block_log; + u64 file_end = (i_size_read(inode) + msblk->block_size - 1) >> msblk->block_log; + int s_offset, length; + __le32 *blist = NULL; + + /* reject offset if negative or beyond file end */ + if ((unsigned long long)offset >= i_size_read(inode)) + return -ENXIO; + + /* is offset within tailend and is tailend packed into a fragment? */ + if (index + 1 == file_end && + squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { + if (whence == SEEK_DATA) + return offset; + + /* there is an implicit hole at the end of any file */ + return i_size_read(inode); + } + + length = read_blocklist_ptrs(inode, index, &start, &s_offset, NULL); + if (length < 0) + return length; + + /* nothing more to do if offset matches desired whence value */ + if ((length == 0 && whence == SEEK_HOLE) || + (length && whence == SEEK_DATA)) + return offset; + + /* skip scanning forwards if we're at file end */ + if (++ index == file_end) + goto not_found; + + blist = kmalloc(SQUASHFS_SCAN_INDEXES << 2, GFP_KERNEL); + if (blist == NULL) { + ERROR("%s: Failed to allocate block_list\n", __func__); + return -ENOMEM; + } + + while (index < file_end) { + int i, indexes = min(file_end - index, SQUASHFS_SCAN_INDEXES); + + offset = squashfs_read_metadata(sb, blist, &start, &s_offset, indexes << 2); + if (offset < 0) + goto finished; + + for (i = 0; i < indexes; i++) { + length = squashfs_block_size(blist[i]); + if (length < 0) { + offset = length; + goto finished; + } + + /* does this block match desired whence value? 
*/ + if ((length == 0 && whence == SEEK_HOLE) || + (length && whence == SEEK_DATA)) { + offset = (index + i) << msblk->block_log; + goto finished; + } + } + + index += indexes; + } + +not_found: + /* whence value determines what happens */ + if (whence == SEEK_DATA) + offset = -ENXIO; + else + /* there is an implicit hole at the end of any file */ + offset = i_size_read(inode); + +finished: + kfree(blist); + return offset; +} + +static loff_t squashfs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + + switch (whence) { + default: + return generic_file_llseek(file, offset, whence); + case SEEK_DATA: + case SEEK_HOLE: + offset = seek_hole_data(file, offset, whence); + break; + } + + if (offset < 0) + return offset; + + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); +} + const struct address_space_operations squashfs_aops = { .read_folio = squashfs_read_folio, .readahead = squashfs_readahead }; + +const struct file_operations squashfs_file_operations = { + .llseek = squashfs_llseek, + .read_iter = generic_file_read_iter, + .mmap_prepare = generic_file_readonly_mmap_prepare, + .splice_read = filemap_splice_read +}; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index d5918eba27e3..cceae3b78698 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -68,6 +68,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; + /* File type must not be set at this moment, for it will later be set by the caller. */ + if (inode->i_mode & S_IFMT) + err = -EIO; + return err; } @@ -140,8 +144,17 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; + inode->i_size = le32_to_cpu(sqsh_ino->file_size); frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { + /* + * the file cannot have a fragment (tailend) and have a + * file size a multiple of the block size + */ + if ((inode->i_size & (msblk->block_size - 1)) == 0) { + err = -EINVAL; + goto failed_read; + } frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { @@ -155,8 +168,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) } set_nlink(inode, 1); - inode->i_size = le32_to_cpu(sqsh_ino->file_size); - inode->i_fop = &generic_ro_fops; + inode->i_fop = &squashfs_file_operations; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; squashfs_i(inode)->fragment_block = frag_blk; @@ -165,6 +177,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; + squashfs_i(inode)->parent = 0; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " @@ -183,8 +196,21 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; + inode->i_size = le64_to_cpu(sqsh_ino->file_size); + if (inode->i_size < 0) { + err = -EINVAL; + goto failed_read; + } frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { + /* + * the file cannot have a fragment (tailend) and have a + * file size a multiple of the block size + */ + if ((inode->i_size & (msblk->block_size - 1)) == 0) { + err = -EINVAL; + goto failed_read; + } frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if 
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 218868b20f16..4851bd964502 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -107,6 +107,7 @@ extern const struct address_space_operations squashfs_aops;

 /* inode.c */
 extern const struct inode_operations squashfs_inode_ops;
+extern const struct file_operations squashfs_file_operations;

 /* namei.c */
 extern const struct inode_operations squashfs_dir_inode_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 95f8e8901768..a955d9369749 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -208,6 +208,7 @@ static inline int squashfs_block_size(__le32 raw)
 #define SQUASHFS_META_INDEXES	(SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
 #define SQUASHFS_META_ENTRIES	127
 #define SQUASHFS_META_SLOTS	8
+#define SQUASHFS_SCAN_INDEXES	1024

 struct meta_entry {
 	u64			data_block;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index 2c82d6f2a456..8e497ac07b9a 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -16,6 +16,7 @@ struct squashfs_inode_info {
 	u64		xattr;
 	unsigned int	xattr_size;
 	int		xattr_count;
+	int		parent;
 	union {
 		struct {
 			u64	fragment_block;
@@ -27,7 +28,6 @@ struct squashfs_inode_info {
 			u64	dir_idx_start;
 			int	dir_idx_offset;
 			int	dir_idx_cnt;
-			int	parent;
 		};
 	};
 	struct inode	vfs_inode;
diff --git a/fs/stat.c b/fs/stat.c
index f95c1dc3eaa4..6c79661e1b96 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -293,7 +293,7 @@ static int statx_lookup_flags(int flags)
 	return lookup_flags;
 }

-static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
+static int vfs_statx_path(const struct path *path, int flags, struct kstat *stat,
 			  u32 request_mask)
 {
 	int error = vfs_getattr(path, stat, request_mask, flags);
diff --git a/fs/super.c b/fs/super.c
index f4fa0e93c463..5bab94fb7e03 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	if (!s)
 		return NULL;

-	INIT_LIST_HEAD(&s->s_mounts);
 	s->s_user_ns = get_user_ns(user_ns);
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -408,7 +407,7 @@ static void __put_super(struct super_block *s)
 		list_del_init(&s->s_list);
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
-		WARN_ON(!list_empty(&s->s_mounts));
+		WARN_ON(s->s_mounts);
 		call_rcu(&s->rcu, destroy_super_rcu);
 	}
 }
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f24aa98e6869..a79d73f28aa7 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -2272,6 +2272,9 @@ int udf_current_aext(struct inode *inode, struct extent_position *epos,
 		if (check_add_overflow(sizeof(struct allocExtDesc),
 				le32_to_cpu(header->lengthAllocDescs), &alen))
 			return -1;
+
+		if (alen > epos->bh->b_size)
+			return -1;
 	}

 	switch (iinfo->i_alloc_type) {
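[Editor's note] The udf change pairs the existing check_add_overflow() guard with an upper bound against the buffer actually read. A plain-C sketch of the overflow half of the pattern; the values are made up for illustration, and check_add_overflow() in the kernel wraps the same compiler builtin:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t desc_size = 24;		/* stand-in for sizeof(struct allocExtDesc) */
	uint32_t len_allocs = UINT32_MAX;	/* untrusted on-disk length */
	uint32_t alen;

	/* reject the descriptor if the addition would wrap */
	if (__builtin_add_overflow(desc_size, len_allocs, &alen)) {
		puts("overflow: reject extent descriptor");
		return 1;
	}
	printf("alen = %u\n", alen);
	return 0;
}

Even when the addition does not wrap, alen can still exceed the buffer backing the extent block; that is what the new comparison against epos->bh->b_size rejects.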
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 770e29ec3557..42bedc4ec7af 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -315,46 +315,39 @@ static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
 {
 	struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
 	struct vboxsf_handle *sf_handle;
-	struct dentry *res = NULL;
 	u64 handle;
 	int err;

 	if (d_in_lookup(dentry)) {
-		res = vboxsf_dir_lookup(parent, dentry, 0);
-		if (IS_ERR(res))
-			return PTR_ERR(res);
-
-		if (res)
-			dentry = res;
+		struct dentry *res = vboxsf_dir_lookup(parent, dentry, 0);
+		if (res || d_really_is_positive(dentry))
+			return finish_no_open(file, res);
 	}

 	/* Only creates */
-	if (!(flags & O_CREAT) || d_really_is_positive(dentry))
-		return finish_no_open(file, res);
+	if (!(flags & O_CREAT))
+		return finish_no_open(file, NULL);

 	err = vboxsf_dir_create(parent, dentry, mode, false, flags & O_EXCL, &handle);
 	if (err)
-		goto out;
+		return err;

 	sf_handle = vboxsf_create_sf_handle(d_inode(dentry), handle,
 					    SHFL_CF_ACCESS_READWRITE);
 	if (IS_ERR(sf_handle)) {
 		vboxsf_close(sbi->root, handle);
-		err = PTR_ERR(sf_handle);
-		goto out;
+		return PTR_ERR(sf_handle);
 	}

 	err = finish_open(file, dentry, generic_file_open);
 	if (err) {
 		/* This also closes the handle passed to vboxsf_create_sf_handle() */
 		vboxsf_release_sf_handle(d_inode(dentry), sf_handle);
-		goto out;
+		return err;
 	}

 	file->private_data = sf_handle;
 	file->f_mode |= FMODE_CREATED;
-out:
-	dput(res);
-	return err;
+	return 0;
 }

 static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry)