diff options
Diffstat (limited to 'fs/fuse/inode.c')
| -rw-r--r-- | fs/fuse/inode.c | 1754 |
1 files changed, 1315 insertions, 439 deletions
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 76baaa6be393..819e50d66622 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,7 +7,10 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" +#include "dev_uring_i.h" +#include <linux/dax.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/file.h> @@ -15,13 +18,15 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/statfs.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/pid_namespace.h> +#include <uapi/linux/magic.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -30,10 +35,16 @@ MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); -unsigned max_user_bgreq; +unsigned int fuse_max_pages_limit = 256; +/* default is no timeout */ +unsigned int fuse_default_req_timeout; +unsigned int fuse_max_req_timeout; + +unsigned int max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); __MODULE_PARM_TYPE(max_user_bgreq, "uint"); @@ -41,7 +52,7 @@ MODULE_PARM_DESC(max_user_bgreq, "Global limit for the maximum number of backgrounded requests an " "unprivileged user can set"); -unsigned max_user_congthresh; +unsigned int max_user_congthresh; module_param_call(max_user_congthresh, set_global_limit, param_get_uint, &max_user_congthresh, 0644); __MODULE_PARM_TYPE(max_user_congthresh, "uint"); @@ -49,8 +60,6 @@ MODULE_PARM_DESC(max_user_congthresh, "Global limit for the maximum congestion threshold an " "unprivileged user can set"); -#define FUSE_SUPER_MAGIC 0x65735546 - #define FUSE_DEFAULT_BLKSIZE 512 /** Maximum number of outstanding background requests */ @@ -59,87 +68,142 @@ MODULE_PARM_DESC(max_user_congthresh, /** Congestion starts at 75% of maximum */ #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) -struct fuse_mount_data { - int fd; - unsigned rootmode; - kuid_t user_id; - kgid_t group_id; - unsigned fd_present:1; - unsigned rootmode_present:1; - unsigned user_id_present:1; - unsigned group_id_present:1; - unsigned default_permissions:1; - unsigned allow_other:1; - unsigned max_read; - unsigned blksize; -}; +#ifdef CONFIG_BLOCK +static struct file_system_type fuseblk_fs_type; +#endif struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); +} + +static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void) +{ + struct fuse_submount_lookup *sl; + + sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT); + if (!sl) + return NULL; + sl->forget = fuse_alloc_forget(); + if (!sl->forget) + goto out_free; + + return sl; + +out_free: + kfree(sl); + return NULL; } static struct inode *fuse_alloc_inode(struct super_block *sb) { - struct inode *inode; struct fuse_inode *fi; - inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); - if (!inode) + fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL); + if (!fi) return NULL; - fi = get_fuse_inode(inode); - fi->i_time = 0; - fi->inval_mask = 0; - fi->nodeid = 0; - fi->nlookup = 0; - fi->attr_version = 0; - fi->orig_ino = 0; - fi->state = 0; + /* Initialize private data (i.e. everything except fi->inode) */ + BUILD_BUG_ON(offsetof(struct fuse_inode, inode) != 0); + memset((void *) fi + sizeof(fi->inode), 0, sizeof(*fi) - sizeof(fi->inode)); + + fi->inval_mask = ~0; mutex_init(&fi->mutex); + spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); - if (!fi->forget) { - kmem_cache_free(fuse_inode_cachep, inode); - return NULL; - } + if (!fi->forget) + goto out_free; - return inode; -} + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) + goto out_free_forget; -static void fuse_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(fuse_inode_cachep, inode); + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_inode_backing_set(fi, NULL); + + return &fi->inode; + +out_free_forget: + kfree(fi->forget); +out_free: + kmem_cache_free(fuse_inode_cachep, fi); + return NULL; } -static void fuse_destroy_inode(struct inode *inode) +static void fuse_free_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); - if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { - WARN_ON(!list_empty(&fi->write_files)); - WARN_ON(!list_empty(&fi->queued_writes)); - } + mutex_destroy(&fi->mutex); kfree(fi->forget); - call_rcu(&inode->i_rcu, fuse_i_callback); +#ifdef CONFIG_FUSE_DAX + kfree(fi->dax); +#endif + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_put(fuse_inode_backing(fi)); + + kmem_cache_free(fuse_inode_cachep, fi); +} + +static void fuse_cleanup_submount_lookup(struct fuse_conn *fc, + struct fuse_submount_lookup *sl) +{ + if (!refcount_dec_and_test(&sl->count)) + return; + + fuse_queue_forget(fc, sl->forget, sl->nodeid, 1); + sl->forget = NULL; + kfree(sl); } static void fuse_evict_inode(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + + /* Will write inode on close/munmap and in all other dirtiers */ + WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE); + + if (FUSE_IS_DAX(inode)) + dax_break_layout_final(inode); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); - fi->forget = NULL; + + if (FUSE_IS_DAX(inode)) + fuse_dax_inode_cleanup(inode); + if (fi->nlookup) { + fuse_queue_forget(fc, fi->forget, fi->nodeid, + fi->nlookup); + fi->forget = NULL; + } + + if (fi->submount_lookup) { + fuse_cleanup_submount_lookup(fc, fi->submount_lookup); + fi->submount_lookup = NULL; + } + /* + * Evict of non-deleted inode may race with outstanding + * LOOKUP/READDIRPLUS requests and result in inconsistency when + * the request finishes. Deal with that here by bumping a + * counter that can be compared to the starting value. + */ + if (inode->i_nlink > 0) + atomic64_inc(&fc->evict_ctr); + } + if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { + WARN_ON(fi->iocachectr != 0); + WARN_ON(!list_empty(&fi->write_files)); + WARN_ON(!list_empty(&fi->queued_writes)); } } -static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) +static int fuse_reconfigure(struct fs_context *fsc) { + struct super_block *sb = fsc->root->d_sb; + sync_filesystem(sb); - if (*flags & SB_MANDLOCK) + if (fsc->sb_flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -158,14 +222,31 @@ static ino_t fuse_squash_ino(u64 ino64) } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid) + struct fuse_statx *sx, + u64 attr_valid, u32 cache_mask, + u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - fi->attr_version = ++fc->attr_version; + lockdep_assert_held(&fi->lock); + + /* + * Clear basic stats from invalid mask. + * + * Don't do this if this is coming from a fuse_iget() call and there + * might have been a racing evict which would've invalidated the result + * if the attr_version would've been preserved. + * + * !evict_ctr -> this is create + * fi->attr_version != 0 -> this is not a new inode + * evict_ctr == fuse_get_evict_ctr() -> no evicts while during request + */ + if (!evict_ctr || fi->attr_version || evict_ctr == fuse_get_evict_ctr(fc)) + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); + + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - WRITE_ONCE(fi->inval_mask, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -173,20 +254,44 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_uid = make_kuid(fc->user_ns, attr->uid); inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; - inode->i_atime.tv_sec = attr->atime; - inode->i_atime.tv_nsec = attr->atimensec; + + /* Sanitize nsecs */ + attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1); + attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); + attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); + + inode_set_atime(inode, attr->atime, attr->atimensec); /* mtime from server may be stale due to local buffered write */ - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + if (!(cache_mask & STATX_MTIME)) { + inode_set_mtime(inode, attr->mtime, attr->mtimensec); } + if (!(cache_mask & STATX_CTIME)) { + inode_set_ctime(inode, attr->ctime, attr->ctimensec); + } + if (sx) { + /* Sanitize nsecs */ + sx->btime.tv_nsec = + min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); - if (attr->blksize != 0) - inode->i_blkbits = ilog2(attr->blksize); + /* + * Btime has been queried, cache is valid (whether or not btime + * is available or not) so clear STATX_BTIME from inval_mask. + * + * Availability of the btime attribute is indicated in + * FUSE_I_BTIME + */ + set_mask_bits(&fi->inval_mask, STATX_BTIME, 0); + if (sx->mask & STATX_BTIME) { + set_bit(FUSE_I_BTIME, &fi->state); + fi->i_btime.tv_sec = sx->btime.tv_sec; + fi->i_btime.tv_nsec = sx->btime.tv_nsec; + } + } + + if (attr->blksize) + fi->cached_i_blkbits = ilog2(attr->blksize); else - inode->i_blkbits = inode->i_sb->s_blocksize_bits; + fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits; /* * Don't set the sticky bit in i_mode, unless we want the VFS @@ -198,26 +303,66 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_mode &= ~S_ISVTX; fi->orig_ino = attr->ino; + + /* + * We are refreshing inode data and it is possible that another + * client set suid/sgid or security.capability xattr. So clear + * S_NOSEC. Ideally, we could have cleared it only if suid/sgid + * was set or if security.capability xattr was set. But we don't + * know if security.capability has been set or not. So clear it + * anyway. Its less efficient but should be safe. + */ + inode->i_flags &= ~S_NOSEC; } -void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) +u32 fuse_get_cache_mask(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + return 0; + + return STATX_MTIME | STATX_CTIME | STATX_SIZE; +} + +static void fuse_change_attributes_i(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version, u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - bool is_wb = fc->writeback_cache; + u32 cache_mask; loff_t oldsize; struct timespec64 old_mtime; - spin_lock(&fc->lock); + spin_lock(&fi->lock); + /* + * In case of writeback_cache enabled, writes update mtime, ctime and + * may update i_size. In these cases trust the cached value in the + * inode. + */ + cache_mask = fuse_get_cache_mask(inode); + if (cache_mask & STATX_SIZE) + attr->size = i_size_read(inode); + + if (cache_mask & STATX_MTIME) { + attr->mtime = inode_get_mtime_sec(inode); + attr->mtimensec = inode_get_mtime_nsec(inode); + } + if (cache_mask & STATX_CTIME) { + attr->ctime = inode_get_ctime_sec(inode); + attr->ctimensec = inode_get_ctime_nsec(inode); + } + if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return; } - old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, attr_valid); + old_mtime = inode_get_mtime(inode); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask, + evict_ctr); oldsize = inode->i_size; /* @@ -225,16 +370,17 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * extend local i_size without keeping userspace server in sync. So, * attr->size coming from server can be stale. We cannot trust it. */ - if (!is_wb || !S_ISREG(inode->i_mode)) + if (!(cache_mask & STATX_SIZE)) i_size_write(inode, attr->size); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); - if (!is_wb && S_ISREG(inode->i_mode)) { + if (!cache_mask && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { truncate_pagecache(inode, attr->size); - inval = true; + if (!fc->explicit_inval_data) + inval = true; } else if (fc->auto_inval_data) { struct timespec64 new_mtime = { .tv_sec = attr->mtime, @@ -252,19 +398,35 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if (inval) invalidate_inode_pages2(inode->i_mapping); } + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_dontcache(inode, attr->flags); } -static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version) +{ + fuse_change_attributes_i(inode, attr, sx, attr_valid, attr_version, 0); +} + +static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, + u64 nodeid) +{ + sl->nodeid = nodeid; + refcount_set(&sl->count, 1); +} + +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, + struct fuse_conn *fc) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + inode_set_mtime(inode, attr->mtime, attr->mtimensec); + inode_set_ctime(inode, attr->ctime, attr->ctimensec); if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); - fuse_init_file_inode(inode); + fuse_init_file_inode(inode, attr->flags); } else if (S_ISDIR(inode->i_mode)) fuse_init_dir(inode); else if (S_ISLNK(inode->i_mode)) @@ -276,9 +438,15 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) new_decode_dev(attr->rdev)); } else BUG(); + /* + * Ensure that we don't cache acls for daemons without FUSE_POSIX_ACL + * so they see the exact same behavior as before. + */ + if (!fc->posix_acl) + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; } -int fuse_inode_eq(struct inode *inode, void *_nodeidp) +static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) @@ -296,51 +464,112 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp) struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) + u64 attr_valid, u64 attr_version, + u64 evict_ctr) { struct inode *inode; struct fuse_inode *fi; struct fuse_conn *fc = get_fuse_conn_super(sb); - retry: + /* + * Auto mount points get their node id from the submount root, which is + * not a unique identifier within this filesystem. + * + * To avoid conflicts, do not place submount points into the inode hash + * table. + */ + if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && + S_ISDIR(attr->mode)) { + struct fuse_inode *fi; + + inode = new_inode(sb); + if (!inode) + return NULL; + + fuse_init_inode(inode, attr, fc); + fi = get_fuse_inode(inode); + fi->nodeid = nodeid; + fi->submount_lookup = fuse_alloc_submount_lookup(); + if (!fi->submount_lookup) { + iput(inode); + return NULL; + } + /* Sets nlookup = 1 on fi->submount_lookup->nlookup */ + fuse_init_submount_lookup(fi->submount_lookup, nodeid); + inode->i_flags |= S_AUTOMOUNT; + goto done; + } + +retry: inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); if (!inode) return NULL; - if ((inode->i_state & I_NEW)) { + if ((inode_state_read_once(inode) & I_NEW)) { inode->i_flags |= S_NOATIME; if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - fuse_init_inode(inode, attr); + fuse_init_inode(inode, attr, fc); unlock_new_inode(inode); - } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { - /* Inode has changed type, any I/O on the old should fail */ - make_bad_inode(inode); - iput(inode); - goto retry; + } else if (fuse_stale_inode(inode, generation, attr)) { + /* nodeid was reused, any I/O on the old inode should fail */ + fuse_make_bad(inode); + if (inode != d_inode(sb->s_root)) { + remove_inode_hash(inode); + iput(inode); + goto retry; + } } - fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); - fuse_change_attributes(inode, attr, attr_valid, attr_version); - + spin_unlock(&fi->lock); +done: + fuse_change_attributes_i(inode, attr, NULL, attr_valid, attr_version, + evict_ctr); return inode; } -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm) +{ + struct fuse_mount *fm_iter; + struct inode *inode; + + WARN_ON(!rwsem_is_locked(&fc->killsb)); + list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { + if (!fm_iter->sb) + continue; + + inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + if (fm) + *fm = fm_iter; + return inode; + } + } + + return NULL; +} + +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { + struct fuse_inode *fi; struct inode *inode; pgoff_t pg_start; pgoff_t pg_end; - inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) return -ENOENT; + fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); + fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { @@ -356,6 +585,17 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, return 0; } +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid) +{ + struct inode *inode; + + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + return; + d_prune_aliases(inode); + iput(inode); +} + bool fuse_lock_inode(struct inode *inode) { bool locked = false; @@ -376,32 +616,28 @@ void fuse_unlock_inode(struct inode *inode, bool locked) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb), false); -} + struct fuse_conn *fc = get_fuse_conn_super(sb); -static void fuse_send_destroy(struct fuse_conn *fc) -{ - struct fuse_req *req = fc->destroy_req; - if (req && fc->conn_init) { - fc->destroy_req = NULL; - req->in.h.opcode = FUSE_DESTROY; - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(fc, req); - fuse_put_request(fc, req); - } + if (fc->no_force_umount) + return; + + fuse_abort_conn(fc); + + // Only retire block-device-based superblocks. + if (sb->s_bdev != NULL) + retire_super(sb); } -static void fuse_put_super(struct super_block *sb) +static void fuse_send_destroy(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + if (fm->fc->conn_init) { + FUSE_ARGS(args); - mutex_lock(&fuse_mutex); - list_del(&fc->entry); - fuse_ctl_remove_conn(fc); - mutex_unlock(&fuse_mutex); - - fuse_conn_put(fc); + args.opcode = FUSE_DESTROY; + args.force = true; + args.nocreds = true; + fuse_simple_request(fm, &args); + } } static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) @@ -421,30 +657,124 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; - if (!fuse_allow_current_process(fc)) { + if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } memset(&outarg, 0, sizeof(outarg)); - args.in.numargs = 0; - args.in.h.opcode = FUSE_STATFS; - args.in.h.nodeid = get_node_id(d_inode(dentry)); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + args.in_numargs = 0; + args.opcode = FUSE_STATFS; + args.nodeid = get_node_id(d_inode(dentry)); + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); return err; } +static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) +{ + struct fuse_sync_bucket *bucket; + + bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); + if (bucket) { + init_waitqueue_head(&bucket->waitq); + /* Initial active count */ + atomic_set(&bucket->count, 1); + } + return bucket; +} + +static void fuse_sync_fs_writes(struct fuse_conn *fc) +{ + struct fuse_sync_bucket *bucket, *new_bucket; + int count; + + new_bucket = fuse_sync_bucket_alloc(); + spin_lock(&fc->lock); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + count = atomic_read(&bucket->count); + WARN_ON(count < 1); + /* No outstanding writes? */ + if (count == 1) { + spin_unlock(&fc->lock); + kfree(new_bucket); + return; + } + + /* + * Completion of new bucket depends on completion of this bucket, so add + * one more count. + */ + atomic_inc(&new_bucket->count); + rcu_assign_pointer(fc->curr_bucket, new_bucket); + spin_unlock(&fc->lock); + /* + * Drop initial active count. At this point if all writes in this and + * ancestor buckets complete, the count will go to zero and this task + * will be woken up. + */ + atomic_dec(&bucket->count); + + wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); + + /* Drop temp count on descendant bucket */ + fuse_sync_bucket_dec(new_bucket); + kfree_rcu(bucket, rcu); +} + +static int fuse_sync_fs(struct super_block *sb, int wait) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct fuse_syncfs_in inarg; + FUSE_ARGS(args); + int err; + + /* + * Userspace cannot handle the wait == 0 case. Avoid a + * gratuitous roundtrip. + */ + if (!wait) + return 0; + + /* The filesystem is being unmounted. Nothing to do. */ + if (!sb->s_root) + return 0; + + if (!fc->sync_fs) + return 0; + + fuse_sync_fs_writes(fc); + + memset(&inarg, 0, sizeof(inarg)); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.opcode = FUSE_SYNCFS; + args.nodeid = get_node_id(sb->s_root->d_inode); + args.out_numargs = 0; + + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->sync_fs = 0; + err = 0; + } + + return err; +} + enum { + OPT_SOURCE, + OPT_SUBTYPE, OPT_FD, OPT_ROOTMODE, OPT_USER_ID, @@ -456,111 +786,127 @@ enum { OPT_ERR }; -static const match_table_t tokens = { - {OPT_FD, "fd=%u"}, - {OPT_ROOTMODE, "rootmode=%o"}, - {OPT_USER_ID, "user_id=%u"}, - {OPT_GROUP_ID, "group_id=%u"}, - {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_ALLOW_OTHER, "allow_other"}, - {OPT_MAX_READ, "max_read=%u"}, - {OPT_BLKSIZE, "blksize=%u"}, - {OPT_ERR, NULL} +static const struct fs_parameter_spec fuse_fs_parameters[] = { + fsparam_string ("source", OPT_SOURCE), + fsparam_u32 ("fd", OPT_FD), + fsparam_u32oct ("rootmode", OPT_ROOTMODE), + fsparam_uid ("user_id", OPT_USER_ID), + fsparam_gid ("group_id", OPT_GROUP_ID), + fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), + fsparam_flag ("allow_other", OPT_ALLOW_OTHER), + fsparam_u32 ("max_read", OPT_MAX_READ), + fsparam_u32 ("blksize", OPT_BLKSIZE), + fsparam_string ("subtype", OPT_SUBTYPE), + {} }; -static int fuse_match_uint(substring_t *s, unsigned int *res) +static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) { - int err = -ENOMEM; - char *buf = match_strdup(s); - if (buf) { - err = kstrtouint(buf, 10, res); - kfree(buf); + struct fs_parse_result result; + struct fuse_fs_context *ctx = fsc->fs_private; + int opt; + kuid_t kuid; + kgid_t kgid; + + if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + /* + * Ignore options coming from mount(MS_REMOUNT) for backward + * compatibility. + */ + if (fsc->oldapi) + return 0; + + return invalfc(fsc, "No changes allowed in reconfigure"); } - return err; -} -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, - struct user_namespace *user_ns) -{ - char *p; - memset(d, 0, sizeof(struct fuse_mount_data)); - d->max_read = ~0; - d->blksize = FUSE_DEFAULT_BLKSIZE; + opt = fs_parse(fsc, fuse_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case OPT_SOURCE: + if (fsc->source) + return invalfc(fsc, "Multiple sources specified"); + fsc->source = param->string; + param->string = NULL; + break; + + case OPT_SUBTYPE: + if (ctx->subtype) + return invalfc(fsc, "Multiple subtypes specified"); + ctx->subtype = param->string; + param->string = NULL; + return 0; - while ((p = strsep(&opt, ",")) != NULL) { - int token; - int value; - unsigned uv; - substring_t args[MAX_OPT_ARGS]; - if (!*p) - continue; + case OPT_FD: + ctx->fd = result.uint_32; + ctx->fd_present = true; + break; - token = match_token(p, tokens, args); - switch (token) { - case OPT_FD: - if (match_int(&args[0], &value)) - return 0; - d->fd = value; - d->fd_present = 1; - break; - - case OPT_ROOTMODE: - if (match_octal(&args[0], &value)) - return 0; - if (!fuse_valid_type(value)) - return 0; - d->rootmode = value; - d->rootmode_present = 1; - break; - - case OPT_USER_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->user_id = make_kuid(user_ns, uv); - if (!uid_valid(d->user_id)) - return 0; - d->user_id_present = 1; - break; - - case OPT_GROUP_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->group_id = make_kgid(user_ns, uv); - if (!gid_valid(d->group_id)) - return 0; - d->group_id_present = 1; - break; - - case OPT_DEFAULT_PERMISSIONS: - d->default_permissions = 1; - break; - - case OPT_ALLOW_OTHER: - d->allow_other = 1; - break; - - case OPT_MAX_READ: - if (match_int(&args[0], &value)) - return 0; - d->max_read = value; - break; - - case OPT_BLKSIZE: - if (!is_bdev || match_int(&args[0], &value)) - return 0; - d->blksize = value; - break; - - default: - return 0; - } + case OPT_ROOTMODE: + if (!fuse_valid_type(result.uint_32)) + return invalfc(fsc, "Invalid rootmode"); + ctx->rootmode = result.uint_32; + ctx->rootmode_present = true; + break; + + case OPT_USER_ID: + kuid = result.uid; + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fsc->user_ns, kuid)) + return invalfc(fsc, "Invalid user_id"); + ctx->user_id = kuid; + ctx->user_id_present = true; + break; + + case OPT_GROUP_ID: + kgid = result.gid; + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fsc->user_ns, kgid)) + return invalfc(fsc, "Invalid group_id"); + ctx->group_id = kgid; + ctx->group_id_present = true; + break; + + case OPT_DEFAULT_PERMISSIONS: + ctx->default_permissions = true; + break; + + case OPT_ALLOW_OTHER: + ctx->allow_other = true; + break; + + case OPT_MAX_READ: + ctx->max_read = result.uint_32; + break; + + case OPT_BLKSIZE: + if (!ctx->is_bdev) + return invalfc(fsc, "blksize only supported for fuseblk"); + ctx->blksize = result.uint_32; + break; + + default: + return -EINVAL; } - if (!d->fd_present || !d->rootmode_present || - !d->user_id_present || !d->group_id_present) - return 0; + return 0; +} - return 1; +static void fuse_free_fsc(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + + if (ctx) { + kfree(ctx->subtype); + kfree(ctx); + } } static int fuse_show_options(struct seq_file *m, struct dentry *root) @@ -568,30 +914,48 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); - if (fc->default_permissions) - seq_puts(m, ",default_permissions"); - if (fc->allow_other) - seq_puts(m, ",allow_other"); - if (fc->max_read != ~0) - seq_printf(m, ",max_read=%u", fc->max_read); - if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) - seq_printf(m, ",blksize=%lu", sb->s_blocksize); + if (fc->legacy_opts_show) { + seq_printf(m, ",user_id=%u", + from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", + from_kgid_munged(fc->user_ns, fc->group_id)); + if (fc->default_permissions) + seq_puts(m, ",default_permissions"); + if (fc->allow_other) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); + } +#ifdef CONFIG_FUSE_DAX + if (fc->dax_mode == FUSE_DAX_ALWAYS) + seq_puts(m, ",dax=always"); + else if (fc->dax_mode == FUSE_DAX_NEVER) + seq_puts(m, ",dax=never"); + else if (fc->dax_mode == FUSE_DAX_INODE_USER) + seq_puts(m, ",dax=inode"); +#endif + return 0; } -static void fuse_iqueue_init(struct fuse_iqueue *fiq) +static void fuse_iqueue_init(struct fuse_iqueue *fiq, + const struct fuse_iqueue_ops *ops, + void *priv) { memset(fiq, 0, sizeof(struct fuse_iqueue)); + spin_lock_init(&fiq->lock); init_waitqueue_head(&fiq->waitq); INIT_LIST_HEAD(&fiq->pending); INIT_LIST_HEAD(&fiq->interrupts); fiq->forget_list_tail = &fiq->forget_list_head; fiq->connected = 1; + fiq->ops = ops; + fiq->priv = priv; } -static void fuse_pqueue_init(struct fuse_pqueue *fpq) +void fuse_pqueue_init(struct fuse_pqueue *fpq) { unsigned int i; @@ -602,7 +966,9 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); @@ -610,36 +976,74 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); + atomic_set(&fc->epoch, 1); + INIT_WORK(&fc->epoch_work, fuse_epoch_work); init_waitqueue_head(&fc->blocked_waitq); - init_waitqueue_head(&fc->reserved_req_waitq); - fuse_iqueue_init(&fc->iq); + fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); INIT_LIST_HEAD(&fc->devices); atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; - fc->khctr = 0; + atomic64_set(&fc->khctr, 0); fc->polled_files = RB_ROOT; fc->blocked = 0; fc->initialized = 0; fc->connected = 1; - fc->attr_version = 1; + atomic64_set(&fc->attr_version, 1); + atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + fc->max_pages_limit = fuse_max_pages_limit; + fc->name_max = FUSE_NAME_LOW_MAX; + fc->timeout.req_timeout = 0; + + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_init(fc); + + INIT_LIST_HEAD(&fc->mounts); + list_add(&fm->fc_entry, &fc->mounts); + fm->fc = fc; } EXPORT_SYMBOL_GPL(fuse_conn_init); +static void delayed_release(struct rcu_head *p) +{ + struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + + fuse_uring_destruct(fc); + + put_user_ns(fc->user_ns); + fc->release(fc); +} + void fuse_conn_put(struct fuse_conn *fc) { - if (refcount_dec_and_test(&fc->count)) { - if (fc->destroy_req) - fuse_request_free(fc->destroy_req); - put_pid_ns(fc->pid_ns); - put_user_ns(fc->user_ns); - fc->release(fc); + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; + + if (!refcount_dec_and_test(&fc->count)) + return; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); + if (fc->timeout.req_timeout) + cancel_delayed_work_sync(&fc->timeout.work); + cancel_work_sync(&fc->epoch_work); + if (fiq->ops->release) + fiq->ops->release(fiq); + put_pid_ns(fc->pid_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); } + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_free(fc); + call_rcu(&fc->rcu, delayed_release); } EXPORT_SYMBOL_GPL(fuse_conn_put); @@ -650,7 +1054,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned int mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); @@ -658,7 +1062,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, 1, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0, 0); } struct fuse_inode_handle { @@ -742,7 +1146,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, } *max_len = len; - return parent ? 0x82 : 0x81; + return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *fuse_fh_to_dentry(struct super_block *sb, @@ -750,7 +1154,8 @@ static struct dentry *fuse_fh_to_dentry(struct super_block *sb, { struct fuse_inode_handle handle; - if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + if ((fh_type != FILEID_INO64_GEN && + fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) return NULL; handle.nodeid = (u64) fid->raw[0] << 32; @@ -764,7 +1169,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb, { struct fuse_inode_handle parent; - if (fh_type != 0x82 || fh_len < 6) + if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) return NULL; parent.nodeid = (u64) fid->raw[3] << 32; @@ -780,14 +1185,13 @@ static struct dentry *fuse_get_parent(struct dentry *child) struct inode *inode; struct dentry *parent; struct fuse_entry_out outarg; - const struct qstr name = QSTR_INIT("..", 2); int err; if (!fc->export_support) return ERR_PTR(-ESTALE); err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), - &name, &outarg, &inode); + &dotdot_name, &outarg, &inode); if (err) { if (err == -ENOENT) return ERR_PTR(-ESTALE); @@ -801,6 +1205,11 @@ static struct dentry *fuse_get_parent(struct dentry *child) return parent; } +/* only for fid encoding; no support for file handle */ +static const struct export_operations fuse_export_fid_operations = { + .encode_fh = fuse_encode_fh, +}; + static const struct export_operations fuse_export_operations = { .fh_to_dentry = fuse_fh_to_dentry, .fh_to_parent = fuse_fh_to_parent, @@ -810,22 +1219,24 @@ static const struct export_operations fuse_export_operations = { static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, - .destroy_inode = fuse_destroy_inode, + .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, - .drop_inode = generic_delete_inode, - .remount_fs = fuse_remount_fs, - .put_super = fuse_put_super, + .drop_inode = inode_just_drop, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, + .sync_fs = fuse_sync_fs, .show_options = fuse_show_options, }; -static void sanitize_global_limit(unsigned *limit) +static void sanitize_global_limit(unsigned int *limit) { + /* + * The default maximum number of async requests is calculated to consume + * 1/2^13 of the total memory, assuming 392 bytes per request. + */ if (*limit == 0) - *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / - sizeof(struct fuse_req); + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392; if (*limit >= 1 << 16) *limit = (1 << 16) - 1; @@ -839,7 +1250,7 @@ static int set_global_limit(const char *val, const struct kernel_param *kp) if (rv) return rv; - sanitize_global_limit((unsigned *)kp->arg); + sanitize_global_limit((unsigned int *)kp->arg); return 0; } @@ -871,124 +1282,298 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } -static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) { - struct fuse_init_out *arg = &req->misc.init_out; + fc->timeout.req_timeout = secs_to_jiffies(timeout); + INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); + queue_delayed_work(system_percpu_wq, &fc->timeout.work, + fuse_timeout_timer_freq); +} - if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) - fc->conn_error = 1; +static void init_server_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + if (!timeout && !fuse_max_req_timeout && !fuse_default_req_timeout) + return; + + if (!timeout) + timeout = fuse_default_req_timeout; + + if (fuse_max_req_timeout) { + if (timeout) + timeout = min(fuse_max_req_timeout, timeout); + else + timeout = fuse_max_req_timeout; + } + + timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout); + + set_request_timeout(fc, timeout); +} + +struct fuse_init_args { + struct fuse_args args; + struct fuse_init_in in; + struct fuse_init_out out; +}; + +static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, + int error) +{ + struct fuse_conn *fc = fm->fc; + struct fuse_init_args *ia = container_of(args, typeof(*ia), args); + struct fuse_init_out *arg = &ia->out; + bool ok = true; + + if (error || arg->major != FUSE_KERNEL_VERSION) + ok = false; else { unsigned long ra_pages; + unsigned int timeout = 0; process_init_limits(fc, arg); if (arg->minor >= 6) { + u64 flags = arg->flags; + + if (flags & FUSE_INIT_EXT) + flags |= (u64) arg->flags2 << 32; + ra_pages = arg->max_readahead / PAGE_SIZE; - if (arg->flags & FUSE_ASYNC_READ) + if (flags & FUSE_ASYNC_READ) fc->async_read = 1; - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_lock = 1; if (arg->minor >= 17) { - if (!(arg->flags & FUSE_FLOCK_LOCKS)) + if (!(flags & FUSE_FLOCK_LOCKS)) fc->no_flock = 1; } else { - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_flock = 1; } - if (arg->flags & FUSE_ATOMIC_O_TRUNC) + if (flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; if (arg->minor >= 9) { /* LOOKUP has dependency on proto version */ - if (arg->flags & FUSE_EXPORT_SUPPORT) + if (flags & FUSE_EXPORT_SUPPORT) fc->export_support = 1; } - if (arg->flags & FUSE_BIG_WRITES) + if (flags & FUSE_BIG_WRITES) fc->big_writes = 1; - if (arg->flags & FUSE_DONT_MASK) + if (flags & FUSE_DONT_MASK) fc->dont_mask = 1; - if (arg->flags & FUSE_AUTO_INVAL_DATA) + if (flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) { + else if (flags & FUSE_EXPLICIT_INVAL_DATA) + fc->explicit_inval_data = 1; + if (flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) + if (flags & FUSE_READDIRPLUS_AUTO) fc->readdirplus_auto = 1; } - if (arg->flags & FUSE_ASYNC_DIO) + if (flags & FUSE_ASYNC_DIO) fc->async_dio = 1; - if (arg->flags & FUSE_WRITEBACK_CACHE) + if (flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; - if (arg->flags & FUSE_PARALLEL_DIROPS) + if (flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; - if (arg->flags & FUSE_HANDLE_KILLPRIV) + if (flags & FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) - fc->sb->s_time_gran = arg->time_gran; - if ((arg->flags & FUSE_POSIX_ACL)) { + fm->sb->s_time_gran = arg->time_gran; + if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; - fc->sb->s_xattr = fuse_acl_xattr_handlers; } - if (arg->flags & FUSE_CACHE_SYMLINKS) + if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; - if (arg->flags & FUSE_ABORT_ERROR) + if (flags & FUSE_ABORT_ERROR) fc->abort_err = 1; - if (arg->flags & FUSE_MAX_PAGES) { + if (flags & FUSE_MAX_PAGES) { fc->max_pages = - min_t(unsigned int, FUSE_MAX_MAX_PAGES, + min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); + + /* + * PATH_MAX file names might need two pages for + * ops like rename + */ + if (fc->max_pages > 1) + fc->name_max = FUSE_NAME_MAX; } + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + if (flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } + if (flags & FUSE_HAS_INODE_DAX) + fc->inode_dax = 1; + } + if (flags & FUSE_HANDLE_KILLPRIV_V2) { + fc->handle_killpriv_v2 = 1; + fm->sb->s_flags |= SB_NOSEC; + } + if (flags & FUSE_SETXATTR_EXT) + fc->setxattr_ext = 1; + if (flags & FUSE_SECURITY_CTX) + fc->init_security = 1; + if (flags & FUSE_CREATE_SUPP_GROUP) + fc->create_supp_group = 1; + if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) + fc->direct_io_allow_mmap = 1; + /* + * max_stack_depth is the max stack depth of FUSE fs, + * so it has to be at least 1 to support passthrough + * to backing files. + * + * with max_stack_depth > 1, the backing files can be + * on a stacked fs (e.g. overlayfs) themselves and with + * max_stack_depth == 1, FUSE fs can be stacked as the + * underlying fs of a stacked fs (e.g. overlayfs). + * + * Also don't allow the combination of FUSE_PASSTHROUGH + * and FUSE_WRITEBACK_CACHE, current design doesn't handle + * them together. + */ + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && + (flags & FUSE_PASSTHROUGH) && + arg->max_stack_depth > 0 && + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH && + !(flags & FUSE_WRITEBACK_CACHE)) { + fc->passthrough = 1; + fc->max_stack_depth = arg->max_stack_depth; + fm->sb->s_stack_depth = arg->max_stack_depth; + } + if (flags & FUSE_NO_EXPORT_SUPPORT) + fm->sb->s_export_op = &fuse_export_fid_operations; + if (flags & FUSE_ALLOW_IDMAP) { + if (fc->default_permissions) + fm->sb->s_iflags &= ~SB_I_NOIDMAP; + else + ok = false; + } + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) + fc->io_uring = 1; + + if (flags & FUSE_REQUEST_TIMEOUT) + timeout = arg->request_timeout; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } - fc->sb->s_bdi->ra_pages = - min(fc->sb->s_bdi->ra_pages, ra_pages); + init_server_timeout(fc, timeout); + + fm->sb->s_bdi->ra_pages = + min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } + kfree(ia); + + if (!ok) { + fc->conn_init = 0; + fc->conn_error = 1; + } + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { - struct fuse_init_in *arg = &req->misc.init_in; + struct fuse_init_args *ia; + u64 flags; + + ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; - arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; + flags = + FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; - req->in.h.opcode = FUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(*arg); - req->in.args[0].value = arg; - req->out.numargs = 1; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | + FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | + FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | + FUSE_REQUEST_TIMEOUT; +#ifdef CONFIG_FUSE_DAX + if (fm->fc->dax) + flags |= FUSE_MAP_ALIGNMENT; + if (fuse_is_inode_dax_mode(fm->fc->dax_mode)) + flags |= FUSE_HAS_INODE_DAX; +#endif + if (fm->fc->auto_submounts) + flags |= FUSE_SUBMOUNTS; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + flags |= FUSE_PASSTHROUGH; + + /* + * This is just an information flag for fuse server. No need to check + * the reply - server is either sending IORING_OP_URING_CMD or not. + */ + if (fuse_uring_enabled()) + flags |= FUSE_OVER_IO_URING; + + ia->in.flags = flags; + ia->in.flags2 = flags >> 32; + + ia->args.opcode = FUSE_INIT; + ia->args.in_numargs = 1; + ia->args.in_args[0].size = sizeof(ia->in); + ia->args.in_args[0].value = &ia->in; + ia->args.out_numargs = 1; /* Variable length argument used for backward compatibility with interface version < 7.5. Rest of init_out is zeroed by do_get_request(), so a short reply is not a problem */ - req->out.argvar = 1; - req->out.args[0].size = sizeof(struct fuse_init_out); - req->out.args[0].value = &req->misc.init_out; - req->end = process_init_reply; - fuse_request_send_background(fc, req); + ia->args.out_argvar = true; + ia->args.out_args[0].size = sizeof(ia->out); + ia->args.out_args[0].value = &ia->out; + ia->args.force = true; + ia->args.nocreds = true; + + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } +EXPORT_SYMBOL_GPL(fuse_send_init); -static void fuse_free_conn(struct fuse_conn *fc) +void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); - kfree_rcu(fc, rcu); + kfree(fc); } +EXPORT_SYMBOL_GPL(fuse_free_conn); static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { @@ -1009,9 +1594,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; - /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; + sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + @@ -1030,7 +1613,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +struct fuse_dev *fuse_dev_alloc(void) { struct fuse_dev *fud; struct list_head *pq; @@ -1046,16 +1629,33 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) } fud->pq.processing = pq; - fud->fc = fuse_conn_get(fc); fuse_pqueue_init(&fud->pq); + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc); + +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) +{ + fud->fc = fuse_conn_get(fc); spin_lock(&fc->lock); list_add_tail(&fud->entry, &fc->devices); spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_dev_install); + +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = fuse_dev_alloc(); + if (!fud) + return NULL; + fuse_dev_install(fud, fc); return fud; } -EXPORT_SYMBOL_GPL(fuse_dev_alloc); +EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); void fuse_dev_free(struct fuse_dev *fud) { @@ -1073,37 +1673,34 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static int fuse_fill_super(struct super_block *sb, void *data, int silent) +static void fuse_fill_attr_from_inode(struct fuse_attr *attr, + const struct fuse_inode *fi) { - struct fuse_dev *fud; - struct fuse_conn *fc; - struct inode *root; - struct fuse_mount_data d; - struct file *file; - struct dentry *root_dentry; - struct fuse_req *init_req; - int err; - int is_bdev = sb->s_bdev != NULL; - - err = -EINVAL; - if (sb->s_flags & SB_MANDLOCK) - goto err; - - sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - - if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) - goto err; + struct timespec64 atime = inode_get_atime(&fi->inode); + struct timespec64 mtime = inode_get_mtime(&fi->inode); + struct timespec64 ctime = inode_get_ctime(&fi->inode); + + *attr = (struct fuse_attr){ + .ino = fi->inode.i_ino, + .size = fi->inode.i_size, + .blocks = fi->inode.i_blocks, + .atime = atime.tv_sec, + .mtime = mtime.tv_sec, + .ctime = ctime.tv_sec, + .atimensec = atime.tv_nsec, + .mtimensec = mtime.tv_nsec, + .ctimensec = ctime.tv_nsec, + .mode = fi->inode.i_mode, + .nlink = fi->inode.i_nlink, + .uid = __kuid_val(fi->inode.i_uid), + .gid = __kgid_val(fi->inode.i_gid), + .rdev = fi->inode.i_rdev, + .blksize = 1u << fi->inode.i_blkbits, + }; +} - if (is_bdev) { -#ifdef CONFIG_BLOCK - err = -EINVAL; - if (!sb_set_blocksize(sb, d.blksize)) - goto err; -#endif - } else { - sb->s_blocksize = PAGE_SIZE; - sb->s_blocksize_bits = PAGE_SHIFT; - } +static void fuse_sb_defaults(struct super_block *sb) +{ sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_xattr = fuse_xattr_handlers; @@ -1111,43 +1708,163 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + sb->s_iflags |= SB_I_NOIDMAP; if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); +} - file = fget(d.fd); - err = -EINVAL; - if (!file) - goto err; +static int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct super_block *parent_sb = parent_fi->inode.i_sb; + struct fuse_attr root_attr; + struct inode *root; + struct fuse_submount_lookup *sl; + struct fuse_inode *fi; + + fuse_sb_defaults(sb); + fm->sb = sb; + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi_get(parent_sb->s_bdi); + + sb->s_xattr = parent_sb->s_xattr; + sb->s_export_op = parent_sb->s_export_op; + sb->s_time_gran = parent_sb->s_time_gran; + sb->s_blocksize = parent_sb->s_blocksize; + sb->s_blocksize_bits = parent_sb->s_blocksize_bits; + sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL); + if (parent_sb->s_subtype && !sb->s_subtype) + return -ENOMEM; + + fuse_fill_attr_from_inode(&root_attr, parent_fi); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0, + fuse_get_evict_ctr(fm->fc)); /* - * Require mount to happen from the same user namespace which - * opened /dev/fuse to prevent potential attacks. + * This inode is just a duplicate, so it is not looked up and + * its nlookup should not be incremented. fuse_iget() does + * that, though, so undo it here. */ - if (file->f_op != &fuse_dev_operations || - file->f_cred->user_ns != sb->s_user_ns) - goto err_fput; + fi = get_fuse_inode(root); + fi->nlookup--; + + set_default_d_op(sb, &fuse_dentry_operations); + sb->s_root = d_make_root(root); + if (!sb->s_root) + return -ENOMEM; /* - * If we are not in the initial user namespace posix - * acls must be translated. + * Grab the parent's submount_lookup pointer and take a + * reference on the shared nlookup from the parent. This is to + * prevent the last forget for this nodeid from getting + * triggered until all users have finished with it. */ - if (sb->s_user_ns != &init_user_ns) - sb->s_xattr = fuse_no_acl_xattr_handlers; + sl = parent_fi->submount_lookup; + WARN_ON(!sl); + if (sl) { + refcount_inc(&sl->count); + fi->submount_lookup = sl; + } - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err_fput; + return 0; +} - fuse_conn_init(fc, sb->s_user_ns); - fc->release = fuse_free_conn; +/* Filesystem context private data holds the FUSE inode of the mount point */ +static int fuse_get_tree_submount(struct fs_context *fsc) +{ + struct fuse_mount *fm; + struct fuse_inode *mp_fi = fsc->fs_private; + struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode); + struct super_block *sb; + int err; - fud = fuse_dev_alloc(fc); - if (!fud) - goto err_put_conn; + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + return -ENOMEM; + + fm->fc = fuse_conn_get(fc); + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) { + deactivate_locked_super(sb); + return err; + } + + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + + return 0; +} + +static const struct fs_context_operations fuse_context_submount_ops = { + .get_tree = fuse_get_tree_submount, +}; + +int fuse_init_fs_context_submount(struct fs_context *fsc) +{ + fsc->ops = &fuse_context_submount_ops; + return 0; +} +EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount); + +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) +{ + struct fuse_dev *fud = NULL; + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct inode *root; + struct dentry *root_dentry; + int err; + + err = -EINVAL; + if (sb->s_flags & SB_MANDLOCK) + goto err; + + rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); + fuse_sb_defaults(sb); + + if (ctx->is_bdev) { +#ifdef CONFIG_BLOCK + err = -EINVAL; + if (!sb_set_blocksize(sb, ctx->blksize)) + goto err; +#endif + fc->sync_fs = 1; + } else { + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + } + + sb->s_subtype = ctx->subtype; + ctx->subtype = NULL; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev); + if (err) + goto err; + } + + if (ctx->fudptr) { + err = -ENOMEM; + fud = fuse_dev_alloc_install(fc); + if (!fud) + goto err_free_dax; + } fc->dev = sb->s_dev; - fc->sb = sb; + fm->sb = sb; err = fuse_bdi_init(fc, sb); if (err) goto err_dev_free; @@ -1157,40 +1874,31 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= SB_POSIXACL; - fc->default_permissions = d.default_permissions; - fc->allow_other = d.allow_other; - fc->user_id = d.user_id; - fc->group_id = d.group_id; - fc->max_read = max_t(unsigned, 4096, d.max_read); - fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; - - /* Used by get_root_inode() */ - sb->s_fs_info = fc; + fc->default_permissions = ctx->default_permissions; + fc->allow_other = ctx->allow_other; + fc->user_id = ctx->user_id; + fc->group_id = ctx->group_id; + fc->legacy_opts_show = ctx->legacy_opts_show; + fc->max_read = max_t(unsigned int, 4096, ctx->max_read); + fc->destroy = ctx->destroy; + fc->no_control = ctx->no_control; + fc->no_force_umount = ctx->no_force_umount; err = -ENOMEM; - root = fuse_get_root_inode(sb, d.rootmode); - sb->s_d_op = &fuse_root_dentry_operations; + root = fuse_get_root_inode(sb, ctx->rootmode); + set_default_d_op(sb, &fuse_dentry_operations); root_dentry = d_make_root(root); if (!root_dentry) goto err_dev_free; - /* Root dentry doesn't have .d_revalidate */ - sb->s_d_op = &fuse_dentry_operations; - - init_req = fuse_request_alloc(0); - if (!init_req) - goto err_put_root; - __set_bit(FR_BACKGROUND, &init_req->flags); - - if (is_bdev) { - fc->destroy_req = fuse_request_alloc(0); - if (!fc->destroy_req) - goto err_free_init_req; - } mutex_lock(&fuse_mutex); err = -EINVAL; - if (file->private_data) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1198,94 +1906,252 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - file->private_data = fud; + if (ctx->fudptr) { + *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); - /* - * atomic_dec_and_test() in fput() provides the necessary - * memory barrier for file->private_data to be visible on all - * CPUs after this - */ - fput(file); - - fuse_send_init(fc, init_req); - return 0; err_unlock: mutex_unlock(&fuse_mutex); - err_free_init_req: - fuse_request_free(init_req); - err_put_root: dput(root_dentry); err_dev_free: - fuse_dev_free(fud); - err_put_conn: - fuse_conn_put(fc); - sb->s_fs_info = NULL; - err_fput: - fput(file); + if (fud) + fuse_dev_free(fud); + err_free_dax: + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); err: return err; } +EXPORT_SYMBOL_GPL(fuse_fill_super_common); -static struct dentry *fuse_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) +static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { - return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); + struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_mount *fm; + int err; + + if (!ctx->file || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; + + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if ((ctx->file->f_op != &fuse_dev_operations) || + (ctx->file->f_cred->user_ns != sb->s_user_ns)) + return -EINVAL; + ctx->fudptr = &ctx->file->private_data; + + err = fuse_fill_super_common(sb, ctx); + if (err) + return err; + /* file->private_data shall be visible on all CPUs after this */ + smp_mb(); + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } -static void fuse_sb_destroy(struct super_block *sb) +/* + * This is the path where user supplied an already initialized fuse dev. In + * this case never create a new super if the old one is gone. + */ +static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + return -ENOTCONN; +} - if (fc) { - fuse_send_destroy(fc); +static int fuse_test_super(struct super_block *sb, struct fs_context *fsc) +{ + + return fsc->sget_key == get_fuse_conn_super(sb); +} + +static int fuse_get_tree(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_dev *fud; + struct fuse_conn *fc; + struct fuse_mount *fm; + struct super_block *sb; + int err; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) + return -ENOMEM; - fuse_abort_conn(fc, false); - fuse_wait_aborted(fc); + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL); + fc->release = fuse_free_conn; + + fsc->s_fs_info = fm; - down_write(&fc->killsb); - fc->sb = NULL; - up_write(&fc->killsb); + if (ctx->fd_present) + ctx->file = fget(ctx->fd); + + if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { + err = get_tree_bdev(fsc, fuse_fill_super); + goto out; + } + /* + * While block dev mount can be initialized with a dummy device fd + * (found by device name), normal fuse mounts can't + */ + err = -EINVAL; + if (!ctx->file) + goto out; + + /* + * Allow creating a fuse mount with an already initialized fuse + * connection + */ + fud = __fuse_get_dev(ctx->file); + if (ctx->file->f_op == &fuse_dev_operations && fud) { + fsc->sget_key = fud->fc; + sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); + err = PTR_ERR_OR_ZERO(sb); + if (!IS_ERR(sb)) + fsc->root = dget(sb->s_root); + } else { + err = get_tree_nodev(fsc, fuse_fill_super); } +out: + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (ctx->file) + fput(ctx->file); + return err; } +static const struct fs_context_operations fuse_context_ops = { + .free = fuse_free_fsc, + .parse_param = fuse_parse_param, + .reconfigure = fuse_reconfigure, + .get_tree = fuse_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int fuse_init_fs_context(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_read = ~0; + ctx->blksize = FUSE_DEFAULT_BLKSIZE; + ctx->legacy_opts_show = true; + +#ifdef CONFIG_BLOCK + if (fsc->fs_type == &fuseblk_fs_type) { + ctx->is_bdev = true; + ctx->destroy = true; + } +#endif + + fsc->fs_private = ctx; + fsc->ops = &fuse_context_ops; + return 0; +} + +bool fuse_mount_remove(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + bool last = false; + + down_write(&fc->killsb); + list_del_init(&fm->fc_entry); + if (list_empty(&fc->mounts)) + last = true; + up_write(&fc->killsb); + + return last; +} +EXPORT_SYMBOL_GPL(fuse_mount_remove); + +void fuse_conn_destroy(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + + if (fc->destroy) + fuse_send_destroy(fm); + + fuse_abort_conn(fc); + fuse_wait_aborted(fc); + + if (!list_empty(&fc->entry)) { + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); + } +} +EXPORT_SYMBOL_GPL(fuse_conn_destroy); + +static void fuse_sb_destroy(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (sb->s_root) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } +} + +void fuse_mount_destroy(struct fuse_mount *fm) +{ + fuse_conn_put(fm->fc); + kfree_rcu(fm, rcu); +} +EXPORT_SYMBOL(fuse_mount_destroy); + static void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, - .mount = fuse_mount, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_ALLOW_IDMAP, + .init_fs_context = fuse_init_fs_context, + .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, }; MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK -static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) -{ - return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); -} - static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); kill_block_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", - .mount = fuse_mount_blk, + .init_fs_context = fuse_init_fs_context, + .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, - .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("fuseblk"); @@ -1336,8 +2202,14 @@ static int __init fuse_fs_init(void) if (err) goto out3; + err = fuse_sysctl_register(); + if (err) + goto out4; + return 0; + out4: + unregister_filesystem(&fuse_fs_type); out3: unregister_fuseblk(); out2: @@ -1348,6 +2220,7 @@ static int __init fuse_fs_init(void) static void fuse_fs_cleanup(void) { + fuse_sysctl_unregister(); unregister_filesystem(&fuse_fs_type); unregister_fuseblk(); @@ -1393,8 +2266,8 @@ static int __init fuse_init(void) { int res; - printk(KERN_INFO "fuse init (API version %i.%i)\n", - FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); + pr_info("init (API version %i.%i)\n", + FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); INIT_LIST_HEAD(&fuse_conn_list); res = fuse_fs_init(); @@ -1413,6 +2286,8 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; + fuse_dentry_tree_init(); + sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); @@ -1430,8 +2305,9 @@ static int __init fuse_init(void) static void __exit fuse_exit(void) { - printk(KERN_DEBUG "fuse exit\n"); + pr_debug("exit\n"); + fuse_dentry_tree_cleanup(); fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); |
