Diffstat (limited to 'fs/fuse')
-rw-r--r--  fs/fuse/Kconfig         |   12
-rw-r--r--  fs/fuse/Makefile        |    5
-rw-r--r--  fs/fuse/acl.c           |   10
-rw-r--r--  fs/fuse/control.c       |    4
-rw-r--r--  fs/fuse/cuse.c          |   33
-rw-r--r--  fs/fuse/dax.c           |   52
-rw-r--r--  fs/fuse/dev.c           |  598
-rw-r--r--  fs/fuse/dev_uring.c     | 1352
-rw-r--r--  fs/fuse/dev_uring_i.h   |  211
-rw-r--r--  fs/fuse/dir.c           |  333
-rw-r--r--  fs/fuse/file.c          |  670
-rw-r--r--  fs/fuse/fuse_dev_i.h    |   70
-rw-r--r--  fs/fuse/fuse_i.h        |  196
-rw-r--r--  fs/fuse/fuse_trace.h    |  132
-rw-r--r--  fs/fuse/inode.c         |  175
-rw-r--r--  fs/fuse/ioctl.c         |   93
-rw-r--r--  fs/fuse/iomode.c        |   60
-rw-r--r--  fs/fuse/passthrough.c   |   44
-rw-r--r--  fs/fuse/readdir.c       |   37
-rw-r--r--  fs/fuse/sysctl.c        |   64
-rw-r--r--  fs/fuse/virtio_fs.c     |  424
-rw-r--r--  fs/fuse/xattr.c         |   11
22 files changed, 3633 insertions, 953 deletions
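
The dev.c hunks below introduce a request-timeout watchdog (fuse_check_timeout) that, as its comment explains, only tests the oldest (head) request of each queue, relying on the lists being ordered roughly oldest-to-newest. A minimal user-space sketch of that head-only expiry test follows; struct req, list_expired and the 60-second timeout are illustrative names and values, not the kernel's:

/* Head-of-list expiry check, modelled on fuse_request_expired() below.
 * Plain C, runnable in user space; not kernel code. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct req {
	time_t create_time;	/* when the request was queued */
	struct req *next;	/* singly linked, oldest first */
};

/* Only the head is inspected: requests are queued in roughly creation
 * order, so if the oldest entry has not expired, the newer ones have
 * not expired either. */
static bool list_expired(const struct req *head, time_t timeout, time_t now)
{
	if (!head)
		return false;
	return now > head->create_time + timeout;
}

int main(void)
{
	struct req newer = { .create_time = time(NULL), .next = NULL };
	struct req older = { .create_time = time(NULL) - 120, .next = &newer };

	/* With a 60 second timeout the head request has expired, which in
	 * the kernel code would lead to fuse_abort_conn() being called. */
	printf("expired: %d\n", list_expired(&older, 60, time(NULL)));
	return 0;
}
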
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 8674dbfbe59d..ca215a3cba3e 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH to be performed directly on a backing file. If you want to allow passthrough operations, answer Y. + +config FUSE_IO_URING + bool "FUSE communication over io-uring" + default y + depends on FUSE_FS + depends on IO_URING + help + This allows sending FUSE requests over the io-uring interface and + also adds request core affinity. + + If you want to allow fuse server/client communication through io-uring, + answer Y diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 6e0228c6d0cb..3f0f312a31c1 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -3,6 +3,9 @@ # Makefile for the FUSE filesystem. # +# Needed for trace events +ccflags-y = -I$(src) + obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o @@ -11,5 +14,7 @@ fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o +fuse-$(CONFIG_SYSCTL) += sysctl.o +fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 3d192b80a561..8f484b105f13 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -12,7 +12,6 @@ #include <linux/posix_acl_xattr.h> static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc, - struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu) { int size; @@ -74,7 +73,7 @@ struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, if (fuse_no_acl(fc, inode)) return ERR_PTR(-EOPNOTSUPP); - return __fuse_get_acl(fc, idmap, inode, type, false); + return __fuse_get_acl(fc, inode, type, false); } struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) @@ -90,8 +89,7 @@ struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) */ if (!fc->posix_acl) return NULL; - - return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu); + return __fuse_get_acl(fc, inode, type, rcu); } int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, @@ -146,8 +144,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * be stripped. 
*/ if (fc->posix_acl && - !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && - !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) + !in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 97ac994ff78f..2a730d88cc3b 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -183,27 +183,23 @@ out: static const struct file_operations fuse_ctl_abort_ops = { .open = nonseekable_open, .write = fuse_conn_abort_write, - .llseek = no_llseek, }; static const struct file_operations fuse_ctl_waiting_ops = { .open = nonseekable_open, .read = fuse_conn_waiting_read, - .llseek = no_llseek, }; static const struct file_operations fuse_conn_max_background_ops = { .open = nonseekable_open, .read = fuse_conn_max_background_read, .write = fuse_conn_max_background_write, - .llseek = no_llseek, }; static const struct file_operations fuse_conn_congestion_threshold_ops = { .open = nonseekable_open, .read = fuse_conn_congestion_threshold_read, .write = fuse_conn_congestion_threshold_write, - .llseek = no_llseek, }; static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cad106c37e..b39844d75a80 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -303,13 +303,17 @@ struct cuse_init_args { struct fuse_args_pages ap; struct cuse_init_in in; struct cuse_init_out out; - struct page *page; - struct fuse_page_desc desc; + struct folio *folio; + struct fuse_folio_desc desc; }; /** * cuse_process_init_reply - finish initializing CUSE channel * + * @fm: The fuse mount information containing the CUSE connection. + * @args: The arguments passed to the init reply. + * @error: The error code signifying if any error occurred during the process. + * * This function creates the character device and sets up all the * required data structures for it. Please read the comment at the * top of this file for high level overview. 
@@ -322,7 +326,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = &ia->out; - struct page *page = ap->pages[0]; + struct folio *folio = ap->folios[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; @@ -339,7 +343,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, /* parse init reply */ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, + rc = cuse_parse_devinfo(folio_address(folio), ap->args.out_args[1].size, &devinfo); if (rc) goto err; @@ -407,7 +411,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, kobject_uevent(&dev->kobj, KOBJ_ADD); out: kfree(ia); - __free_page(page); + folio_put(folio); return; err_cdev: @@ -425,7 +429,7 @@ err: static int cuse_send_init(struct cuse_conn *cc) { int rc; - struct page *page; + struct folio *folio; struct fuse_mount *fm = &cc->fm; struct cuse_init_args *ia; struct fuse_args_pages *ap; @@ -433,13 +437,14 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); rc = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) + + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!folio) goto err; ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (!ia) - goto err_free_page; + goto err_free_folio; ap = &ia->ap; ia->in.major = FUSE_KERNEL_VERSION; @@ -455,18 +460,18 @@ static int cuse_send_init(struct cuse_conn *cc) ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; ap->args.out_argvar = true; ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &ia->page; + ap->num_folios = 1; + ap->folios = &ia->folio; ap->descs = &ia->desc; - ia->page = page; + ia->folio = folio; ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (rc) { kfree(ia); -err_free_page: - __free_page(page); +err_free_folio: + folio_put(folio); } err: return rc; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 12ef91d170bb..0502bf3cdf6a 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -240,11 +240,12 @@ static int fuse_send_removemapping(struct inode *inode, args.opcode = FUSE_REMOVEMAPPING; args.nodeid = fi->nodeid; - args.in_numargs = 2; - args.in_args[0].size = sizeof(*inargp); - args.in_args[0].value = inargp; - args.in_args[1].size = inargp->count * sizeof(*remove_one); - args.in_args[1].value = remove_one; + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = sizeof(*inargp); + args.in_args[1].value = inargp; + args.in_args[2].size = inargp->count * sizeof(*remove_one); + args.in_args[2].value = remove_one; return fuse_simple_request(fm, &args); } @@ -665,36 +666,12 @@ static void fuse_wait_dax_page(struct inode *inode) filemap_invalidate_lock(inode->i_mapping); } -/* Should be called with mapping->invalidate_lock held exclusively */ -static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, - loff_t start, loff_t end) -{ - struct page *page; - - page = dax_layout_busy_page_range(inode->i_mapping, start, end); - if (!page) - return 0; - - *retry = true; - return ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, - 0, 0, fuse_wait_dax_page(inode)); -} - -/* dmap_end == 0 leads to unmapping of whole file */ +/* Should be called with mapping->invalidate_lock held exclusively. 
*/ int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end) { - bool retry; - int ret; - - do { - retry = false; - ret = __fuse_dax_break_layouts(inode, &retry, dmap_start, - dmap_end); - } while (ret == 0 && retry); - - return ret; + return dax_break_layout(inode, dmap_start, dmap_end, + fuse_wait_dax_page); } ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -774,16 +751,6 @@ out: return ret; } -static int fuse_dax_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - - return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); -} - static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, bool write) { @@ -1323,7 +1290,6 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) } static const struct address_space_operations fuse_dax_file_aops = { - .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, }; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3ec8bb5e68ff..6dcbaa218b7a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -6,7 +6,9 @@ See the file COPYING. */ +#include "dev_uring_i.h" #include "fuse_i.h" +#include "fuse_dev_i.h" #include <linux/init.h> #include <linux/module.h> @@ -22,22 +24,106 @@ #include <linux/splice.h> #include <linux/sched.h> +#define CREATE_TRACE_POINTS +#include "fuse_trace.h" + MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); -/* Ordinary requests have even IDs, while interrupts IDs are odd */ -#define FUSE_INT_REQ_BIT (1ULL << 0) -#define FUSE_REQ_ID_STEP (1ULL << 1) - static struct kmem_cache *fuse_req_cachep; -static struct fuse_dev *fuse_get_dev(struct file *file) +const unsigned long fuse_timeout_timer_freq = + secs_to_jiffies(FUSE_TIMEOUT_TIMER_FREQ); + +bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list) { - /* - * Lockless access is OK, because file->private data is set - * once during mount and is valid until the file is released. - */ - return READ_ONCE(file->private_data); + struct fuse_req *req; + + req = list_first_entry_or_null(list, struct fuse_req, list); + if (!req) + return false; + return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout); +} + +bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) +{ + int i; + + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + if (fuse_request_expired(fc, &processing[i])) + return true; + + return false; +} + +/* + * Check if any requests aren't being completed by the time the request timeout + * elapses. To do so, we: + * - check the fiq pending list + * - check the bg queue + * - check the fpq io and processing lists + * + * To make this fast, we only check against the head request on each list since + * these are generally queued in order of creation time (eg newer requests get + * queued to the tail). We might miss a few edge cases (eg requests transitioning + * between lists, re-sent requests at the head of the pending list having a + * later creation time than other requests on that list, etc.) but that is fine + * since if the request never gets fulfilled, it will eventually be caught. 
+ */ +void fuse_check_timeout(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct fuse_conn *fc = container_of(dwork, struct fuse_conn, + timeout.work); + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_dev *fud; + struct fuse_pqueue *fpq; + bool expired = false; + + if (!atomic_read(&fc->num_waiting)) + goto out; + + spin_lock(&fiq->lock); + expired = fuse_request_expired(fc, &fiq->pending); + spin_unlock(&fiq->lock); + if (expired) + goto abort_conn; + + spin_lock(&fc->bg_lock); + expired = fuse_request_expired(fc, &fc->bg_queue); + spin_unlock(&fc->bg_lock); + if (expired) + goto abort_conn; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + list_for_each_entry(fud, &fc->devices, entry) { + fpq = &fud->pq; + spin_lock(&fpq->lock); + if (fuse_request_expired(fc, &fpq->io) || + fuse_fpq_processing_expired(fc, fpq->processing)) { + spin_unlock(&fpq->lock); + spin_unlock(&fc->lock); + goto abort_conn; + } + + spin_unlock(&fpq->lock); + } + spin_unlock(&fc->lock); + + if (fuse_uring_request_expired(fc)) + goto abort_conn; + +out: + queue_delayed_work(system_wq, &fc->timeout.work, + fuse_timeout_timer_freq); + return; + +abort_conn: + fuse_abort_conn(fc); } static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) @@ -48,6 +134,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); req->fm = fm; + req->create_time = jiffies; } static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) @@ -84,7 +171,8 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { - return !fc->initialized || (for_background && fc->blocked); + return !fc->initialized || (for_background && fc->blocked) || + (fc->io_uring && fc->connected && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) @@ -103,11 +191,17 @@ static void fuse_drop_waiting(struct fuse_conn *fc) static void fuse_put_request(struct fuse_req *req); -static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) +static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, + struct fuse_mount *fm, + bool for_background) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; + bool no_idmap = !fm->sb || (fm->sb->s_iflags & SB_I_NOIDMAP); + kuid_t fsuid; + kgid_t fsgid; int err; + atomic_inc(&fc->num_waiting); if (fuse_block_alloc(fc, for_background)) { @@ -135,19 +229,32 @@ static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) goto out; } - req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); - if (unlikely(req->in.h.uid == ((uid_t)-1) || - req->in.h.gid == ((gid_t)-1))) { + /* + * Keep the old behavior when idmappings support was not + * declared by a FUSE server. + * + * For those FUSE servers who support idmapped mounts, + * we send UID/GID only along with "inode creation" + * fuse requests, otherwise idmap == &invalid_mnt_idmap and + * req->in.h.{u,g}id will be equal to FUSE_INVALID_UIDGID. + */ + fsuid = no_idmap ? current_fsuid() : mapped_fsuid(idmap, fc->user_ns); + fsgid = no_idmap ? 
current_fsgid() : mapped_fsgid(idmap, fc->user_ns); + req->in.h.uid = from_kuid(fc->user_ns, fsuid); + req->in.h.gid = from_kgid(fc->user_ns, fsgid); + + if (no_idmap && unlikely(req->in.h.uid == ((uid_t)-1) || + req->in.h.gid == ((gid_t)-1))) { fuse_put_request(req); return ERR_PTR(-EOVERFLOW); } + return req; out: @@ -192,14 +299,25 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) } EXPORT_SYMBOL_GPL(fuse_len_args); -u64 fuse_get_unique(struct fuse_iqueue *fiq) +static u64 fuse_get_unique_locked(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } + +u64 fuse_get_unique(struct fuse_iqueue *fiq) +{ + u64 ret; + + spin_lock(&fiq->lock); + ret = fuse_get_unique_locked(fiq); + spin_unlock(&fiq->lock); + + return ret; +} EXPORT_SYMBOL_GPL(fuse_get_unique); -static unsigned int fuse_req_hash(u64 unique) +unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } @@ -215,22 +333,71 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget) +{ + spin_lock(&fiq->lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + fuse_dev_wake_and_unlock(fiq); + } else { + kfree(forget); + spin_unlock(&fiq->lock); + } +} + +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + spin_lock(&fiq->lock); + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + /* + * Pairs with smp_mb() implied by test_and_set_bit() + * from fuse_request_end(). + */ + smp_mb(); + if (test_bit(FR_FINISHED, &req->flags)) { + list_del_init(&req->intr_entry); + spin_unlock(&fiq->lock); + } else { + fuse_dev_wake_and_unlock(fiq); + } + } else { + spin_unlock(&fiq->lock); + } +} + +static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + spin_lock(&fiq->lock); + if (fiq->connected) { + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique_locked(fiq); + list_add_tail(&req->list, &fiq->pending); + fuse_dev_wake_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); + req->out.h.error = -ENOTCONN; + clear_bit(FR_PENDING, &req->flags); + fuse_request_end(req); + } +} + const struct fuse_iqueue_ops fuse_dev_fiq_ops = { - .wake_forget_and_unlock = fuse_dev_wake_and_unlock, - .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, - .wake_pending_and_unlock = fuse_dev_wake_and_unlock, + .send_forget = fuse_dev_queue_forget, + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_dev_queue_req, }; EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); -static void queue_request_and_unlock(struct fuse_iqueue *fiq, - struct fuse_req *req) -__releases(fiq->lock) +static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - list_add_tail(&req->list, &fiq->pending); - fiq->ops->wake_pending_and_unlock(fiq); + trace_fuse_request_send(req); + fiq->ops->send_req(fiq, req); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -241,15 +408,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fiq->lock); - if (fiq->connected) { - fiq->forget_list_tail->next = forget; - fiq->forget_list_tail = forget; - 
fiq->ops->wake_forget_and_unlock(fiq); - } else { - kfree(forget); - spin_unlock(&fiq->lock); - } + fiq->ops->send_forget(fiq, forget); } static void flush_bg_queue(struct fuse_conn *fc) @@ -263,9 +422,7 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; - spin_lock(&fiq->lock); - req->in.h.unique = fuse_get_unique(fiq); - queue_request_and_unlock(fiq, req); + fuse_send_one(fiq, req); } } @@ -286,6 +443,7 @@ void fuse_request_end(struct fuse_req *req) if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; + trace_fuse_request_end(req); /* * test_and_set_bit() implies smp_mb() between bit * changing and below FR_INTERRUPTED check. Pairs with @@ -335,30 +493,31 @@ static int queue_interrupt(struct fuse_req *req) { struct fuse_iqueue *fiq = &req->fm->fc->iq; - spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ - if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { - spin_unlock(&fiq->lock); + if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) return -EINVAL; - } - if (list_empty(&req->intr_entry)) { - list_add_tail(&req->intr_entry, &fiq->interrupts); + fiq->ops->send_interrupt(fiq, req); + + return 0; +} + +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock) +{ + spin_lock(lock); + if (test_bit(FR_PENDING, &req->flags)) { /* - * Pairs with smp_mb() implied by test_and_set_bit() - * from fuse_request_end(). + * FR_PENDING does not get cleared as the request will end + * up in destruction anyway. */ - smp_mb(); - if (test_bit(FR_FINISHED, &req->flags)) { - list_del_init(&req->intr_entry); - spin_unlock(&fiq->lock); - return 0; - } - fiq->ops->wake_interrupt_and_unlock(fiq); - } else { - spin_unlock(&fiq->lock); + list_del(&req->list); + spin_unlock(lock); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return true; } - return 0; + spin_unlock(lock); + return false; } static void request_wait_answer(struct fuse_req *req) @@ -382,22 +541,20 @@ static void request_wait_answer(struct fuse_req *req) } if (!test_bit(FR_FORCE, &req->flags)) { + bool removed; + /* Only fatal signals may interrupt this */ err = wait_event_killable(req->waitq, test_bit(FR_FINISHED, &req->flags)); if (!err) return; - spin_lock(&fiq->lock); - /* Request is not yet in userspace, bail out */ - if (test_bit(FR_PENDING, &req->flags)) { - list_del(&req->list); - spin_unlock(&fiq->lock); - __fuse_put_request(req); - req->out.h.error = -EINTR; + if (test_bit(FR_URING, &req->flags)) + removed = fuse_uring_remove_pending_req(req); + else + removed = fuse_remove_pending_req(req, &fiq->lock); + if (removed) return; - } - spin_unlock(&fiq->lock); } /* @@ -412,21 +569,15 @@ static void __fuse_request_send(struct fuse_req *req) struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); - spin_lock(&fiq->lock); - if (!fiq->connected) { - spin_unlock(&fiq->lock); - req->out.h.error = -ENOTCONN; - } else { - req->in.h.unique = fuse_get_unique(fiq); - /* acquire extra reference, since request is still needed - after fuse_request_end() */ - __fuse_get_request(req); - queue_request_and_unlock(fiq, req); - request_wait_answer(req); - /* Pairs with smp_wmb() in fuse_request_end() */ - smp_rmb(); - } + /* acquire extra reference, since request is still needed after + fuse_request_end() */ + __fuse_get_request(req); + fuse_send_one(fiq, req); + + request_wait_answer(req); + /* Pairs with smp_wmb() in fuse_request_end() */ + smp_rmb(); 
} static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) @@ -466,8 +617,14 @@ static void fuse_force_creds(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; - req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + if (!req->fm->sb || req->fm->sb->s_iflags & SB_I_NOIDMAP) { + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + } else { + req->in.h.uid = FUSE_INVALID_UIDGID; + req->in.h.gid = FUSE_INVALID_UIDGID; + } + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } @@ -482,7 +639,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } -ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) +ssize_t __fuse_simple_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; @@ -499,7 +658,7 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fm, false); + req = fuse_get_req(idmap, fm, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -521,7 +680,25 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) return ret; } -static bool fuse_request_queue_background(struct fuse_req *req) +#ifdef CONFIG_FUSE_IO_URING +static bool fuse_request_queue_background_uring(struct fuse_conn *fc, + struct fuse_req *req) +{ + struct fuse_iqueue *fiq = &fc->iq; + + req->in.h.unique = fuse_get_unique(fiq); + req->in.h.len = sizeof(struct fuse_in_header) + + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); + + return fuse_uring_queue_bq_req(req); +} +#endif + +/* + * @return true if queued + */ +static int fuse_request_queue_background(struct fuse_req *req) { struct fuse_mount *fm = req->fm; struct fuse_conn *fc = fm->fc; @@ -533,6 +710,12 @@ static bool fuse_request_queue_background(struct fuse_req *req) atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); + +#ifdef CONFIG_FUSE_IO_URING + if (fuse_uring_ready(fc)) + return fuse_request_queue_background_uring(fc, req); +#endif + spin_lock(&fc->bg_lock); if (likely(fc->connected)) { fc->num_background++; @@ -560,7 +743,7 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fm, true); + req = fuse_get_req(&invalid_mnt_idmap, fm, true); if (IS_ERR(req)) return PTR_ERR(req); } @@ -581,9 +764,8 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, { struct fuse_req *req; struct fuse_iqueue *fiq = &fm->fc->iq; - int err = 0; - req = fuse_get_req(fm, false); + req = fuse_get_req(&invalid_mnt_idmap, fm, false); if (IS_ERR(req)) return PTR_ERR(req); @@ -592,16 +774,9 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, fuse_args_to_req(req, args); - spin_lock(&fiq->lock); - if (fiq->connected) { - queue_request_and_unlock(fiq, req); - } else { - err = -ENODEV; - spin_unlock(&fiq->lock); - fuse_put_request(req); - } + fuse_send_one(fiq, req); - return err; + return 0; } /* @@ -641,22 +816,8 @@ static int unlock_request(struct fuse_req *req) return err; } -struct fuse_copy_state { - int write; - struct fuse_req *req; - struct iov_iter *iter; - struct pipe_buffer *pipebufs; - struct pipe_buffer *currbuf; - struct 
pipe_inode_info *pipe; - unsigned long nr_segs; - struct page *pg; - unsigned len; - unsigned offset; - unsigned move_pages:1; -}; - -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct iov_iter *iter) +void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); cs->write = write; @@ -763,6 +924,9 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) *size -= ncpy; cs->len -= ncpy; cs->offset += ncpy; + if (cs->is_uring) + cs->ring.copied_sz += ncpy; + return ncpy; } @@ -773,7 +937,6 @@ static int fuse_check_folio(struct folio *folio) (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | - 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | 1 << PG_workingset | @@ -786,6 +949,12 @@ static int fuse_check_folio(struct folio *folio) return 0; } +/* + * Attempt to steal a page from the splice() pipe and move it into the + * pagecache. If successful, the pointer in @pagep will be updated. The + * folio that was originally in @pagep will lose a reference and the new + * folio returned in @pagep will carry a reference. + */ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; @@ -818,9 +987,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newfolio = page_folio(buf->page); - if (!folio_test_uptodate(newfolio)) - folio_mark_uptodate(newfolio); - + folio_clear_uptodate(newfolio); folio_clear_mappedtodisk(newfolio); if (fuse_check_folio(newfolio) != 0) @@ -980,17 +1147,27 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, struct fuse_req *req = cs->req; struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); - - for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { + for (i = 0; i < ap->num_folios && (nbytes || zeroing); i++) { int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); + struct page *orig, *pagep; - err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); + orig = pagep = &ap->folios[i]->page; + + err = fuse_copy_page(cs, &pagep, offset, count, zeroing); if (err) return err; nbytes -= count; + + /* + * fuse_copy_page may have moved a page from a pipe instead of + * copying into our given page, so update the folios if it was + * replaced. + */ + if (pagep != orig) + ap->folios[i] = page_folio(pagep); } return 0; } @@ -1010,9 +1187,9 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) } /* Copy request arguments to/from userspace buffer */ -static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, - unsigned argpages, struct fuse_arg *args, - int zeroing) +int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) { int err = 0; unsigned i; @@ -1076,9 +1253,9 @@ __releases(fiq->lock) return err ? 
err : reqsize; } -struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, - unsigned int max, - unsigned int *countp) +static struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; @@ -1097,7 +1274,6 @@ struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, return head; } -EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, @@ -1112,7 +1288,7 @@ __releases(fiq->lock) struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique(fiq), + .unique = fuse_get_unique_locked(fiq), .len = sizeof(ih) + sizeof(arg), }; @@ -1143,7 +1319,7 @@ __releases(fiq->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique(fiq), + .unique = fuse_get_unique_locked(fiq), .len = sizeof(ih) + sizeof(arg), }; @@ -1392,7 +1568,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { + if (pipe_buf_usage(pipe) + cs.nr_segs > pipe->max_usage) { ret = -EIO; goto out; } @@ -1468,14 +1644,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_entry_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1485,13 +1657,18 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -1516,14 +1693,10 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_delete_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1533,13 +1706,18 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -1607,22 +1785,25 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { + struct folio *folio; struct page *page; unsigned int this_num; - err = -ENOMEM; - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) + folio = filemap_grab_folio(mapping, index); + err = PTR_ERR(folio); + if (IS_ERR(folio)) goto out_iput; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + page = 
&folio->page; + this_num = min_t(unsigned, num, folio_size(folio) - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) - SetPageUptodate(page); - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio) && !err && offset == 0 && + (this_num == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, this_num, folio_size(folio)); + folio_mark_uptodate(folio); + } + folio_unlock(folio); + folio_put(folio); if (err) goto out_iput; @@ -1654,7 +1835,7 @@ static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, struct fuse_retrieve_args *ra = container_of(args, typeof(*ra), ap.args); - release_pages(ra->ap.pages, ra->ap.num_pages); + release_pages(ra->ap.folios, ra->ap.num_folios); kfree(ra); } @@ -1668,7 +1849,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages; + unsigned int num_pages, cur_pages = 0; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1687,38 +1868,39 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); + args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); ra = kzalloc(args_size, GFP_KERNEL); if (!ra) return -ENOMEM; ap = &ra->ap; - ap->pages = (void *) (ra + 1); - ap->descs = (void *) (ap->pages + num_pages); + ap->folios = (void *) (ra + 1); + ap->descs = (void *) (ap->folios + num_pages); args = &ap->args; args->nodeid = outarg->nodeid; args->opcode = FUSE_NOTIFY_REPLY; - args->in_numargs = 2; + args->in_numargs = 3; args->in_pages = true; args->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; - while (num && ap->num_pages < num_pages) { - struct page *page; + while (num && cur_pages < num_pages) { + struct folio *folio; unsigned int this_num; - page = find_get_page(mapping, index); - if (!page) + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].offset = offset; - ap->descs[ap->num_pages].length = this_num; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = this_num; + ap->num_folios++; + cur_pages++; offset = 0; num -= this_num; @@ -1727,9 +1909,10 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; - args->in_args[0].size = sizeof(ra->inarg); - args->in_args[0].value = &ra->inarg; - args->in_args[1].size = total_len; + fuse_set_zero_arg0(args); + args->in_args[1].size = sizeof(ra->inarg); + args->in_args[1].value = &ra->inarg; + args->in_args[2].size = total_len; err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) @@ -1813,15 +1996,23 @@ static void fuse_resend(struct fuse_conn *fc) spin_unlock(&fc->lock); list_for_each_entry_safe(req, next, &to_queue, list) { - __set_bit(FR_PENDING, &req->flags); + set_bit(FR_PENDING, &req->flags); + clear_bit(FR_SENT, &req->flags); /* mark the request as resend request */ req->in.h.unique |= FUSE_UNIQUE_RESEND; } spin_lock(&fiq->lock); + if (!fiq->connected) { + spin_unlock(&fiq->lock); + 
list_for_each_entry(req, &to_queue, list) + clear_bit(FR_PENDING, &req->flags); + fuse_dev_end_requests(&to_queue); + return; + } /* iq and pq requests are both oldest to newest */ list_splice(&to_queue, &fiq->pending); - fiq->ops->wake_pending_and_unlock(fiq); + fuse_dev_wake_and_unlock(fiq); } static int fuse_notify_resend(struct fuse_conn *fc) @@ -1865,7 +2056,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique) { unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; @@ -1877,10 +2068,17 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, - unsigned nbytes) +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned nbytes) { - unsigned reqsize = sizeof(struct fuse_out_header); + + unsigned int reqsize = 0; + + /* + * Uring has all headers separated from args - args is payload only + */ + if (!cs->is_uring) + reqsize = sizeof(struct fuse_out_header); reqsize += fuse_len_args(args->out_numargs, args->out_args); @@ -1942,7 +2140,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_lock(&fpq->lock); req = NULL; if (fpq->connected) - req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); + req = fuse_request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); err = -ENOENT; if (!req) { @@ -1980,7 +2178,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; else - err = copy_out_args(cs, req->args, nbytes); + err = fuse_copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); @@ -2022,7 +2220,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - unsigned int head, tail, mask, count; + unsigned int head, tail, count; unsigned nbuf; unsigned idx; struct pipe_buffer *bufs; @@ -2039,8 +2237,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; - count = head - tail; + count = pipe_occupancy(head, tail); bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) { @@ -2050,8 +2247,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, nbuf = 0; rem = 0; - for (idx = tail; idx != head && rem < len; idx++) - rem += pipe->bufs[idx & mask].len; + for (idx = tail; !pipe_empty(head, idx) && rem < len; idx++) + rem += pipe_buf(pipe, idx)->len; ret = -EINVAL; if (rem < len) @@ -2062,10 +2259,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - if (WARN_ON(nbuf >= count || tail == head)) + if (WARN_ON(nbuf >= count || pipe_empty(head, tail))) goto out_free; - ibuf = &pipe->bufs[tail & mask]; + ibuf = pipe_buf(pipe, tail); obuf = &bufs[nbuf]; if (rem >= ibuf->len) { @@ -2135,7 +2332,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct list_head *head) +void fuse_dev_end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2191,6 +2388,9 @@ void fuse_abort_conn(struct fuse_conn *fc) LIST_HEAD(to_end); 
unsigned int i; + if (fc->timeout.req_timeout) + cancel_delayed_work(&fc->timeout.work); + /* Background queuing checks fc->connected under bg_lock */ spin_lock(&fc->bg_lock); fc->connected = 0; @@ -2238,7 +2438,13 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); + + /* + * fc->lock must not be taken to avoid conflicts with io-uring + * locks + */ + fuse_uring_abort(fc); } else { spin_unlock(&fc->lock); } @@ -2250,6 +2456,8 @@ void fuse_wait_aborted(struct fuse_conn *fc) /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); + + fuse_uring_wait_stopped_queues(fc); } int fuse_dev_release(struct inode *inode, struct file *file) @@ -2268,7 +2476,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { @@ -2314,21 +2522,20 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) int res; int oldfd; struct fuse_dev *fud = NULL; - struct fd f; if (get_user(oldfd, argp)) return -EFAULT; - f = fdget(oldfd); - if (!f.file) + CLASS(fd, f)(oldfd); + if (fd_empty(f)) return -EINVAL; /* * Check against file->f_op because CUSE * uses the same ioctl handler. */ - if (f.file->f_op == file->f_op) - fud = fuse_get_dev(f.file); + if (fd_file(f)->f_op == file->f_op) + fud = fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2337,7 +2544,6 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) mutex_unlock(&fuse_mutex); } - fdput(f); return res; } @@ -2399,7 +2605,6 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, - .llseek = no_llseek, .read_iter = fuse_dev_read, .splice_read = fuse_dev_splice_read, .write_iter = fuse_dev_write, @@ -2409,6 +2614,9 @@ const struct file_operations fuse_dev_operations = { .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, +#ifdef CONFIG_FUSE_IO_URING + .uring_cmd = fuse_uring_cmd, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c new file mode 100644 index 000000000000..accdce2977c5 --- /dev/null +++ b/fs/fuse/dev_uring.c @@ -0,0 +1,1352 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. 
+ */ + +#include "fuse_i.h" +#include "dev_uring_i.h" +#include "fuse_dev_i.h" + +#include <linux/fs.h> +#include <linux/io_uring/cmd.h> + +static bool __read_mostly enable_uring; +module_param(enable_uring, bool, 0644); +MODULE_PARM_DESC(enable_uring, + "Enable userspace communication through io-uring"); + +#define FUSE_URING_IOV_SEGS 2 /* header and payload */ + + +bool fuse_uring_enabled(void) +{ + return enable_uring; +} + +struct fuse_uring_pdu { + struct fuse_ring_ent *ent; +}; + +static const struct fuse_iqueue_ops fuse_io_uring_ops; + +static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_ent *ring_ent) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + pdu->ent = ring_ent; +} + +static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + return pdu->ent; +} + +static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_held(&queue->lock); + lockdep_assert_held(&fc->bg_lock); + + /* + * Allow one bg request per queue, ignoring global fc limits. + * This prevents a single queue from consuming all resources and + * eliminates the need for remote queue wake-ups when global + * limits are met but this queue has no more waiting requests. + */ + while ((fc->active_background < fc->max_background || + !queue->active_background) && + (!list_empty(&queue->fuse_req_bg_queue))) { + struct fuse_req *req; + + req = list_first_entry(&queue->fuse_req_bg_queue, + struct fuse_req, list); + fc->active_background++; + queue->active_background++; + + list_move_tail(&req->list, &queue->fuse_req_queue); + } +} + +static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, + int error) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_not_held(&queue->lock); + spin_lock(&queue->lock); + ent->fuse_req = NULL; + if (test_bit(FR_BACKGROUND, &req->flags)) { + queue->active_background--; + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + } + + spin_unlock(&queue->lock); + + if (error) + req->out.h.error = error; + + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); +} + +/* Abort all list queued request on the given ring queue */ +static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) +{ + struct fuse_req *req; + LIST_HEAD(req_list); + + spin_lock(&queue->lock); + list_for_each_entry(req, &queue->fuse_req_queue, list) + clear_bit(FR_PENDING, &req->flags); + list_splice_init(&queue->fuse_req_queue, &req_list); + spin_unlock(&queue->lock); + + /* must not hold queue lock to avoid order issues with fi->lock */ + fuse_dev_end_requests(&req_list); +} + +void fuse_uring_abort_end_requests(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_queue *queue; + struct fuse_conn *fc = ring->fc; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + queue->stopped = true; + + WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); + spin_lock(&queue->lock); + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + spin_unlock(&queue->lock); + fuse_uring_abort_end_queue_requests(queue); + } +} + +bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + struct fuse_ring *ring = 
fc->ring; + struct fuse_ring_queue *queue; + int qid; + + if (!ring) + return false; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + spin_lock(&queue->lock); + if (fuse_request_expired(fc, &queue->fuse_req_queue) || + fuse_request_expired(fc, &queue->fuse_req_bg_queue) || + fuse_fpq_processing_expired(fc, queue->fpq.processing)) { + spin_unlock(&queue->lock); + return true; + } + spin_unlock(&queue->lock); + } + + return false; +} + +void fuse_uring_destruct(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + int qid; + + if (!ring) + return; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + struct fuse_ring_ent *ent, *next; + + if (!queue) + continue; + + WARN_ON(!list_empty(&queue->ent_avail_queue)); + WARN_ON(!list_empty(&queue->ent_w_req_queue)); + WARN_ON(!list_empty(&queue->ent_commit_queue)); + WARN_ON(!list_empty(&queue->ent_in_userspace)); + + list_for_each_entry_safe(ent, next, &queue->ent_released, + list) { + list_del_init(&ent->list); + kfree(ent); + } + + kfree(queue->fpq.processing); + kfree(queue); + ring->queues[qid] = NULL; + } + + kfree(ring->queues); + kfree(ring); + fc->ring = NULL; +} + +/* + * Basic ring setup for this connection based on the provided configuration + */ +static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) +{ + struct fuse_ring *ring; + size_t nr_queues = num_possible_cpus(); + struct fuse_ring *res = NULL; + size_t max_payload_size; + + ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); + if (!ring) + return NULL; + + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), + GFP_KERNEL_ACCOUNT); + if (!ring->queues) + goto out_err; + + max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); + max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + + spin_lock(&fc->lock); + if (fc->ring) { + /* race, another thread created the ring in the meantime */ + spin_unlock(&fc->lock); + res = fc->ring; + goto out_err; + } + + init_waitqueue_head(&ring->stop_waitq); + + ring->nr_queues = nr_queues; + ring->fc = fc; + ring->max_payload_sz = max_payload_size; + smp_store_release(&fc->ring, ring); + + spin_unlock(&fc->lock); + return ring; + +out_err: + kfree(ring->queues); + kfree(ring); + return res; +} + +static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, + int qid) +{ + struct fuse_conn *fc = ring->fc; + struct fuse_ring_queue *queue; + struct list_head *pq; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); + if (!queue) + return NULL; + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(queue); + return NULL; + } + + queue->qid = qid; + queue->ring = ring; + spin_lock_init(&queue->lock); + + INIT_LIST_HEAD(&queue->ent_avail_queue); + INIT_LIST_HEAD(&queue->ent_commit_queue); + INIT_LIST_HEAD(&queue->ent_w_req_queue); + INIT_LIST_HEAD(&queue->ent_in_userspace); + INIT_LIST_HEAD(&queue->fuse_req_queue); + INIT_LIST_HEAD(&queue->fuse_req_bg_queue); + INIT_LIST_HEAD(&queue->ent_released); + + queue->fpq.processing = pq; + fuse_pqueue_init(&queue->fpq); + + spin_lock(&fc->lock); + if (ring->queues[qid]) { + spin_unlock(&fc->lock); + kfree(queue->fpq.processing); + kfree(queue); + return ring->queues[qid]; + } + + /* + * write_once and lock as the caller mostly doesn't take the lock at all + */ + WRITE_ONCE(ring->queues[qid], queue); + spin_unlock(&fc->lock); + + return queue; +} + +static void 
fuse_uring_stop_fuse_req_end(struct fuse_req *req) +{ + clear_bit(FR_SENT, &req->flags); + req->out.h.error = -ECONNABORTED; + fuse_request_end(req); +} + +/* + * Release a request/entry on connection tear down + */ +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +{ + struct fuse_req *req; + struct io_uring_cmd *cmd; + + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + req = ent->fuse_req; + ent->fuse_req = NULL; + if (req) { + /* remove entry from queue->fpq->processing */ + list_del_init(&req->list); + } + + /* + * The entry must not be freed immediately, due to access of direct + * pointer access of entries through IO_URING_F_CANCEL - there is a risk + * of race between daemon termination (which triggers IO_URING_F_CANCEL + * and accesses entries without checking the list state first + */ + list_move(&ent->list, &queue->ent_released); + ent->state = FRRS_RELEASED; + spin_unlock(&queue->lock); + + if (cmd) + io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + + if (req) + fuse_uring_stop_fuse_req_end(req); +} + +static void fuse_uring_stop_list_entries(struct list_head *head, + struct fuse_ring_queue *queue, + enum fuse_ring_req_state exp_state) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent, *next; + ssize_t queue_refs = SSIZE_MAX; + LIST_HEAD(to_teardown); + + spin_lock(&queue->lock); + list_for_each_entry_safe(ent, next, head, list) { + if (ent->state != exp_state) { + pr_warn("entry teardown qid=%d state=%d expected=%d", + queue->qid, ent->state, exp_state); + continue; + } + + ent->state = FRRS_TEARDOWN; + list_move(&ent->list, &to_teardown); + } + spin_unlock(&queue->lock); + + /* no queue lock to avoid lock order issues */ + list_for_each_entry_safe(ent, next, &to_teardown, list) { + fuse_uring_entry_teardown(ent); + queue_refs = atomic_dec_return(&ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); + } +} + +static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) +{ + fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, + FRRS_USERSPACE); + fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, + FRRS_AVAILABLE); +} + +/* + * Log state debug info + */ +static void fuse_uring_log_ent_state(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_ent *ent; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + + if (!queue) + continue; + + spin_lock(&queue->lock); + /* + * Log entries from the intermediate queue, the other queues + * should be empty + */ + list_for_each_entry(ent, &queue->ent_w_req_queue, list) { + pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + list_for_each_entry(ent, &queue->ent_commit_queue, list) { + pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + spin_unlock(&queue->lock); + } + ring->stop_debug_log = 1; +} + +static void fuse_uring_async_stop_queues(struct work_struct *work) +{ + int qid; + struct fuse_ring *ring = + container_of(work, struct fuse_ring, async_teardown_work.work); + + /* XXX code dup */ + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + /* + * Some ring entries might be in the middle of IO operations, + * i.e. 
in process to get handled by file_operations::uring_cmd + * or on the way to userspace - we could handle that with conditions in + * run time code, but easier/cleaner to have an async tear down handler + * If there are still queue references left + */ + if (atomic_read(&ring->queue_refs) > 0) { + if (time_after(jiffies, + ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) + fuse_uring_log_ent_state(ring); + + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Stop the ring queues + */ +void fuse_uring_stop_queues(struct fuse_ring *ring) +{ + int qid; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + if (atomic_read(&ring->queue_refs) > 0) { + ring->teardown_time = jiffies; + INIT_DELAYED_WORK(&ring->async_teardown_work, + fuse_uring_async_stop_queues); + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Handle IO_URING_F_CANCEL, typically should come on daemon termination. + * + * Releasing the last entry should trigger fuse_dev_release() if + * the daemon was terminated + */ +static void fuse_uring_cancel(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue; + bool need_cmd_done = false; + + /* + * direct access on ent - it must not be destructed as long as + * IO_URING_F_CANCEL might come up + */ + queue = ent->queue; + spin_lock(&queue->lock); + if (ent->state == FRRS_AVAILABLE) { + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + need_cmd_done = true; + ent->cmd = NULL; + } + spin_unlock(&queue->lock); + + if (need_cmd_done) { + /* no queue lock to avoid lock order issues */ + io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); + } +} + +static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_ring_ent *ring_ent) +{ + uring_cmd_set_ring_ent(cmd, ring_ent); + io_uring_cmd_mark_cancelable(cmd, issue_flags); +} + +/* + * Checks for errors and stores it into the request + */ +static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, + struct fuse_req *req, + struct fuse_conn *fc) +{ + int err; + + err = -EINVAL; + if (oh->unique == 0) { + /* Not supported through io-uring yet */ + pr_warn_once("notify through fuse-io-uring not supported\n"); + goto err; + } + + if (oh->error <= -ERESTARTSYS || oh->error > 0) + goto err; + + if (oh->error) { + err = oh->error; + goto err; + } + + err = -ENOENT; + if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { + pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", + req->in.h.unique, + oh->unique & ~FUSE_INT_REQ_BIT); + goto err; + } + + /* + * Is it an interrupt reply ID? + * XXX: Not supported through fuse-io-uring yet, it should not even + * find the request - should not happen. 
+ */ + WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); + + err = 0; +err: + return err; +} + +static int fuse_uring_copy_from_ring(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct iov_iter iter; + int err; + struct fuse_uring_ent_in_out ring_in_out; + + err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, + sizeof(ring_in_out)); + if (err) + return -EFAULT; + + err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, + &iter); + if (err) + return err; + + fuse_copy_init(&cs, 0, &iter); + cs.is_uring = 1; + cs.req = req; + + return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); +} + + /* + * Copy data from the req to the ring buffer + */ +static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + struct iov_iter iter; + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); + if (err) { + pr_info_ratelimited("fuse: Import of user buffer failed\n"); + return err; + } + + fuse_copy_init(&cs, 1, &iter); + cs.is_uring = 1; + cs.req = req; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + err = copy_to_user(&ent->headers->op_in, in_args->value, + in_args->size); + if (err) { + pr_info_ratelimited( + "Copying the header failed.\n"); + return -EFAULT; + } + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + return err; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, + sizeof(ent_in_out)); + return err ? 
-EFAULT : 0; +} + +static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + int err; + + err = -EIO; + if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { + pr_err("qid=%d ring-req=%p invalid state %d on send\n", + queue->qid, ent, ent->state); + return err; + } + + err = -EINVAL; + if (WARN_ON(req->in.h.unique == 0)) + return err; + + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + + /* copy fuse_in_header */ + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) { + err = -EFAULT; + return err; + } + + return 0; +} + +static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + int err; + + err = fuse_uring_copy_to_ring(ent, req); + if (!err) + set_bit(FR_SENT, &req->flags); + else + fuse_uring_req_end(ent, req, err); + + return err; +} + +/* + * Write data to the ring buffer and send the request to userspace, + * userspace will read it + * This is comparable with classical read(/dev/fuse) + */ +static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + int err; + struct io_uring_cmd *cmd; + + err = fuse_uring_prepare_send(ent, req); + if (err) + return err; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, 0, 0, issue_flags); + return 0; +} + +/* + * Make a ring entry available for fuse_req assignment + */ +static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue) +{ + WARN_ON_ONCE(!ent->cmd); + list_move(&ent->list, &queue->ent_avail_queue); + ent->state = FRRS_AVAILABLE; +} + +/* Used to find the request on SQE commit */ +static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_pqueue *fpq = &queue->fpq; + unsigned int hash; + + req->ring_entry = ent; + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); +} + +/* + * Assign a fuse queue entry to the given entry + */ +static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && + ent->state != FRRS_COMMIT)) { + pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, + ent->state); + } + + clear_bit(FR_PENDING, &req->flags); + ent->fuse_req = req; + ent->state = FRRS_FUSE_REQ; + list_move(&ent->list, &queue->ent_w_req_queue); + fuse_uring_add_to_pq(ent, req); +} + +/* Fetch the next fuse request if available */ +static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) + __must_hold(&queue->lock) +{ + struct fuse_req *req; + struct fuse_ring_queue *queue = ent->queue; + struct list_head *req_queue = &queue->fuse_req_queue; + + lockdep_assert_held(&queue->lock); + + /* get and assign the next entry while it is still holding the lock */ + req = list_first_entry_or_null(req_queue, struct fuse_req, list); + if (req) + fuse_uring_add_req_to_ring_ent(ent, req); + + return req; +} + +/* + * Read data from the ring buffer, which user 
space has written to + * This is comparable with handling of classical write(/dev/fuse). + * Also make the ring request available again for new fuse requests. + */ +static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring *ring = ent->queue->ring; + struct fuse_conn *fc = ring->fc; + ssize_t err = 0; + + err = copy_from_user(&req->out.h, &ent->headers->in_out, + sizeof(req->out.h)); + if (err) { + req->out.h.error = -EFAULT; + goto out; + } + + err = fuse_uring_out_header_has_err(&req->out.h, req, fc); + if (err) { + /* req->out.h.error already set */ + goto out; + } + + err = fuse_uring_copy_from_ring(ring, req, ent); +out: + fuse_uring_req_end(ent, req, err); +} + +/* + * Get the next fuse req and send it + */ +static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue, + unsigned int issue_flags) +{ + int err; + struct fuse_req *req; + +retry: + spin_lock(&queue->lock); + fuse_uring_ent_avail(ent, queue); + req = fuse_uring_ent_assign_req(ent); + spin_unlock(&queue->lock); + + if (req) { + err = fuse_uring_send_next_to_ring(ent, req, issue_flags); + if (err) + goto retry; + } +} + +static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) +{ + struct fuse_ring_queue *queue = ent->queue; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) + return -EIO; + + ent->state = FRRS_COMMIT; + list_move(&ent->list, &queue->ent_commit_queue); + + return 0; +} + +/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ +static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring_ent *ent; + int err; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + uint64_t commit_id = READ_ONCE(cmd_req->commit_id); + unsigned int qid = READ_ONCE(cmd_req->qid); + struct fuse_pqueue *fpq; + struct fuse_req *req; + + err = -ENOTCONN; + if (!ring) + return err; + + if (qid >= ring->nr_queues) + return -EINVAL; + + queue = ring->queues[qid]; + if (!queue) + return err; + fpq = &queue->fpq; + + if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) + return err; + + spin_lock(&queue->lock); + /* Find a request based on the unique ID of the fuse request. + * This should get revised, as it needs a hash calculation and a list + * search. It also needs the full struct fuse_pqueue (memory overhead), + * as well as the link from req to ring_ent.
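+ * The request was hashed into fpq->processing[] and linked to its ring entry by fuse_uring_add_to_pq(); the commit_id sent to userspace is req->in.h.unique.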
+ */ + req = fuse_request_find(fpq, commit_id); + err = -ENOENT; + if (!req) { + pr_info("qid=%d commit_id %llu not found\n", queue->qid, + commit_id); + spin_unlock(&queue->lock); + return err; + } + list_del_init(&req->list); + ent = req->ring_entry; + req->ring_entry = NULL; + + err = fuse_ring_ent_set_commit(ent); + if (err != 0) { + pr_info_ratelimited("qid=%d commit_id %llu state %d", + queue->qid, commit_id, ent->state); + spin_unlock(&queue->lock); + req->out.h.error = err; + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); + return err; + } + + ent->cmd = cmd; + spin_unlock(&queue->lock); + + /* without the queue lock, as other locks are taken */ + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + fuse_uring_commit(ent, req, issue_flags); + + /* + * Fetching the next request is absolutely required as queued + * fuse requests would otherwise not get processed - committing + * and fetching is done in one step vs legacy fuse, which has separated + * read (fetch request) and write (commit result). + */ + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return 0; +} + +static bool is_ring_ready(struct fuse_ring *ring, int current_qid) +{ + int qid; + struct fuse_ring_queue *queue; + bool ready = true; + + for (qid = 0; qid < ring->nr_queues && ready; qid++) { + if (current_qid == qid) + continue; + + queue = ring->queues[qid]; + if (!queue) { + ready = false; + break; + } + + spin_lock(&queue->lock); + if (list_empty(&queue->ent_avail_queue)) + ready = false; + spin_unlock(&queue->lock); + } + + return ready; +} + +/* + * fuse_uring_req_fetch command handling + */ +static void fuse_uring_do_register(struct fuse_ring_ent *ent, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + struct fuse_iqueue *fiq = &fc->iq; + + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + + spin_lock(&queue->lock); + ent->cmd = cmd; + fuse_uring_ent_avail(ent, queue); + spin_unlock(&queue->lock); + + if (!ring->ready) { + bool ready = is_ring_ready(ring, queue->qid); + + if (ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); + } + } +} + +/* + * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] + * the payload + */ +static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, + struct iovec iov[FUSE_URING_IOV_SEGS]) +{ + struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); + struct iov_iter iter; + ssize_t ret; + + if (sqe->len != FUSE_URING_IOV_SEGS) + return -EINVAL; + + /* + * Direction for buffer access will actually be READ and WRITE, + * using write for the import should include READ access as well. 
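+ * The registered buffers are later filled via copy_to_user()/ITER_DEST in fuse_uring_args_to_ring() and read back via ITER_SOURCE in fuse_uring_copy_from_ring().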
+ */ + ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, + FUSE_URING_IOV_SEGS, &iov, &iter); + if (ret < 0) + return ret; + + return 0; +} + +static struct fuse_ring_ent * +fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent; + size_t payload_size; + struct iovec iov[FUSE_URING_IOV_SEGS]; + int err; + + err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); + if (err) { + pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", + err); + return ERR_PTR(err); + } + + err = -EINVAL; + if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { + pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); + return ERR_PTR(err); + } + + payload_size = iov[1].iov_len; + if (payload_size < ring->max_payload_sz) { + pr_info_ratelimited("Invalid req payload len %zu\n", + payload_size); + return ERR_PTR(err); + } + + err = -ENOMEM; + ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); + if (!ent) + return ERR_PTR(err); + + INIT_LIST_HEAD(&ent->list); + + ent->queue = queue; + ent->headers = iov[0].iov_base; + ent->payload = iov[1].iov_base; + + atomic_inc(&ring->queue_refs); + return ent; +} + +/* + * Register header and payload buffer with the kernel and puts the + * entry as "ready to get fuse requests" on the queue + */ +static int fuse_uring_register(struct io_uring_cmd *cmd, + unsigned int issue_flags, struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring *ring = smp_load_acquire(&fc->ring); + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent; + int err; + unsigned int qid = READ_ONCE(cmd_req->qid); + + err = -ENOMEM; + if (!ring) { + ring = fuse_uring_create(fc); + if (!ring) + return err; + } + + if (qid >= ring->nr_queues) { + pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); + return -EINVAL; + } + + queue = ring->queues[qid]; + if (!queue) { + queue = fuse_uring_create_queue(ring, qid); + if (!queue) + return err; + } + + /* + * The created queue above does not need to be destructed in + * case of entry errors below, will be done at ring destruction time. 
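+ * The queue stays reachable through ring->queues[qid] and is freed together with the ring in fuse_uring_destruct().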
+ */ + + ent = fuse_uring_create_ring_ent(cmd, queue); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + fuse_uring_do_register(ent, cmd, issue_flags); + + return 0; +} + +/* + * Entry function from io_uring to handle the given passthrough command + * (op code IORING_OP_URING_CMD) + */ +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct fuse_dev *fud; + struct fuse_conn *fc; + u32 cmd_op = cmd->cmd_op; + int err; + + if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { + fuse_uring_cancel(cmd, issue_flags); + return 0; + } + + /* This extra SQE size holds struct fuse_uring_cmd_req */ + if (!(issue_flags & IO_URING_F_SQE128)) + return -EINVAL; + + fud = fuse_get_dev(cmd->file); + if (!fud) { + pr_info_ratelimited("No fuse device found\n"); + return -ENOTCONN; + } + fc = fud->fc; + + /* Once a connection has io-uring enabled on it, it can't be disabled */ + if (!enable_uring && !fc->io_uring) { + pr_info_ratelimited("fuse-io-uring is disabled\n"); + return -EOPNOTSUPP; + } + + if (fc->aborted) + return -ECONNABORTED; + if (!fc->connected) + return -ENOTCONN; + + /* + * fuse_uring_register() needs the ring to be initialized, + * we need to know the max payload size + */ + if (!fc->initialized) + return -EAGAIN; + + switch (cmd_op) { + case FUSE_IO_URING_CMD_REGISTER: + err = fuse_uring_register(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", + err); + fc->io_uring = 0; + wake_up_all(&fc->blocked_waitq); + return err; + } + break; + case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: + err = fuse_uring_commit_fetch(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", + err); + return err; + } + break; + default: + return -EINVAL; + } + + return -EIOCBQUEUED; +} + +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + +/* + * This prepares and sends the ring request in fuse-uring task context. + * User buffers are not mapped yet - the application does not have permission + * to write to it - this has to be executed in ring task context. 
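+ * If the ring task is already dying (IO_URING_F_TASK_DEAD), the command is completed with -ECANCELED instead of copying the request.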
+ */ +static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue = ent->queue; + int err; + + if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + err = fuse_uring_prepare_send(ent, ent->fuse_req); + if (err) { + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return; + } + } else { + err = -ECANCELED; + } + + fuse_uring_send(ent, cmd, err, issue_flags); +} + +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +{ + unsigned int qid; + struct fuse_ring_queue *queue; + + qid = task_cpu(current); + + if (WARN_ONCE(qid >= ring->nr_queues, + "Core number (%u) exceeds nr queues (%zu)\n", qid, + ring->nr_queues)) + qid = 0; + + queue = ring->queues[qid]; + WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + + return queue; +} + +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +{ + struct io_uring_cmd *cmd = ent->cmd; + + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); +} + +/* queue a fuse request and send it if a ring entry is available */ +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + int err; + + err = -EINVAL; + queue = fuse_uring_task_to_queue(ring); + if (!queue) + goto err; + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + + spin_lock(&queue->lock); + err = -ENOTCONN; + if (unlikely(queue->stopped)) + goto err_unlock; + + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + if (ent) + fuse_uring_add_req_to_ring_ent(ent, req); + else + list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); + + if (ent) + fuse_uring_dispatch_ent(ent); + + return; + +err_unlock: + spin_unlock(&queue->lock); +err: + req->out.h.error = err; + clear_bit(FR_PENDING, &req->flags); + fuse_request_end(req); +} + +bool fuse_uring_queue_bq_req(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + + queue = fuse_uring_task_to_queue(ring); + if (!queue) + return false; + + spin_lock(&queue->lock); + if (unlikely(queue->stopped)) { + spin_unlock(&queue->lock); + return false; + } + + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; + list_add_tail(&req->list, &queue->fuse_req_bg_queue); + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + spin_lock(&fc->bg_lock); + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + + /* + * Due to bg_queue flush limits there might be other bg requests + * in the queue that need to be handled first. Or no further req + * might be available. 
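+ * Hence pick the head of fuse_req_queue (filled by fuse_uring_flush_bg()) rather than the request that was just added to the bg queue.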
+ */ + req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, + list); + if (ent && req) { + fuse_uring_add_req_to_ring_ent(ent, req); + spin_unlock(&queue->lock); + + fuse_uring_dispatch_ent(ent); + } else { + spin_unlock(&queue->lock); + } + + return true; +} + +bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + struct fuse_ring_queue *queue = req->ring_queue; + + return fuse_remove_pending_req(req, &queue->lock); +} + +static const struct fuse_iqueue_ops fuse_io_uring_ops = { + /* should be sent over io-uring as an enhancement */ + .send_forget = fuse_dev_queue_forget, + + /* + * could be sent over io-uring, but interrupts should be rare, + * no need to make the code complex + */ + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_uring_queue_fuse_req, +}; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h new file mode 100644 index 000000000000..51a563922ce1 --- /dev/null +++ b/fs/fuse/dev_uring_i.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#ifndef _FS_FUSE_DEV_URING_I_H +#define _FS_FUSE_DEV_URING_I_H + +#include "fuse_i.h" + +#ifdef CONFIG_FUSE_IO_URING + +#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ) +#define FUSE_URING_TEARDOWN_INTERVAL (HZ/20) + +enum fuse_ring_req_state { + FRRS_INVALID = 0, + + /* The ring entry was received from userspace and is being processed */ + FRRS_COMMIT, + + /* The ring entry is waiting for new fuse requests */ + FRRS_AVAILABLE, + + /* The ring entry got assigned a fuse req */ + FRRS_FUSE_REQ, + + /* The ring entry is in or on the way to user space */ + FRRS_USERSPACE, + + /* The ring entry is in teardown */ + FRRS_TEARDOWN, + + /* The ring entry is released, but not freed yet */ + FRRS_RELEASED, +}; + +/** A fuse ring entry, part of the ring queue */ + struct fuse_ring_ent { + /* userspace buffer */ + struct fuse_uring_req_header __user *headers; + void __user *payload; + + /* the ring queue that owns the request */ + struct fuse_ring_queue *queue; + + /* fields below are protected by queue->lock */ + + struct io_uring_cmd *cmd; + + struct list_head list; + + enum fuse_ring_req_state state; + + struct fuse_req *fuse_req; +}; + +struct fuse_ring_queue { + /* + * back pointer to the main fuse uring structure that holds this + * queue + */ + struct fuse_ring *ring; + + /* queue id, corresponds to the cpu core */ + unsigned int qid; + + /* + * queue lock, taken when any value in the queue changes _and_ also + * a ring entry state changes.
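+ * The lock is always dropped before io_uring_cmd_done() is called, to avoid lock ordering issues (see fuse_uring_cancel()).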
+ */ + spinlock_t lock; + + /* available ring entries (struct fuse_ring_ent) */ + struct list_head ent_avail_queue; + + /* + * entries in the process of being committed or in the process + * to be sent to userspace + */ + struct list_head ent_w_req_queue; + struct list_head ent_commit_queue; + + /* entries in userspace */ + struct list_head ent_in_userspace; + + /* entries that are released */ + struct list_head ent_released; + + /* fuse requests waiting for an entry slot */ + struct list_head fuse_req_queue; + + /* background fuse requests */ + struct list_head fuse_req_bg_queue; + + struct fuse_pqueue fpq; + + unsigned int active_background; + + bool stopped; +}; + +/** + * Describes if uring is for communication and holds alls the data needed + * for uring communication + */ +struct fuse_ring { + /* back pointer */ + struct fuse_conn *fc; + + /* number of ring queues */ + size_t nr_queues; + + /* maximum payload/arg size */ + size_t max_payload_sz; + + struct fuse_ring_queue **queues; + + /* + * Log ring entry states on stop when entries cannot be released + */ + unsigned int stop_debug_log : 1; + + wait_queue_head_t stop_waitq; + + /* async tear down */ + struct delayed_work async_teardown_work; + + /* log */ + unsigned long teardown_time; + + atomic_t queue_refs; + + bool ready; +}; + +bool fuse_uring_enabled(void); +void fuse_uring_destruct(struct fuse_conn *fc); +void fuse_uring_stop_queues(struct fuse_ring *ring); +void fuse_uring_abort_end_requests(struct fuse_ring *ring); +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_uring_queue_bq_req(struct fuse_req *req); +bool fuse_uring_remove_pending_req(struct fuse_req *req); +bool fuse_uring_request_expired(struct fuse_conn *fc); + +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring == NULL) + return; + + if (atomic_read(&ring->queue_refs) > 0) { + fuse_uring_abort_end_requests(ring); + fuse_uring_stop_queues(ring); + } +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring) + wait_event(ring->stop_waitq, + atomic_read(&ring->queue_refs) == 0); +} + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return fc->ring && fc->ring->ready; +} + +#else /* CONFIG_FUSE_IO_URING */ + +static inline void fuse_uring_destruct(struct fuse_conn *fc) +{ +} + +static inline bool fuse_uring_enabled(void) +{ + return false; +} + +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ +} + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return false; +} + +static inline bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + return false; +} + +static inline bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + return false; +} + +#endif /* CONFIG_FUSE_IO_URING */ + +#endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4a6df591add6..7d7ed45cb3e9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -175,9 +175,12 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; - args->in_numargs = 1; - args->in_args[0].size = name->len + 1; - args->in_args[0].value = name->name; + args->in_numargs = 3; + fuse_set_zero_arg0(args); + 
args->in_args[1].size = name->len; + args->in_args[1].value = name->name; + args->in_args[2].size = 1; + args->in_args[2].value = ""; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; @@ -192,10 +195,10 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ -static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) +static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *entry, unsigned int flags) { struct inode *inode; - struct dentry *parent; struct fuse_mount *fm; struct fuse_inode *fi; int ret; @@ -227,11 +230,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version = fuse_get_attr_version(fm->fc); - parent = dget_parent(entry); - fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), - &entry->d_name, &outarg); + fuse_lookup_init(fm->fc, &args, get_node_id(dir), + name, &outarg); ret = fuse_simple_request(fm, &args); - dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; @@ -265,9 +266,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { - parent = dget_parent(entry); - fuse_advise_use_readdirplus(d_inode(parent)); - dput(parent); + fuse_advise_use_readdirplus(dir); } } ret = 1; @@ -320,9 +319,6 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) /* Create the submount */ mnt = fc_mount(fsc); - if (!IS_ERR(mnt)) - mntget(mnt); - put_fs_context(fsc); return mnt; } @@ -366,12 +362,12 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; - u64 attr_version; + u64 attr_version, evict_ctr; int err; *inode = NULL; err = -ENAMETOOLONG; - if (name->len > FUSE_NAME_MAX) + if (name->len > fm->fc->name_max) goto out; @@ -381,6 +377,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out; attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); err = fuse_simple_request(fm, &args); @@ -398,7 +395,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), - attr_version); + attr_version, evict_ctr); err = -ENOMEM; if (!*inode) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); @@ -466,29 +463,29 @@ static int get_security_context(struct dentry *entry, umode_t mode, { struct fuse_secctx *fctx; struct fuse_secctx_header *header; - void *ctx = NULL, *ptr; - u32 ctxlen, total_len = sizeof(*header); + struct lsm_context lsmctx = { }; + void *ptr; + u32 total_len = sizeof(*header); int err, nr_ctx = 0; - const char *name; + const char *name = NULL; size_t namelen; err = security_dentry_init_security(entry, mode, &entry->d_name, - &name, &ctx, &ctxlen); - if (err) { - if (err != -EOPNOTSUPP) - goto out_err; - /* No LSM is supporting this security hook. 
Ignore error */ - ctxlen = 0; - ctx = NULL; - } + &name, &lsmctx); + + /* If no LSM is supporting this security hook ignore error */ + if (err && err != -EOPNOTSUPP) + goto out_err; - if (ctxlen) { + if (lsmctx.len) { nr_ctx = 1; namelen = strlen(name) + 1; err = -EIO; - if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) + if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || + lsmctx.len > S32_MAX)) goto out_err; - total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + + lsmctx.len); } err = -ENOMEM; @@ -501,19 +498,20 @@ static int get_security_context(struct dentry *entry, umode_t mode, ptr += sizeof(*header); if (nr_ctx) { fctx = ptr; - fctx->size = ctxlen; + fctx->size = lsmctx.len; ptr += sizeof(*fctx); strcpy(ptr, name); ptr += namelen; - memcpy(ptr, ctx, ctxlen); + memcpy(ptr, lsmctx.context, lsmctx.len); } ext->size = total_len; ext->value = header; err = 0; out_err: - kfree(ctx); + if (nr_ctx) + security_release_secctx(&lsmctx); return err; } @@ -545,17 +543,21 @@ static u32 fuse_ext_size(size_t size) /* * This adds just a single supplementary group that matches the parent's group. */ -static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext) +static int get_create_supp_group(struct mnt_idmap *idmap, + struct inode *dir, + struct fuse_in_arg *ext) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_ext_header *xh; struct fuse_supp_groups *sg; kgid_t kgid = dir->i_gid; + vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, kgid); gid_t parent_gid = from_kgid(fc->user_ns, kgid); + u32 sg_len = fuse_ext_size(sizeof(*sg) + sizeof(sg->groups[0])); - if (parent_gid == (gid_t) -1 || gid_eq(kgid, current_fsgid()) || - !in_group_p(kgid)) + if (parent_gid == (gid_t) -1 || vfsgid_eq_kgid(vfsgid, current_fsgid()) || + !vfsgid_in_group_p(vfsgid)) return 0; xh = extend_arg(ext, sg_len); @@ -572,7 +574,8 @@ static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext) return 0; } -static int get_create_ext(struct fuse_args *args, +static int get_create_ext(struct mnt_idmap *idmap, + struct fuse_args *args, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -583,7 +586,7 @@ static int get_create_ext(struct fuse_args *args, if (fc->init_security) err = get_security_context(dentry, mode, &ext); if (!err && fc->create_supp_group) - err = get_create_supp_group(dir, &ext); + err = get_create_supp_group(idmap, dir, &ext); if (!err && ext.size) { WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); @@ -609,9 +612,9 @@ static void free_ext_value(struct fuse_args *args) * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. 
*/ -static int fuse_create_open(struct inode *dir, struct dentry *entry, - struct file *file, unsigned int flags, - umode_t mode, u32 opcode) +static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, struct file *file, + unsigned int flags, umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -668,11 +671,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[1].size = sizeof(*outopenp); args.out_args[1].value = outopenp; - err = get_create_ext(&args, dir, entry, mode); + err = get_create_ext(idmap, &args, dir, entry, mode); if (err) - goto out_put_forget_req; + goto out_free_ff; - err = fuse_simple_request(fm, &args); + err = fuse_simple_idmap_request(idmap, fm, &args); free_ext_value(&args); if (err) goto out_free_ff; @@ -686,7 +689,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, ff->nodeid = outentry.nodeid; ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, - &outentry.attr, ATTR_TIMEOUT(&outentry), 0); + &outentry.attr, ATTR_TIMEOUT(&outentry), 0, 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); @@ -729,6 +732,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, umode_t mode) { int err; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; @@ -753,7 +757,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); + err = fuse_create_open(idmap, dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -764,7 +768,7 @@ out_dput: return err; mknod: - err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); + err = fuse_mknod(idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -774,9 +778,9 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, - struct inode *dir, struct dentry *entry, - umode_t mode) +static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -785,11 +789,11 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct fuse_forget_link *forget; if (fuse_is_bad(dir)) - return -EIO; + return ERR_PTR(-EIO); forget = fuse_alloc_forget(); if (!forget) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memset(&outarg, 0, sizeof(outarg)); args->nodeid = get_node_id(dir); @@ -798,12 +802,12 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].value = &outarg; if (args->opcode != FUSE_LINK) { - err = get_create_ext(args, dir, entry, mode); + err = get_create_ext(idmap, args, dir, entry, mode); if (err) goto out_put_forget_req; } - err = fuse_simple_request(fm, args); + err = fuse_simple_idmap_request(idmap, fm, args); free_ext_value(args); if (err) goto out_put_forget_req; @@ -816,32 +820,46 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, ATTR_TIMEOUT(&outarg), 0); + &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); - 
return -ENOMEM; + return ERR_PTR(-ENOMEM); } kfree(forget); d_drop(entry); d = d_splice_alias(inode, entry); if (IS_ERR(d)) - return PTR_ERR(d); + return d; - if (d) { + if (d) fuse_change_entry_timeout(d, &outarg); - dput(d); - } else { + else fuse_change_entry_timeout(entry, &outarg); - } fuse_dir_changed(dir); - return 0; + return d; out_put_forget_req: if (err == -EEXIST) fuse_invalidate_entry(entry); kfree(forget); - return err; + return ERR_PTR(err); +} + +static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) +{ + /* + * Note that when creating anything other than a directory we + * can be sure create_new_entry() will NOT return an alternate + * dentry as d_splice_alias() only returns an alternate dentry + * for directories. So we don't need to check for that case + * when passing back the result. + */ + WARN_ON_ONCE(S_ISDIR(mode)); + + return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode)); } static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -864,13 +882,13 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fm, &args, dir, entry, mode); + return create_new_nondir(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); + return fuse_mknod(idmap, dir, entry, mode, 0); } static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, @@ -882,7 +900,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (fc->no_tmpfile) return -EOPNOTSUPP; - err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + err = fuse_create_open(idmap, dir, file->f_path.dentry, file, + file->f_flags, mode, FUSE_TMPFILE); if (err == -ENOSYS) { fc->no_tmpfile = 1; err = -EOPNOTSUPP; @@ -890,8 +909,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, return err; } -static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -909,7 +928,7 @@ static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fm, &args, dir, entry, S_IFDIR); + return create_new_entry(idmap, fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, @@ -920,12 +939,13 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, FUSE_ARGS(args); args.opcode = FUSE_SYMLINK; - args.in_numargs = 2; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; - args.in_args[1].size = len; - args.in_args[1].value = link; - return create_new_entry(fm, &args, dir, entry, S_IFLNK); + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.in_args[2].size = len; + args.in_args[2].value = link; + return create_new_nondir(idmap, fm, &args, dir, entry, 
S_IFLNK); } void fuse_flush_time_update(struct inode *inode) @@ -984,9 +1004,10 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); @@ -1007,9 +1028,10 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); @@ -1019,7 +1041,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, +static int fuse_rename_common(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags, int opcode, size_t argsize) { @@ -1040,7 +1062,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(fm, &args); + err = fuse_simple_idmap_request(idmap, fm, &args); if (!err) { /* ctime changes */ fuse_update_ctime(d_inode(oldent)); @@ -1086,7 +1108,8 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, if (fc->no_rename2 || fc->minor < 23) return -EINVAL; - err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + err = fuse_rename_common((flags & RENAME_WHITEOUT) ? 
idmap : &invalid_mnt_idmap, + olddir, oldent, newdir, newent, flags, FUSE_RENAME2, sizeof(struct fuse_rename2_in)); if (err == -ENOSYS) { @@ -1094,7 +1117,7 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, err = -EINVAL; } } else { - err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + err = fuse_rename_common(&invalid_mnt_idmap, olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); } @@ -1111,6 +1134,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); + if (fm->fc->no_link) + goto out; + memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.opcode = FUSE_LINK; @@ -1119,27 +1145,37 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); + err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) fuse_invalidate_attr(inode); + if (err == -ENOSYS) + fm->fc->no_link = 1; +out: + if (fm->fc->no_link) + return -EPERM; + return err; } -static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, - struct kstat *stat) +static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, + struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); + vfsuid_t vfsuid = make_vfsuid(idmap, fc->user_ns, + make_kuid(fc->user_ns, attr->uid)); + vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, + make_kgid(fc->user_ns, attr->gid)); stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = make_kuid(fc->user_ns, attr->uid); - stat->gid = make_kgid(fc->user_ns, attr->gid); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -1178,8 +1214,8 @@ static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) attr->blksize = sx->blksize; } -static int fuse_do_statx(struct inode *inode, struct file *file, - struct kstat *stat) +static int fuse_do_statx(struct mnt_idmap *idmap, struct inode *inode, + struct file *file, struct kstat *stat) { int err; struct fuse_attr attr; @@ -1232,15 +1268,15 @@ static int fuse_do_statx(struct inode *inode, struct file *file, stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); stat->btime.tv_sec = sx->btime.tv_sec; stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); - fuse_fillattr(inode, &attr, stat); + fuse_fillattr(idmap, inode, &attr, stat); stat->result_mask |= STATX_TYPE; } return 0; } -static int fuse_do_getattr(struct inode *inode, struct kstat *stat, - struct file *file) +static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, + struct kstat *stat, struct file *file) { int err; struct fuse_getattr_in inarg; @@ -1279,15 +1315,15 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, ATTR_TIMEOUT(&outarg), attr_version); if (stat) - fuse_fillattr(inode, &outarg.attr, stat); + fuse_fillattr(idmap, inode, &outarg.attr, stat); } } return err; } -static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat, u32 
request_mask, - unsigned int flags) +static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, + struct file *file, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1318,16 +1354,17 @@ retry: forget_all_cached_acls(inode); /* Try statx if BTIME is requested */ if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { - err = fuse_do_statx(inode, file, stat); + err = fuse_do_statx(idmap, inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; + err = 0; goto retry; } } else { - err = fuse_do_getattr(inode, stat, file); + err = fuse_do_getattr(idmap, inode, stat, file); } } else if (stat) { - generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; if (test_bit(FUSE_I_BTIME, &fi->state)) { @@ -1341,7 +1378,7 @@ retry: int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { - return fuse_update_get_attr(inode, file, NULL, mask, 0); + return fuse_update_get_attr(&nop_mnt_idmap, inode, file, NULL, mask, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, @@ -1461,6 +1498,14 @@ static int fuse_access(struct inode *inode, int mask) BUG_ON(mask & MAY_NOT_BLOCK); + /* + * We should not send FUSE_ACCESS to the userspace + * when idmapped mounts are enabled as for this case + * we have fc->default_permissions = 1 and access + * permission checks are done on the kernel side. + */ + WARN_ON_ONCE(!(fm->sb->s_iflags & SB_I_NOIDMAP)); + if (fm->fc->no_access) return 0; @@ -1485,7 +1530,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) return -ECHILD; forget_all_cached_acls(inode); - return fuse_do_getattr(inode, NULL, NULL); + return fuse_do_getattr(&nop_mnt_idmap, inode, NULL, NULL); } /* @@ -1533,7 +1578,7 @@ static int fuse_permission(struct mnt_idmap *idmap, } if (fc->default_permissions) { - err = generic_permission(&nop_mnt_idmap, inode, mask); + err = generic_permission(idmap, inode, mask); /* If permission is denied, try to refresh file attributes. 
This is also needed, because the root @@ -1541,7 +1586,7 @@ static int fuse_permission(struct mnt_idmap *idmap, if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(&nop_mnt_idmap, + err = generic_permission(idmap, inode, mask); } @@ -1564,13 +1609,13 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct page *page) +static int fuse_readlink_page(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { - .num_pages = 1, - .pages = &page, + .num_folios = 1, + .folios = &folio, .descs = &desc, }; char *link; @@ -1593,7 +1638,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) if (WARN_ON(res >= PAGE_SIZE)) return -EIO; - link = page_address(page); + link = folio_address(folio); link[res] = '\0'; return 0; @@ -1603,7 +1648,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); - struct page *page; + struct folio *folio; int err; err = -EIO; @@ -1611,26 +1656,26 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; if (fc->cache_symlinks) - return page_get_link(dentry, inode, callback); + return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) goto out_err; - page = alloc_page(GFP_KERNEL); + folio = folio_alloc(GFP_KERNEL, 0); err = -ENOMEM; - if (!page) + if (!folio) goto out_err; - err = fuse_readlink_page(inode, page); + err = fuse_readlink_page(inode, folio); if (err) { - __free_page(page); + folio_put(folio); goto out_err; } - set_delayed_call(callback, page_put_link, page); + set_delayed_call(callback, page_put_link, folio); - return page_address(page); + return folio_address(folio); out_err: return ERR_PTR(err); @@ -1659,6 +1704,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file) */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; @@ -1737,17 +1784,29 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) return true; } -static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, - struct fuse_setattr_in *arg, bool trust_local_cmtime) +static void iattr_to_fattr(struct mnt_idmap *idmap, struct fuse_conn *fc, + struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; - if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); - if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); + + if (ivalid & ATTR_UID) { + kuid_t fsuid = from_vfsuid(idmap, fc->user_ns, iattr->ia_vfsuid); + + arg->valid |= FATTR_UID; + arg->uid = from_kuid(fc->user_ns, fsuid); + } + + if (ivalid & ATTR_GID) { + kgid_t fsgid = from_vfsgid(idmap, fc->user_ns, iattr->ia_vfsgid); + + arg->valid |= FATTR_GID; + arg->gid = from_kgid(fc->user_ns, fsgid); + } + if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { @@ -1867,8 +1926,8 @@ int fuse_flush_times(struct inode *inode, 
struct fuse_file *ff) * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ -int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - struct file *file) +int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); struct fuse_mount *fm = get_fuse_mount(inode); @@ -1888,7 +1947,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(&nop_mnt_idmap, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -1901,7 +1960,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (FUSE_IS_DAX(inode) && is_truncate) { filemap_invalidate_lock(mapping); fault_blocked = true; - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) { filemap_invalidate_unlock(mapping); return err; @@ -1947,7 +2006,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); + iattr_to_fattr(idmap, fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1995,7 +2054,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), - fuse_get_cache_mask(inode)); + fuse_get_cache_mask(inode), 0); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate) @@ -2064,7 +2123,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. 
*/ - ret = fuse_do_getattr(inode, NULL, file); + ret = fuse_do_getattr(idmap, inode, NULL, file); if (ret) return ret; @@ -2082,7 +2141,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, if (!attr->ia_valid) return 0; - ret = fuse_do_setattr(entry, attr, file); + ret = fuse_do_setattr(idmap, entry, attr, file); if (!ret) { /* * If filesystem supports acls it may have updated acl xattrs in @@ -2121,7 +2180,7 @@ static int fuse_getattr(struct mnt_idmap *idmap, return -EACCES; } - return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); + return fuse_update_get_attr(idmap, inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { @@ -2198,7 +2257,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, &folio->page); + int err = fuse_readlink_page(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a56e7bffd000..6f19a4daa559 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -253,7 +253,7 @@ static int fuse_open(struct inode *inode, struct file *file) if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out_inode_unlock; } @@ -436,7 +436,7 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_pages) + if (idx_from >= curr_index + wpa->ia.ap.num_folios) n = n->rb_right; else if (idx_to < curr_index) n = n->rb_left; @@ -448,9 +448,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, /* * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. */ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, pgoff_t idx_to) @@ -458,6 +455,9 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, struct fuse_inode *fi = get_fuse_inode(inode); bool found; + if (RB_EMPTY_ROOT(&fi->writepages)) + return false; + spin_lock(&fi->lock); found = fuse_find_writeback(fi, idx_from, idx_to); spin_unlock(&fi->lock); @@ -483,6 +483,21 @@ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); } +static inline bool fuse_folio_is_writeback(struct inode *inode, + struct folio *folio) +{ + pgoff_t last = folio_next_index(folio) - 1; + return fuse_range_is_writeback(inode, folio->index, last); +} + +static void fuse_wait_on_folio_writeback(struct inode *inode, + struct folio *folio) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); +} + /* * Wait for all pending writepages on the inode to finish. 
* @@ -645,17 +660,20 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_args_pages *ap, +static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, bool should_dirty) { unsigned int i; - for (i = 0; i < ap->num_pages; i++) { + for (i = 0; i < ap->num_folios; i++) { if (should_dirty) - set_page_dirty_lock(ap->pages[i]); + folio_mark_dirty_lock(ap->folios[i]); if (ap->args.is_pinned) - unpin_user_page(ap->pages[i]); + unpin_folio(ap->folios[i]); } + + if (nres > 0 && ap->args.invalidate_vmap) + invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -725,16 +743,16 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) } static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, - unsigned int npages) + unsigned int nfolios) { struct fuse_io_args *ia; ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (ia) { ia->io = io; - ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, - &ia->ap.descs); - if (!ia->ap.pages) { + ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.folios) { kfree(ia); ia = NULL; } @@ -744,7 +762,7 @@ static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, static void fuse_io_free(struct fuse_io_args *ia) { - kfree(ia->ap.pages); + kfree(ia->ap.folios); kfree(ia); } @@ -754,25 +772,29 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - - fuse_release_user_pages(&ia->ap, io->should_dirty); + size_t nres; if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; - } else if (ia->write.in.size != ia->write.out.size) { - pos = ia->write.in.offset - io->offset + - ia->write.out.size; + } else { + nres = ia->write.out.size; + if (ia->write.in.size != ia->write.out.size) + pos = ia->write.in.offset - io->offset + + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; + nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } + fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); + fuse_aio_complete(io, err, pos); fuse_io_free(ia); } @@ -843,33 +865,33 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, * reached the client fs yet. So the hole is not present there. 
*/ if (!fc->writeback_cache) { - loff_t pos = page_offset(ap->pages[0]) + num_read; + loff_t pos = folio_pos(ap->folios[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } -static int fuse_do_readpage(struct file *file, struct page *page) +static int fuse_do_readfolio(struct file *file, struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); - loff_t pos = page_offset(page); - struct fuse_page_desc desc = { .length = PAGE_SIZE }; + loff_t pos = folio_pos(folio); + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, - .ap.num_pages = 1, - .ap.pages = &page, + .ap.num_folios = 1, + .ap.folios = &folio, .ap.descs = &desc, }; ssize_t res; u64 attr_ver; /* - * Page writeback can extend beyond the lifetime of the - * page-cache page, so make sure we read a properly synced - * page. + * With the temporary pages that are used to complete writeback, we can + * have writeback that extends beyond the lifetime of the folio. So + * make sure we read a properly synced folio. */ - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); attr_ver = fuse_get_attr_version(fm->fc); @@ -887,25 +909,24 @@ static int fuse_do_readpage(struct file *file, struct page *page) if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } static int fuse_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int err; err = -EIO; if (fuse_is_bad(inode)) goto out; - err = fuse_do_readpage(file, page); + err = fuse_do_readfolio(file, folio); fuse_invalidate_atime(inode); out: - unlock_page(page); + folio_unlock(folio); return err; } @@ -919,8 +940,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < ap->num_pages; i++) - mapping = ap->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_folios; i++) + mapping = ap->folios[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -934,15 +955,9 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; - - if (!err) - SetPageUptodate(page); - else - SetPageError(page); - unlock_page(page); - put_page(page); + for (i = 0; i < ap->num_folios; i++) { + folio_end_read(ap->folios[i], !err); + folio_put(ap->folios[i]); } if (ia->ff) fuse_file_put(ia->ff, false); @@ -955,8 +970,9 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; - loff_t pos = page_offset(ap->pages[0]); - size_t count = ap->num_pages << PAGE_SHIFT; + loff_t pos = folio_pos(ap->folios[0]); + /* Currently, all folios in FUSE are one page */ + size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -967,7 +983,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { count--; - ap->descs[ap->num_pages - 1].length--; + ap->descs[ap->num_folios - 
1].length--; } WARN_ON((loff_t) (pos + count) < 0); @@ -989,18 +1005,36 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int i, max_pages, nr_pages = 0; + unsigned int max_pages, nr_pages; + pgoff_t first = readahead_index(rac); + pgoff_t last = first + readahead_count(rac) - 1; if (fuse_is_bad(inode)) return; + wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); + max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); - for (;;) { + /* + * This is only accurate the first time through, since readahead_folio() + * doesn't update readahead_count() from the previous folio until the + * next call. Grab nr_pages here so we know how many pages we're going + * to have to process. This means that we will exit here with + * readahead_count() == folio_nr_pages(last_folio), but we will have + * consumed all of the folios, and read_pages() will call + * readahead_folio() again which will clean up the rac. + */ + nr_pages = readahead_count(rac); + + while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; + struct folio *folio; + unsigned cur_pages = min(max_pages, nr_pages); if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -1010,23 +1044,26 @@ static void fuse_readahead(struct readahead_control *rac) */ break; - nr_pages = readahead_count(rac) - nr_pages; - if (nr_pages > max_pages) - nr_pages = max_pages; - if (nr_pages == 0) - break; - ia = fuse_io_alloc(NULL, nr_pages); + ia = fuse_io_alloc(NULL, cur_pages); if (!ia) return; ap = &ia->ap; - nr_pages = __readahead_batch(rac, ap->pages, nr_pages); - for (i = 0; i < nr_pages; i++) { - fuse_wait_on_page_writeback(inode, - readahead_index(rac) + i); - ap->descs[i].length = PAGE_SIZE; + + while (ap->num_folios < cur_pages) { + /* + * This returns a folio with a ref held on it. + * The ref needs to be held until the request is + * completed, since the splice case (see + * fuse_try_move_page()) drops the ref after it's + * replaced in the page cache. 
+ */ + folio = __readahead_folio(rac); + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].length = folio_size(folio); + ap->num_folios++; } - ap->num_pages = nr_pages; fuse_send_readpages(ia, rac->file); + nr_pages -= cur_pages; } } @@ -1143,8 +1180,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, bool short_write; int err; - for (i = 0; i < ap->num_pages; i++) - fuse_wait_on_page_writeback(inode, ap->pages[i]->index); + for (i = 0; i < ap->num_folios; i++) + fuse_wait_on_folio_writeback(inode, ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1158,24 +1195,24 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; - for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; + for (i = 0; i < ap->num_folios; i++) { + struct folio *folio = ap->folios[i]; if (err) { - ClearPageUptodate(page); + folio_clear_uptodate(folio); } else { - if (count >= PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; + if (count >= folio_size(folio) - offset) + count -= folio_size(folio) - offset; else { if (short_write) - ClearPageUptodate(page); + folio_clear_uptodate(folio); count = 0; } offset = 0; } - if (ia->write.page_locked && (i == ap->num_pages - 1)) - unlock_page(page); - put_page(page); + if (ia->write.folio_locked && (i == ap->num_folios - 1)) + folio_unlock(folio); + folio_put(folio); } return err; @@ -1189,6 +1226,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); + unsigned int nr_pages = 0; size_t count = 0; int err; @@ -1197,7 +1235,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, do { size_t tmp; - struct page *page; + struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; size_t bytes = min_t(size_t, PAGE_SIZE - offset, iov_iter_count(ii)); @@ -1209,27 +1247,30 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (fault_in_iov_iter_readable(ii, bytes)) break; - err = -ENOMEM; - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); break; + } if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + flush_dcache_folio(folio); - tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); - flush_dcache_page(page); + tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + flush_dcache_folio(folio); if (!tmp) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto again; } err = 0; - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = tmp; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].length = tmp; + ap->num_folios++; + nr_pages++; count += tmp; pos += tmp; @@ -1239,18 +1280,18 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, /* If we copied full page, mark it uptodate */ if (tmp == PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); } else { - ia->write.page_locked = true; + ia->write.folio_locked = true; break; } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - ap->num_pages < 
max_pages && offset == 0); + nr_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1284,8 +1325,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); - if (!ap->pages) { + ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->folios) { err = -ENOMEM; break; } @@ -1307,7 +1348,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) err = -EIO; } } - kfree(ap->pages); + kfree(ap->folios); } while (!err && iov_iter_count(ii)); fuse_write_update_attr(inode, pos, res); @@ -1349,7 +1390,7 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from /* shared locks are not allowed with parallel page cache IO */ if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) - return false; + return true; /* Parallel dio beyond EOF is not supported, at least for now. */ if (fuse_io_past_eof(iocb, from)) @@ -1362,7 +1403,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, bool *exclusive) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_file *ff = iocb->ki_filp->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); if (*exclusive) { @@ -1377,7 +1418,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, * have raced, so check it again. */ if (fuse_io_past_eof(iocb, from) || - fuse_file_uncached_io_start(inode, ff, NULL) != 0) { + fuse_inode_uncached_io_start(fi, NULL) != 0) { inode_unlock_shared(inode); inode_lock(inode); *exclusive = true; @@ -1388,13 +1429,13 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_file *ff = iocb->ki_filp->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); if (exclusive) { inode_unlock(inode); } else { /* Allow opens in caching mode after last parallel dio end */ - fuse_file_uncached_io_end(inode, ff); + fuse_inode_uncached_io_end(fi); inode_unlock_shared(inode); } } @@ -1402,6 +1443,7 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct address_space *mapping = file->f_mapping; ssize_t written = 0; struct inode *inode = mapping->host; @@ -1416,7 +1458,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(&nop_mnt_idmap, + setattr_should_drop_suidgid(idmap, file_inode(file))) { goto writethrough; } @@ -1433,11 +1475,7 @@ writethrough: task_io_account_write(count); - err = file_remove_privs(file); - if (err) - goto out; - - err = file_update_time(file); + err = kiocb_modified(iocb); if (err) goto out; @@ -1471,52 +1509,89 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, - unsigned int max_pages) + unsigned int max_pages, + bool use_pages_for_kvec_io) { + bool flush_or_invalidate = false; + unsigned int nr_pages = 0; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly 
into the buffer */ + /* Special case for kernel I/O: can copy directly into the buffer. + * However if the implementation of fuse_conn requires pages instead of + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. + */ if (iov_iter_is_kvec(ii)) { - unsigned long user_addr = fuse_get_user_addr(ii); - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + void *user_addr = (void *)fuse_get_user_addr(ii); - if (write) - ap->args.in_args[1].value = (void *) user_addr; - else - ap->args.out_args[0].value = (void *) user_addr; + if (!use_pages_for_kvec_io) { + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; + if (write) + ap->args.in_args[1].value = user_addr; + else + ap->args.out_args[0].value = user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + if (is_vmalloc_addr(user_addr)) { + ap->args.vmap_base = user_addr; + flush_or_invalidate = true; + } + } + + /* + * Until there is support for iov_iter_extract_folios(), we have to + * manually extract pages using iov_iter_extract_pages() and then + * copy that to a folios array. + */ + struct page **pages = kzalloc(max_pages * sizeof(struct page *), + GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; } - while (nbytes < *nbytesp && ap->num_pages < max_pages) { - unsigned npages; + while (nbytes < *nbytesp && nr_pages < max_pages) { + unsigned nfolios, i; size_t start; - struct page **pt_pages; - pt_pages = &ap->pages[ap->num_pages]; - ret = iov_iter_extract_pages(ii, &pt_pages, + ret = iov_iter_extract_pages(ii, &pages, *nbytesp - nbytes, - max_pages - ap->num_pages, + max_pages - nr_pages, 0, &start); if (ret < 0) break; nbytes += ret; - ret += start; - npages = DIV_ROUND_UP(ret, PAGE_SIZE); + nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); - ap->descs[ap->num_pages].offset = start; - fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); + for (i = 0; i < nfolios; i++) { + struct folio *folio = page_folio(pages[i]); + unsigned int offset = start + + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); + unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); + + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = len; + ap->folios[ap->num_folios] = folio; + start = 0; + ret -= len; + ap->num_folios++; + } - ap->num_pages += npages; - ap->descs[ap->num_pages - 1].length -= - (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + nr_pages += nfolios; } + kfree(pages); + if (write && flush_or_invalidate) + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); + + ap->args.invalidate_vmap = !write && flush_or_invalidate; ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) @@ -1524,6 +1599,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, else ap->args.out_pages = true; +out: *nbytesp = nbytes; return ret < 0 ? 
ret : 0; @@ -1585,7 +1661,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, - max_pages); + max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; @@ -1599,7 +1675,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, nres, io->should_dirty); fuse_io_free(ia); } ia = NULL; @@ -1653,7 +1729,7 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t res; - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, to); } else { struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); @@ -1667,7 +1743,6 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; bool exclusive; @@ -1675,9 +1750,11 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) res = generic_write_checks(iocb, from); if (res > 0) { task_io_account_write(res); - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, from); } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, res); @@ -1763,30 +1840,34 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_pages; i++) - __free_page(ap->pages[i]); + for (i = 0; i < ap->num_folios; i++) + folio_put(ap->folios[i]); - if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false); + fuse_file_put(wpa->ia.ff, false); - kfree(ap->pages); + kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish(struct fuse_mount *fm, - struct fuse_writepage_args *wpa) +static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); +} + +static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_pages; i++) { - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - } + for (i = 0; i < ap->num_folios; i++) + fuse_writepage_finish_stat(inode, ap->folios[i]); + wake_up(&fi->page_waitq); } @@ -1800,7 +1881,8 @@ __acquires(fi->lock) struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; - __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + /* Currently, all folios in FUSE are one page */ + __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; int err; fi->writectr++; @@ -1833,13 +1915,15 @@ __acquires(fi->lock) out_free: fi->writectr--; rb_erase(&wpa->writepages_entry, &fi->writepages); - fuse_writepage_finish(fm, wpa); + fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - /* 
After fuse_writepage_finish() aux request list is private */ + /* After rb_erase() aux request list is private */ for (aux = wpa->next; aux; aux = next) { next = aux->next; aux->next = NULL; + fuse_writepage_finish_stat(aux->inode, + aux->ia.ap.folios[0]); fuse_writepage_free(aux); } @@ -1874,11 +1958,11 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, struct fuse_writepage_args *wpa) { pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - WARN_ON(!wpa->ia.ap.num_pages); + WARN_ON(!wpa->ia.ap.num_folios); while (*p) { struct fuse_writepage_args *curr; pgoff_t curr_index; @@ -1889,7 +1973,7 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, WARN_ON(curr->inode != wpa->inode); curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + curr->ia.ap.num_pages) + if (idx_from >= curr_index + curr->ia.ap.num_folios) p = &(*p)->rb_right; else if (idx_to < curr_index) p = &(*p)->rb_left; @@ -1934,7 +2018,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, wpa->next = next->next; next->next = NULL; - next->ia.ff = fuse_file_get(wpa->ia.ff); tree_insert(&fi->writepages, next); /* @@ -1963,7 +2046,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, fuse_send_writepage(fm, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fm, wpa); + fuse_writepage_finish(wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } @@ -2022,9 +2105,9 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) wpa = kzalloc(sizeof(*wpa), GFP_NOFS); if (wpa) { ap = &wpa->ia.ap; - ap->num_pages = 0; - ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); - if (!ap->pages) { + ap->num_folios = 0; + ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->folios) { kfree(wpa); wpa = NULL; } @@ -2047,49 +2130,77 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, rcu_read_unlock(); } +static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, + struct folio *tmp_folio, uint32_t folio_index) +{ + struct inode *inode = folio->mapping->host; + struct fuse_args_pages *ap = &wpa->ia.ap; + + folio_copy(tmp_folio, folio); + + ap->folios[folio_index] = tmp_folio; + ap->descs[folio_index].offset = 0; + ap->descs[folio_index].length = PAGE_SIZE; + + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); +} + +static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, + struct fuse_file *ff) +{ + struct inode *inode = folio->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + + wpa = fuse_writepage_args_alloc(); + if (!wpa) + return NULL; + + fuse_writepage_add_to_bucket(fc, wpa); + fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->inode = inode; + wpa->ia.ff = ff; + + ap = &wpa->ia.ap; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + + return wpa; +} + static int fuse_writepage_locked(struct folio *folio) { struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = 
get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; struct folio *tmp_folio; + struct fuse_file *ff; int error = -ENOMEM; - folio_start_writeback(folio); - - wpa = fuse_writepage_args_alloc(); - if (!wpa) - goto err; - ap = &wpa->ia.ap; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); if (!tmp_folio) - goto err_free; + goto err; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fi); - if (!wpa->ia.ff) + ff = fuse_write_file_get(fi); + if (!ff) goto err_nofile; - fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0); + wpa = fuse_writepage_args_setup(folio, ff); + error = -ENOMEM; + if (!wpa) + goto err_writepage_args; - folio_copy(tmp_folio, folio); - wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; - ap->args.in_pages = true; - ap->num_pages = 1; - ap->pages[0] = &tmp_folio->page; - ap->descs[0].offset = 0; - ap->descs[0].length = PAGE_SIZE; - ap->args.end = fuse_writepage_end; - wpa->inode = inode; + ap = &wpa->ia.ap; + ap->num_folios = 1; - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); + folio_start_writeback(folio); + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); spin_lock(&fi->lock); tree_insert(&fi->writepages, wpa); @@ -2101,13 +2212,12 @@ static int fuse_writepage_locked(struct folio *folio) return 0; +err_writepage_args: + fuse_file_put(ff, false); err_nofile: folio_put(tmp_folio); -err_free: - kfree(wpa); err: mapping_set_error(folio->mapping, error); - folio_end_writeback(folio); return error; } @@ -2115,32 +2225,32 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct page **orig_pages; - unsigned int max_pages; + struct folio **orig_folios; + unsigned int max_folios; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) { struct fuse_args_pages *ap = &data->wpa->ia.ap; struct fuse_conn *fc = get_fuse_conn(data->inode); - struct page **pages; - struct fuse_page_desc *descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, data->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int nfolios = min_t(unsigned int, + max_t(unsigned int, data->max_folios * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), fc->max_pages); - WARN_ON(npages <= data->max_pages); + WARN_ON(nfolios <= data->max_folios); - pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); - if (!pages) + folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); + if (!folios) return false; - memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); - memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); - kfree(ap->pages); - ap->pages = pages; + memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); + memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios); + kfree(ap->folios); + ap->folios = folios; ap->descs = descs; - data->max_pages = npages; + data->max_folios = nfolios; return true; } @@ -2150,17 +2260,16 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = wpa->ia.ap.num_pages; + int num_folios = wpa->ia.ap.num_folios; int i; - wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); 
spin_unlock(&fi->lock); - for (i = 0; i < num_pages; i++) - end_page_writeback(data->orig_pages[i]); + for (i = 0; i < num_folios; i++) + folio_end_writeback(data->orig_folios[i]); } /* @@ -2171,15 +2280,15 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * swapping the new temp page with the old one. */ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct page *page) + struct folio *folio) { struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); struct fuse_writepage_args *tmp; struct fuse_writepage_args *old_wpa; struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_ap->num_pages != 0); - new_ap->num_pages = 1; + WARN_ON(new_ap->num_folios != 0); + new_ap->num_folios = 1; spin_lock(&fi->lock); old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); @@ -2193,9 +2302,9 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, WARN_ON(tmp->inode != new_wpa->inode); curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == page->index) { - WARN_ON(tmp->ia.ap.num_pages != 1); - swap(tmp->ia.ap.pages[0], new_ap->pages[0]); + if (curr_index == folio->index) { + WARN_ON(tmp->ia.ap.num_folios != 1); + swap(tmp->ia.ap.folios[0], new_ap->folios[0]); break; } } @@ -2208,22 +2317,19 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, spin_unlock(&fi->lock); if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); + fuse_writepage_finish_stat(new_wpa->inode, + folio); fuse_writepage_free(new_wpa); } return false; } -static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, +static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, struct fuse_args_pages *ap, struct fuse_fill_wb_data *data) { - WARN_ON(!ap->num_pages); + WARN_ON(!ap->num_folios); /* * Being under writeback is unlikely but possible. For example direct @@ -2231,23 +2337,23 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, * the pages are faulted with get_user_pages(), and then after the read * completed. */ - if (fuse_page_is_writeback(data->inode, page->index)) + if (fuse_folio_is_writeback(data->inode, folio)) return true; /* Reached max pages */ - if (ap->num_pages == fc->max_pages) + if (ap->num_folios == fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) + if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) return true; /* Discontinuity */ - if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio->index) return true; /* Need to grow the pages array? If so, did the expansion fail? 
*/ - if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) + if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data)) return true; return false; @@ -2262,7 +2368,7 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct page *tmp_page; + struct folio *tmp_folio; int err; if (!data->ff) { @@ -2272,14 +2378,14 @@ static int fuse_writepages_fill(struct folio *folio, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } err = -ENOMEM; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) goto out_unlock; /* @@ -2291,40 +2397,25 @@ static int fuse_writepages_fill(struct folio *folio, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_pages. + * request to the fi->writepages list and increment ap->num_folios. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. */ if (data->wpa == NULL) { err = -ENOMEM; - wpa = fuse_writepage_args_alloc(); + wpa = fuse_writepage_args_setup(folio, data->ff); if (!wpa) { - __free_page(tmp_page); + folio_put(tmp_folio); goto out_unlock; } - fuse_writepage_add_to_bucket(fc, wpa); - - data->max_pages = 1; - + fuse_file_get(wpa->ia.ff); + data->max_folios = 1; ap = &wpa->ia.ap; - fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); - wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; - ap->args.in_pages = true; - ap->args.end = fuse_writepage_end; - ap->num_pages = 0; - wpa->inode = inode; } folio_start_writeback(folio); - copy_highpage(tmp_page, &folio->page); - ap->pages[ap->num_pages] = tmp_page; - ap->descs[ap->num_pages].offset = 0; - ap->descs[ap->num_pages].length = PAGE_SIZE; - data->orig_pages[ap->num_pages] = &folio->page; - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); + data->orig_folios[ap->num_folios] = folio; err = 0; if (data->wpa) { @@ -2333,9 +2424,9 @@ static int fuse_writepages_fill(struct folio *folio, * fuse_page_is_writeback(). 
*/ spin_lock(&fi->lock); - ap->num_pages++; + ap->num_folios++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, &folio->page)) { + } else if (fuse_writepage_add(wpa, folio)) { data->wpa = wpa; } else { folio_end_writeback(folio); @@ -2367,21 +2458,21 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(fc->max_pages, - sizeof(struct page *), - GFP_NOFS); - if (!data.orig_pages) + data.orig_folios = kcalloc(fc->max_pages, + sizeof(struct folio *), + GFP_NOFS); + if (!data.orig_folios) goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { - WARN_ON(!data.wpa->ia.ap.num_pages); + WARN_ON(!data.wpa->ia.ap.num_folios); fuse_writepages_send(&data); } if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_pages); + kfree(data.orig_folios); out: return err; } @@ -2391,76 +2482,77 @@ out: * but how to implement it without killing performance need more thinking. */ static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct page *page; + struct folio *folio; loff_t fsize; int err = -ENOMEM; WARN_ON(!fc->writeback_cache); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, page->index); + fuse_wait_on_page_writeback(mapping->host, folio->index); - if (PageUptodate(page) || len == PAGE_SIZE) + if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* - * Check if the start this page comes after the end of file, in which - * case the readpage can be optimized away. + * Check if the start of this folio comes after the end of file, + * in which case the readpage can be optimized away. */ fsize = i_size_read(mapping->host); - if (fsize <= (pos & PAGE_MASK)) { - size_t off = pos & ~PAGE_MASK; + if (fsize <= folio_pos(folio)) { + size_t off = offset_in_folio(folio, pos); if (off) - zero_user_segment(page, 0, off); + folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, page); + err = fuse_do_readfolio(file, folio); if (err) goto cleanup; success: - *pagep = page; + *foliop = folio; return 0; cleanup: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); error: return err; } static int fuse_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; /* Haven't copied anything? Skip zeroing, size extending, dirtying. 
*/ if (!copied) goto unlock; pos += copied; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* Zero any unwritten bytes at the end of the page */ size_t endoff = pos & ~PAGE_MASK; if (endoff) - zero_user_segment(page, endoff, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, endoff, PAGE_SIZE); + folio_mark_uptodate(folio); } if (pos > inode->i_size) i_size_write(inode, pos); - set_page_dirty(page); + folio_mark_dirty(folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -2509,17 +2601,17 @@ static void fuse_vma_close(struct vm_area_struct *vma) */ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != inode->i_mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); return VM_FAULT_NOPAGE; } - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); return VM_FAULT_LOCKED; } @@ -2574,8 +2666,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) * First mmap of direct_io file enters caching inode io mode. * Also waits for parallel dio writers to go into serial mode * (exclusive instead of shared lock). + * After first mmap, the inode stays in caching io mode until + * the direct_io file release. */ - rc = fuse_file_cached_io_start(inode, ff); + rc = fuse_file_cached_io_open(inode, ff); if (rc) return rc; } @@ -2968,7 +3062,7 @@ static void fuse_do_truncate(struct file *file) attr.ia_file = file; attr.ia_valid |= ATTR_FILE; - fuse_do_setattr(file_dentry(file), &attr, file); + fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) @@ -3111,7 +3205,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); if (block_faults) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out; } diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h new file mode 100644 index 000000000000..b3c2e32254ba --- /dev/null +++ b/fs/fuse/fuse_dev_i.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + */ +#ifndef _FS_FUSE_DEV_I_H +#define _FS_FUSE_DEV_I_H + +#include <linux/types.h> + +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + +struct fuse_arg; +struct fuse_args; +struct fuse_pqueue; +struct fuse_req; +struct fuse_iqueue; +struct fuse_forget_link; + +struct fuse_copy_state { + int write; + struct fuse_req *req; + struct iov_iter *iter; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + struct page *pg; + unsigned int len; + unsigned int offset; + unsigned int move_pages:1; + unsigned int is_uring:1; + struct { + unsigned int copied_sz; /* copied size into the user buffer */ + } ring; +}; + +static inline struct fuse_dev *fuse_get_dev(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. 
+ */ + return READ_ONCE(file->private_data); +} + +unsigned int fuse_req_hash(u64 unique); +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); + +void fuse_dev_end_requests(struct list_head *head); + +void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter); +int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, + unsigned int argpages, struct fuse_arg *args, + int zeroing); +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned int nbytes); +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget); +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); + +bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list); +bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing); + +#endif + diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b24084b60864..d56d4fd956db 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -35,18 +35,38 @@ /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 -/** Maximum of max_pages received in init_out */ -#define FUSE_MAX_MAX_PAGES 256 - /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN -/** It could be as large as PATH_MAX, but would that have any uses? */ -#define FUSE_NAME_MAX 1024 +/** Maximum length of a filename, not including terminating null */ + +/* maximum, small enough for FUSE_MIN_READ_BUFFER*/ +#define FUSE_NAME_LOW_MAX 1024 +/* maximum, but needs a request buffer > FUSE_MIN_READ_BUFFER */ +#define FUSE_NAME_MAX (PATH_MAX - 1) /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 +/* Frequency (in seconds) of request timeout checks, if opted into */ +#define FUSE_TIMEOUT_TIMER_FREQ 15 + +/** Frequency (in jiffies) of request timeout checks, if opted into */ +extern const unsigned long fuse_timeout_timer_freq; + +/** Maximum of max_pages received in init_out */ +extern unsigned int fuse_max_pages_limit; +/* + * Default timeout (in seconds) for the server to reply to a request + * before the connection is aborted, if no timeout was specified on mount. + */ +extern unsigned int fuse_default_req_timeout; +/* + * Max timeout (in seconds) for the server to reply to a request before + * the connection is aborted. 
+ */ +extern unsigned int fuse_max_req_timeout; + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -285,8 +305,8 @@ struct fuse_arg { void *value; }; -/** FUSE page descriptor */ -struct fuse_page_desc { +/** FUSE folio descriptor */ +struct fuse_folio_desc { unsigned int length; unsigned int offset; }; @@ -309,16 +329,19 @@ struct fuse_args { bool may_block:1; bool is_ext:1; bool is_pinned:1; - struct fuse_in_arg in_args[3]; + bool invalidate_vmap:1; + struct fuse_in_arg in_args[4]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); + /* Used for kvec iter backed by vmalloc address */ + void *vmap_base; }; struct fuse_args_pages { struct fuse_args args; - struct page **pages; - struct fuse_page_desc *descs; - unsigned int num_pages; + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int num_folios; }; struct fuse_release_args { @@ -375,6 +398,7 @@ struct fuse_io_priv { * FR_FINISHED: request is finished * FR_PRIVATE: request is on private list * FR_ASYNC: request is asynchronous + * FR_URING: request is handled through fuse-io-uring */ enum fuse_req_flag { FR_ISREPLY, @@ -389,6 +413,7 @@ enum fuse_req_flag { FR_FINISHED, FR_PRIVATE, FR_ASYNC, + FR_URING, }; /** @@ -435,6 +460,13 @@ struct fuse_req { /** fuse_mount this request belongs to */ struct fuse_mount *fm; + +#ifdef CONFIG_FUSE_IO_URING + void *ring_entry; + void *ring_queue; +#endif + /** When (in jiffies) the request was created */ + unsigned long create_time; }; struct fuse_iqueue; @@ -449,22 +481,19 @@ struct fuse_iqueue; */ struct fuse_iqueue_ops { /** - * Signal that a forget has been queued + * Send one forget */ - void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_forget)(struct fuse_iqueue *fiq, struct fuse_forget_link *link); /** - * Signal that an INTERRUPT request has been queued + * Send interrupt for request */ - void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_interrupt)(struct fuse_iqueue *fiq, struct fuse_req *req); /** - * Signal that a request has been queued + * Send one request */ - void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_req)(struct fuse_iqueue *fiq, struct fuse_req *req); /** * Clean up when fuse_iqueue is destroyed @@ -860,6 +889,15 @@ struct fuse_conn { /** Passthrough support for read/write IO */ unsigned int passthrough:1; + /* Use pages instead of pointer for kernel I/O */ + unsigned int use_pages_for_kvec_io:1; + + /* Is link not implemented by fs? 
*/ + unsigned int no_link:1; + + /* Use io_uring for communication */ + unsigned int io_uring; + /** Maximum stack depth for passthrough backing files */ int max_stack_depth; @@ -869,7 +907,7 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Entry on the fuse_mount_list */ + /** Entry on the fuse_conn_list */ struct list_head entry; /** Device ID from the root super block */ @@ -887,6 +925,12 @@ struct fuse_conn { /** Version counter for attribute changes */ atomic64_t attr_version; + /** Version counter for evict inode */ + atomic64_t evict_ctr; + + /* maximum file name length */ + u32 name_max; + /** Called on final put */ void (*release)(struct fuse_conn *); @@ -917,6 +961,20 @@ struct fuse_conn { /** IDR for backing files ids */ struct idr backing_files_map; #endif + +#ifdef CONFIG_FUSE_IO_URING + /** uring connection information*/ + struct fuse_ring *ring; +#endif + + /** Only used if the connection opts into request timeouts */ + struct { + /* Worker for checking if any requests have timed out */ + struct delayed_work work; + + /* Request timeout (in jiffies). 0 = no timeout */ + unsigned int req_timeout; + } timeout; }; /* @@ -941,6 +999,19 @@ struct fuse_mount { struct rcu_head rcu; }; +/* + * Empty header for FUSE opcodes without specific header needs. + * Used as a placeholder in args->in_args[0] for consistency + * across all FUSE operations, simplifying request handling. + */ +struct fuse_zero_header {}; + +static inline void fuse_set_zero_arg0(struct fuse_args *args) +{ + args->in_args[0].size = sizeof(struct fuse_zero_header); + args->in_args[0].value = NULL; +} + static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; @@ -981,6 +1052,11 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline u64 fuse_get_evict_ctr(struct fuse_conn *fc) +{ + return atomic64_read(&fc->evict_ctr); +} + static inline bool fuse_stale_inode(const struct inode *inode, int generation, struct fuse_attr *attr) { @@ -998,25 +1074,25 @@ static inline bool fuse_is_bad(struct inode *inode) return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); } -static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) +static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags, + struct fuse_folio_desc **desc) { - struct page **pages; + struct folio **folios; - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) (pages + npages); + folios = kzalloc(nfolios * (sizeof(struct folio *) + + sizeof(struct fuse_folio_desc)), flags); + *desc = (void *) (folios + nfolios); - return pages; + return folios; } -static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, - unsigned int index, - unsigned int nr_pages) +static inline void fuse_folio_descs_length_init(struct fuse_folio_desc *descs, + unsigned int index, + unsigned int nr_folios) { int i; - for (i = index; i < index + nr_pages; i++) + for (i = index; i < index + nr_folios; i++) descs[i].length = PAGE_SIZE - descs[i].offset; } @@ -1040,7 +1116,8 @@ extern const struct dentry_operations fuse_root_dentry_operations; */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version); + u64 attr_valid, u64 attr_version, + u64 evict_ctr); int fuse_lookup_name(struct super_block *sb, u64 nodeid, const 
struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode); @@ -1053,10 +1130,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, - unsigned int max, - unsigned int *countp); - /* * Initialize READ or READDIR request */ @@ -1069,7 +1142,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; - bool page_locked; + bool folio_locked; } write; }; struct fuse_args_pages ap; @@ -1134,7 +1207,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask); + u64 attr_valid, u32 cache_mask, + u64 evict_ctr); u32 fuse_get_cache_mask(struct inode *inode); @@ -1154,7 +1228,22 @@ void __exit fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing */ -ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args); +ssize_t __fuse_simple_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args); + +static inline ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) +{ + return __fuse_simple_request(&invalid_mnt_idmap, fm, args); +} + +static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args) +{ + return __fuse_simple_request(idmap, fm, args); +} + int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); @@ -1167,6 +1256,9 @@ void fuse_request_end(struct fuse_req *req); void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); +/* Check if any requests timed out */ +void fuse_check_timeout(struct work_struct *work); + /** * Invalidate inode attributes */ @@ -1196,6 +1288,11 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** + * Initialize the fuse processing queue + */ +void fuse_pqueue_init(struct fuse_pqueue *fpq); + +/** * Initialize fuse_conn */ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, @@ -1330,8 +1427,8 @@ bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); -int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - struct file *file); +int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr, struct file *file); void fuse_set_initialized(struct fuse_conn *fc); @@ -1394,9 +1491,10 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* iomode.c */ -int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb); -void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); +int fuse_inode_uncached_io_start(struct fuse_inode *fi, + struct fuse_backing *fb); +void fuse_inode_uncached_io_end(struct fuse_inode *fi); int fuse_file_io_open(struct file *file, struct inode *inode); void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); @@ -1471,4 +1569,12 @@ ssize_t 
fuse_passthrough_splice_write(struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); +#ifdef CONFIG_SYSCTL +extern int fuse_sysctl_register(void); +extern void fuse_sysctl_unregister(void); +#else +#define fuse_sysctl_register() (0) +#define fuse_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h new file mode 100644 index 000000000000..bbe9ddd8c716 --- /dev/null +++ b/fs/fuse/fuse_trace.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fuse + +#if !defined(_TRACE_FUSE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FUSE_H + +#include <linux/tracepoint.h> + +#define OPCODES \ + EM( FUSE_LOOKUP, "FUSE_LOOKUP") \ + EM( FUSE_FORGET, "FUSE_FORGET") \ + EM( FUSE_GETATTR, "FUSE_GETATTR") \ + EM( FUSE_SETATTR, "FUSE_SETATTR") \ + EM( FUSE_READLINK, "FUSE_READLINK") \ + EM( FUSE_SYMLINK, "FUSE_SYMLINK") \ + EM( FUSE_MKNOD, "FUSE_MKNOD") \ + EM( FUSE_MKDIR, "FUSE_MKDIR") \ + EM( FUSE_UNLINK, "FUSE_UNLINK") \ + EM( FUSE_RMDIR, "FUSE_RMDIR") \ + EM( FUSE_RENAME, "FUSE_RENAME") \ + EM( FUSE_LINK, "FUSE_LINK") \ + EM( FUSE_OPEN, "FUSE_OPEN") \ + EM( FUSE_READ, "FUSE_READ") \ + EM( FUSE_WRITE, "FUSE_WRITE") \ + EM( FUSE_STATFS, "FUSE_STATFS") \ + EM( FUSE_RELEASE, "FUSE_RELEASE") \ + EM( FUSE_FSYNC, "FUSE_FSYNC") \ + EM( FUSE_SETXATTR, "FUSE_SETXATTR") \ + EM( FUSE_GETXATTR, "FUSE_GETXATTR") \ + EM( FUSE_LISTXATTR, "FUSE_LISTXATTR") \ + EM( FUSE_REMOVEXATTR, "FUSE_REMOVEXATTR") \ + EM( FUSE_FLUSH, "FUSE_FLUSH") \ + EM( FUSE_INIT, "FUSE_INIT") \ + EM( FUSE_OPENDIR, "FUSE_OPENDIR") \ + EM( FUSE_READDIR, "FUSE_READDIR") \ + EM( FUSE_RELEASEDIR, "FUSE_RELEASEDIR") \ + EM( FUSE_FSYNCDIR, "FUSE_FSYNCDIR") \ + EM( FUSE_GETLK, "FUSE_GETLK") \ + EM( FUSE_SETLK, "FUSE_SETLK") \ + EM( FUSE_SETLKW, "FUSE_SETLKW") \ + EM( FUSE_ACCESS, "FUSE_ACCESS") \ + EM( FUSE_CREATE, "FUSE_CREATE") \ + EM( FUSE_INTERRUPT, "FUSE_INTERRUPT") \ + EM( FUSE_BMAP, "FUSE_BMAP") \ + EM( FUSE_DESTROY, "FUSE_DESTROY") \ + EM( FUSE_IOCTL, "FUSE_IOCTL") \ + EM( FUSE_POLL, "FUSE_POLL") \ + EM( FUSE_NOTIFY_REPLY, "FUSE_NOTIFY_REPLY") \ + EM( FUSE_BATCH_FORGET, "FUSE_BATCH_FORGET") \ + EM( FUSE_FALLOCATE, "FUSE_FALLOCATE") \ + EM( FUSE_READDIRPLUS, "FUSE_READDIRPLUS") \ + EM( FUSE_RENAME2, "FUSE_RENAME2") \ + EM( FUSE_LSEEK, "FUSE_LSEEK") \ + EM( FUSE_COPY_FILE_RANGE, "FUSE_COPY_FILE_RANGE") \ + EM( FUSE_SETUPMAPPING, "FUSE_SETUPMAPPING") \ + EM( FUSE_REMOVEMAPPING, "FUSE_REMOVEMAPPING") \ + EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ + EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ + EM( FUSE_STATX, "FUSE_STATX") \ + EMe(CUSE_INIT, "CUSE_INIT") + +/* + * This will turn the above table into TRACE_DEFINE_ENUM() for each of the + * entries. + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +OPCODES + +/* Now we redfine it with the table that __print_symbolic needs. 
*/ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} + +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, connection) + __field(uint64_t, unique) + __field(enum fuse_opcode, opcode) + __field(uint32_t, len) + ), + + TP_fast_assign( + __entry->connection = req->fm->fc->dev; + __entry->unique = req->in.h.unique; + __entry->opcode = req->in.h.opcode; + __entry->len = req->in.h.len; + ), + + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_end, + TP_PROTO(const struct fuse_req *req), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, connection) + __field(uint64_t, unique) + __field(uint32_t, len) + __field(int32_t, error) + ), + + TP_fast_assign( + __entry->connection = req->fm->fc->dev; + __entry->unique = req->in.h.unique; + __entry->len = req->out.h.len; + __entry->error = req->out.h.error; + ), + + TP_printk("connection %u req %llu len %u error %d", __entry->connection, + __entry->unique, __entry->len, __entry->error) +); + +#endif /* _TRACE_FUSE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE fuse_trace +#include <trace/define_trace.h> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3a5d88878335..fd48e8d37f2e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "dev_uring_i.h" #include <linux/pagemap.h> #include <linux/slab.h> @@ -35,6 +36,11 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); +unsigned int fuse_max_pages_limit = 256; +/* default is no timeout */ +unsigned int fuse_default_req_timeout; +unsigned int fuse_max_req_timeout; + unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); @@ -173,8 +179,17 @@ static void fuse_evict_inode(struct inode *inode) fuse_cleanup_submount_lookup(fc, fi->submount_lookup); fi->submount_lookup = NULL; } + /* + * Evict of non-deleted inode may race with outstanding + * LOOKUP/READDIRPLUS requests and result in inconsistency when + * the request finishes. Deal with that here by bumping a + * counter that can be compared to the starting value. + */ + if (inode->i_nlink > 0) + atomic64_inc(&fc->evict_ctr); } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { + WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } @@ -205,17 +220,30 @@ static ino_t fuse_squash_ino(u64 ino64) void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask) + u64 attr_valid, u32 cache_mask, + u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); lockdep_assert_held(&fi->lock); + /* + * Clear basic stats from invalid mask. + * + * Don't do this if this is coming from a fuse_iget() call and there + * might have been a racing evict which would've invalidated the result + * if the attr_version would've been preserved. 
+ * + * !evict_ctr -> this is create + * fi->attr_version != 0 -> this is not a new inode + * evict_ctr == fuse_get_evict_ctr() -> no evicts while during request + */ + if (!evict_ctr || fi->attr_version || evict_ctr == fuse_get_evict_ctr(fc)) + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - /* Clear basic stats from invalid mask */ - set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -294,9 +322,9 @@ u32 fuse_get_cache_mask(struct inode *inode) return STATX_MTIME | STATX_CTIME | STATX_SIZE; } -void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - struct fuse_statx *sx, - u64 attr_valid, u64 attr_version) +static void fuse_change_attributes_i(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version, u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -330,7 +358,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode_get_mtime(inode); - fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask, + evict_ctr); oldsize = inode->i_size; /* @@ -371,6 +400,13 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version) +{ + fuse_change_attributes_i(inode, attr, sx, attr_valid, attr_version, 0); +} + static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, u64 nodeid) { @@ -425,7 +461,8 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp) struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) + u64 attr_valid, u64 attr_version, + u64 evict_ctr) { struct inode *inode; struct fuse_inode *fi; @@ -486,8 +523,8 @@ retry: fi->nlookup++; spin_unlock(&fi->lock); done: - fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); - + fuse_change_attributes_i(inode, attr, NULL, attr_valid, attr_version, + evict_ctr); return inode; } @@ -739,8 +776,8 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { fsparam_string ("source", OPT_SOURCE), fsparam_u32 ("fd", OPT_FD), fsparam_u32oct ("rootmode", OPT_ROOTMODE), - fsparam_u32 ("user_id", OPT_USER_ID), - fsparam_u32 ("group_id", OPT_GROUP_ID), + fsparam_uid ("user_id", OPT_USER_ID), + fsparam_gid ("group_id", OPT_GROUP_ID), fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), fsparam_flag ("allow_other", OPT_ALLOW_OTHER), fsparam_u32 ("max_read", OPT_MAX_READ), @@ -754,6 +791,8 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) struct fs_parse_result result; struct fuse_fs_context *ctx = fsc->fs_private; int opt; + kuid_t kuid; + kgid_t kgid; if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { /* @@ -798,16 +837,26 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) break; case OPT_USER_ID: - ctx->user_id = make_kuid(fsc->user_ns, result.uint_32); - if (!uid_valid(ctx->user_id)) + kuid = result.uid; + /* + * The requested uid must be representable in the + * filesystem's idmapping. 
+ */ + if (!kuid_has_mapping(fsc->user_ns, kuid)) return invalfc(fsc, "Invalid user_id"); + ctx->user_id = kuid; ctx->user_id_present = true; break; case OPT_GROUP_ID: - ctx->group_id = make_kgid(fsc->user_ns, result.uint_32); - if (!gid_valid(ctx->group_id)) + kgid = result.gid; + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fsc->user_ns, kgid)) return invalfc(fsc, "Invalid group_id"); + ctx->group_id = kgid; ctx->group_id_present = true; break; @@ -892,7 +941,7 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq, fiq->priv = priv; } -static void fuse_pqueue_init(struct fuse_pqueue *fpq) +void fuse_pqueue_init(struct fuse_pqueue *fpq) { unsigned int i; @@ -927,11 +976,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->initialized = 0; fc->connected = 1; atomic64_set(&fc->attr_version, 1); + atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; - fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + fc->max_pages_limit = fuse_max_pages_limit; + fc->name_max = FUSE_NAME_LOW_MAX; + fc->timeout.req_timeout = 0; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_init(fc); @@ -946,6 +998,8 @@ static void delayed_release(struct rcu_head *p) { struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + fuse_uring_destruct(fc); + put_user_ns(fc->user_ns); fc->release(fc); } @@ -958,6 +1012,8 @@ void fuse_conn_put(struct fuse_conn *fc) if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); + if (fc->timeout.req_timeout) + cancel_delayed_work_sync(&fc->timeout.work); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); @@ -988,7 +1044,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0, 0); } struct fuse_inode_handle { @@ -1208,6 +1264,34 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } +static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + fc->timeout.req_timeout = secs_to_jiffies(timeout); + INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); + queue_delayed_work(system_wq, &fc->timeout.work, + fuse_timeout_timer_freq); +} + +static void init_server_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + if (!timeout && !fuse_max_req_timeout && !fuse_default_req_timeout) + return; + + if (!timeout) + timeout = fuse_default_req_timeout; + + if (fuse_max_req_timeout) { + if (timeout) + timeout = min(fuse_max_req_timeout, timeout); + else + timeout = fuse_max_req_timeout; + } + + timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout); + + set_request_timeout(fc, timeout); +} + struct fuse_init_args { struct fuse_args args; struct fuse_init_in in; @@ -1226,6 +1310,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, ok = false; else { unsigned long ra_pages; + unsigned int timeout = 0; process_init_limits(fc, arg); @@ -1289,6 +1374,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); + + /* + * PATH_MAX file names might need two pages for + * ops like rename + */ + if 
(fc->max_pages > 1) + fc->name_max = FUSE_NAME_MAX; } if (IS_ENABLED(CONFIG_FUSE_DAX)) { if (flags & FUSE_MAP_ALIGNMENT && @@ -1319,23 +1411,41 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, * on a stacked fs (e.g. overlayfs) themselves and with * max_stack_depth == 1, FUSE fs can be stacked as the * underlying fs of a stacked fs (e.g. overlayfs). + * + * Also don't allow the combination of FUSE_PASSTHROUGH + * and FUSE_WRITEBACK_CACHE, current design doesn't handle + * them together. */ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && (flags & FUSE_PASSTHROUGH) && arg->max_stack_depth > 0 && - arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) { + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH && + !(flags & FUSE_WRITEBACK_CACHE)) { fc->passthrough = 1; fc->max_stack_depth = arg->max_stack_depth; fm->sb->s_stack_depth = arg->max_stack_depth; } if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; + if (flags & FUSE_ALLOW_IDMAP) { + if (fc->default_permissions) + fm->sb->s_iflags &= ~SB_I_NOIDMAP; + else + ok = false; + } + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) + fc->io_uring = 1; + + if (flags & FUSE_REQUEST_TIMEOUT) + timeout = arg->request_timeout; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } + init_server_timeout(fc, timeout); + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; @@ -1377,7 +1487,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND; + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | + FUSE_REQUEST_TIMEOUT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1389,6 +1500,13 @@ void fuse_send_init(struct fuse_mount *fm) if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) flags |= FUSE_PASSTHROUGH; + /* + * This is just an information flag for fuse server. No need to check + * the reply - server is either sending IORING_OP_URING_CMD or not. + */ + if (fuse_uring_enabled()) + flags |= FUSE_OVER_IO_URING; + ia->in.flags = flags; ia->in.flags2 = flags >> 32; @@ -1554,6 +1672,7 @@ static void fuse_sb_defaults(struct super_block *sb) sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + sb->s_iflags |= SB_I_NOIDMAP; if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); @@ -1585,7 +1704,8 @@ static int fuse_fill_super_submount(struct super_block *sb, return -ENOMEM; fuse_fill_attr_from_inode(&root_attr, parent_fi); - root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0, + fuse_get_evict_ctr(fm->fc)); /* * This inode is just a duplicate, so it is not looked up and * its nlookup should not be incremented. 
fuse_iget() does @@ -1966,7 +2086,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_ALLOW_IDMAP, .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, @@ -1987,7 +2107,7 @@ static struct file_system_type fuseblk_fs_type = { .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, - .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("fuseblk"); @@ -2038,8 +2158,14 @@ static int __init fuse_fs_init(void) if (err) goto out3; + err = fuse_sysctl_register(); + if (err) + goto out4; + return 0; + out4: + unregister_filesystem(&fuse_fs_type); out3: unregister_fuseblk(); out2: @@ -2050,6 +2176,7 @@ static int __init fuse_fs_init(void) static void fuse_fs_cleanup(void) { + fuse_sysctl_unregister(); unregister_filesystem(&fuse_fs_type); unregister_fuseblk(); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 726640fa439e..2d9abf48828f 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -8,6 +8,9 @@ #include <linux/uio.h> #include <linux/compat.h> #include <linux/fileattr.h> +#include <linux/fsverity.h> + +#define FUSE_VERITY_ENABLE_ARG_MAX_PAGES 256 static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, struct fuse_ioctl_out *outarg) @@ -117,6 +120,53 @@ static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, return 0; } +/* For fs-verity, determine iov lengths from input */ +static int fuse_setup_measure_verity(unsigned long arg, struct iovec *iov) +{ + __u16 digest_size; + struct fsverity_digest __user *uarg = (void __user *)arg; + + if (copy_from_user(&digest_size, &uarg->digest_size, sizeof(digest_size))) + return -EFAULT; + + if (digest_size > SIZE_MAX - sizeof(struct fsverity_digest)) + return -EINVAL; + + iov->iov_len = sizeof(struct fsverity_digest) + digest_size; + + return 0; +} + +static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov, + unsigned int *in_iovs) +{ + struct fsverity_enable_arg enable; + struct fsverity_enable_arg __user *uarg = (void __user *)arg; + const __u32 max_buffer_len = FUSE_VERITY_ENABLE_ARG_MAX_PAGES * PAGE_SIZE; + + if (copy_from_user(&enable, uarg, sizeof(enable))) + return -EFAULT; + + if (enable.salt_size > max_buffer_len || enable.sig_size > max_buffer_len) + return -ENOMEM; + + if (enable.salt_size > 0) { + iov++; + (*in_iovs)++; + + iov->iov_base = u64_to_user_ptr(enable.salt_ptr); + iov->iov_len = enable.salt_size; + } + + if (enable.sig_size > 0) { + iov++; + (*in_iovs)++; + + iov->iov_base = u64_to_user_ptr(enable.sig_ptr); + iov->iov_len = enable.sig_size; + } + return 0; +} /* * For ioctls, there is no generic way to determine how much memory @@ -201,12 +251,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); + ap.folios = fuse_folios_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!ap.pages || !iov_page) + if (!ap.folios || !iov_page) goto out; - fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); + fuse_folio_descs_length_init(ap.descs, 0, fm->fc->max_pages); 
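/*
 * Editor's note: illustrative userspace sketch, not part of the patch. It
 * shows why fuse_setup_enable_verity() above has to append extra input
 * iovecs: the salt and signature buffers of FS_IOC_ENABLE_VERITY live behind
 * user pointers in struct fsverity_enable_arg, so their lengths are only
 * known after reading the arg from userspace. The file path and salt bytes
 * below are made-up values.
 */
#include <fcntl.h>
#include <linux/fsverity.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int enable_verity_example(const char *path)
{
	static const unsigned char salt[4] = { 0xde, 0xad, 0xbe, 0xef };
	struct fsverity_enable_arg arg = {
		.version = 1,
		.hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
		.block_size = 4096,
		.salt_size = sizeof(salt),
		.salt_ptr = (__u64)(unsigned long)salt,
	};
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	/* On a FUSE mount this arrives at the server as a FUSE_IOCTL request
	 * whose input iovecs include the salt buffer sized above. */
	if (ioctl(fd, FS_IOC_ENABLE_VERITY, &arg) != 0)
		perror("FS_IOC_ENABLE_VERITY");
	close(fd);
	return 0;
}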
/* * If restricted, initialize IO parameters as encoded in @cmd. @@ -227,6 +277,18 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, out_iov = iov; out_iovs = 1; } + + err = 0; + switch (cmd) { + case FS_IOC_MEASURE_VERITY: + err = fuse_setup_measure_verity(arg, iov); + break; + case FS_IOC_ENABLE_VERITY: + err = fuse_setup_enable_verity(arg, iov, &in_iovs); + break; + } + if (err) + goto out; } retry: @@ -244,14 +306,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fm->fc->max_pages) goto out; - while (ap.num_pages < max_pages) { - ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!ap.pages[ap.num_pages]) + while (ap.num_folios < max_pages) { + ap.folios[ap.num_folios] = folio_alloc(GFP_KERNEL | __GFP_HIGHMEM, 0); + if (!ap.folios[ap.num_folios]) goto out; - ap.num_pages++; + ap.num_folios++; } - /* okay, let's send it to the client */ ap.args.opcode = FUSE_IOCTL; ap.args.nodeid = ff->nodeid; @@ -265,8 +326,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) { + c = copy_folio_from_iter(ap.folios[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } @@ -304,7 +365,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_local_page(ap.pages[0]); + vaddr = kmap_local_folio(ap.folios[0], 0); err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); @@ -332,17 +393,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) { + c = copy_folio_to_iter(ap.folios[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } err = 0; out: free_page((unsigned long) iov_page); - while (ap.num_pages) - __free_page(ap.pages[--ap.num_pages]); - kfree(ap.pages); + while (ap.num_folios) + folio_put(ap.folios[--ap.num_folios]); + kfree(ap.folios); return err ? err : outarg.result; } diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index c653ddcf0578..c99e285f3183 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -21,12 +21,13 @@ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) } /* - * Start cached io mode. + * Called on cached file open() and on first mmap() of direct_io file. + * Takes cached_io inode mode reference to be dropped on file release. * * Blocks new parallel dio writes and waits for the in-progress parallel dio * writes to complete. 
*/ -int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -67,10 +68,9 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) return 0; } -static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) +static void fuse_file_cached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) { - struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fi->lock); WARN_ON(fi->iocachectr <= 0); WARN_ON(ff->iomode != IOM_CACHED); @@ -82,16 +82,15 @@ static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) } /* Start strictly uncached io mode where cache access is not allowed */ -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) +int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) { - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_backing *oldfb; int err = 0; spin_lock(&fi->lock); /* deny conflicting backing files on same fuse inode */ oldfb = fuse_inode_backing(fi); - if (oldfb && oldfb != fb) { + if (fb && oldfb && oldfb != fb) { err = -EBUSY; goto unlock; } @@ -99,12 +98,10 @@ int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struc err = -ETXTBSY; goto unlock; } - WARN_ON(ff->iomode != IOM_NONE); fi->iocachectr--; - ff->iomode = IOM_UNCACHED; /* fuse inode holds a single refcount of backing file */ - if (!oldfb) { + if (fb && !oldfb) { oldfb = fuse_inode_backing_set(fi, fb); WARN_ON_ONCE(oldfb != NULL); } else { @@ -115,15 +112,29 @@ unlock: return err; } -void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +/* Takes uncached_io inode mode reference to be dropped on file release */ +static int fuse_file_uncached_io_open(struct inode *inode, + struct fuse_file *ff, + struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + err = fuse_inode_uncached_io_start(fi, fb); + if (err) + return err; + + WARN_ON(ff->iomode != IOM_NONE); + ff->iomode = IOM_UNCACHED; + return 0; +} + +void fuse_inode_uncached_io_end(struct fuse_inode *fi) +{ struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); - WARN_ON(ff->iomode != IOM_UNCACHED); - ff->iomode = IOM_NONE; fi->iocachectr++; if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); @@ -134,6 +145,15 @@ void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) fuse_backing_put(oldfb); } +/* Drop uncached_io reference from passthrough open */ +static void fuse_file_uncached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) +{ + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fuse_inode_uncached_io_end(fi); +} + /* * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. 
* A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write @@ -163,7 +183,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ - err = fuse_file_uncached_io_start(inode, ff, fb); + err = fuse_file_uncached_io_open(inode, ff, fb); if (!err) return 0; @@ -216,7 +236,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode) if (ff->open_flags & FOPEN_PASSTHROUGH) err = fuse_file_passthrough_open(inode, file); else - err = fuse_file_cached_io_start(inode, ff); + err = fuse_file_cached_io_open(inode, ff); if (err) goto fail; @@ -236,8 +256,10 @@ fail: /* No more pending io and no new io possible to inode via open/mmapped file */ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + /* - * Last parallel dio close allows caching inode io mode. + * Last passthrough file close allows caching inode io mode. * Last caching file close exits caching inode io mode. */ switch (ff->iomode) { @@ -245,10 +267,10 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) /* Nothing to do */ break; case IOM_UNCACHED: - fuse_file_uncached_io_end(inode, ff); + fuse_file_uncached_io_release(ff, fi); break; case IOM_CACHED: - fuse_file_cached_io_end(inode, ff); + fuse_file_cached_io_release(ff, fi); break; } } diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 1567f0323858..607ef735ad4a 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -18,11 +18,11 @@ static void fuse_file_accessed(struct file *file) fuse_invalidate_atime(inode); } -static void fuse_file_modified(struct file *file) +static void fuse_passthrough_end_write(struct kiocb *iocb, ssize_t ret) { - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(iocb->ki_filp); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + fuse_write_update_attr(inode, iocb->ki_pos, ret); } ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -34,7 +34,6 @@ ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t ret; struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = file, .accessed = fuse_file_accessed, }; @@ -62,8 +61,7 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, ssize_t ret; struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = file, - .end_write = fuse_file_modified, + .end_write = fuse_passthrough_end_write, }; pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, @@ -88,15 +86,20 @@ ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, struct file *backing_file = fuse_file_passthrough(ff); struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = in, .accessed = fuse_file_accessed, }; + struct kiocb iocb; + ssize_t ret; pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, - backing_file, ppos ? 
*ppos : 0, len, flags); + backing_file, *ppos, len, flags); + + init_sync_kiocb(&iocb, in); + iocb.ki_pos = *ppos; + ret = backing_file_splice_read(backing_file, &iocb, pipe, len, flags, &ctx); + *ppos = iocb.ki_pos; - return backing_file_splice_read(backing_file, ppos, pipe, len, flags, - &ctx); + return ret; } ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, @@ -109,16 +112,18 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, ssize_t ret; struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = out, - .end_write = fuse_file_modified, + .end_write = fuse_passthrough_end_write, }; + struct kiocb iocb; pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, - backing_file, ppos ? *ppos : 0, len, flags); + backing_file, *ppos, len, flags); inode_lock(inode); - ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags, - &ctx); + init_sync_kiocb(&iocb, out); + iocb.ki_pos = *ppos; + ret = backing_file_splice_write(pipe, backing_file, &iocb, len, flags, &ctx); + *ppos = iocb.ki_pos; inode_unlock(inode); return ret; @@ -130,7 +135,6 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) struct file *backing_file = fuse_file_passthrough(ff); struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = file, .accessed = fuse_file_accessed, }; @@ -225,18 +229,14 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) goto out; res = -EINVAL; - if (map->flags) + if (map->flags || map->padding) goto out; - file = fget(map->fd); + file = fget_raw(map->fd); res = -EBADF; if (!file) goto out; - res = -EOPNOTSUPP; - if (!file->f_op->read_iter || !file->f_op->write_iter) - goto out_fput; - backing_sb = file_inode(file)->i_sb; res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 0377b6dc24c8..edcd6f18a8a8 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -120,7 +120,7 @@ static bool fuse_emit(struct file *file, struct dir_context *ctx, fuse_add_dirent_to_cache(file, dirent, ctx->pos); return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, - dirent->type); + dirent->type | FILLDIR_FLAG_NOINTR); } static int parse_dirfile(char *buf, size_t nbytes, struct file *file, @@ -149,7 +149,7 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, static int fuse_direntplus_link(struct file *file, struct fuse_direntplus *direntplus, - u64 attr_version) + u64 attr_version, u64 evict_ctr) { struct fuse_entry_out *o = &direntplus->entry_out; struct fuse_dirent *dirent = &direntplus->dirent; @@ -233,7 +233,7 @@ retry: } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, &o->attr, ATTR_TIMEOUT(o), - attr_version); + attr_version, evict_ctr); if (!inode) inode = ERR_PTR(-ENOMEM); @@ -284,7 +284,8 @@ static void fuse_force_forget(struct file *file, u64 nodeid) } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) + struct dir_context *ctx, u64 attr_version, + u64 evict_ctr) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; @@ -319,7 +320,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, buf += reclen; nbytes -= reclen; - ret = fuse_direntplus_link(file, direntplus, attr_version); + ret = fuse_direntplus_link(file, direntplus, attr_version, evict_ctr); if (ret) fuse_force_forget(file, direntplus->entry_out.nodeid); } @@ -331,26 +332,27 @@ static int 
fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct page *page; + struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; - struct fuse_page_desc desc = { .length = PAGE_SIZE }; - u64 attr_version = 0; + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + u64 attr_version = 0, evict_ctr = 0; bool locked; - page = alloc_page(GFP_KERNEL); - if (!page) + folio = folio_alloc(GFP_KERNEL, 0); + if (!folio) return -ENOMEM; plus = fuse_use_readdirplus(inode, ctx); ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &page; + ap->num_folios = 1; + ap->folios = &folio; ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -367,15 +369,16 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(page_address(page), res, - file, ctx, attr_version); + res = parse_dirplusfile(folio_address(folio), res, + file, ctx, attr_version, + evict_ctr); } else { - res = parse_dirfile(page_address(page), res, file, + res = parse_dirfile(folio_address(folio), res, file, ctx); } } - __free_page(page); + folio_put(folio); fuse_invalidate_atime(inode); return res; } @@ -416,7 +419,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, if (ff->readdir.pos == ctx->pos) { res = FOUND_SOME; if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) + dirent->ino, dirent->type | FILLDIR_FLAG_NOINTR)) return FOUND_ALL; ctx->pos = dirent->off; } diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c new file mode 100644 index 000000000000..e2d921abcb88 --- /dev/null +++ b/fs/fuse/sysctl.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/fuse/fuse_sysctl.c + * + * Sysctl interface to fuse parameters + */ +#include <linux/sysctl.h> + +#include "fuse_i.h" + +static struct ctl_table_header *fuse_table_header; + +/* Bound by fuse_init_out max_pages, which is a u16 */ +static unsigned int sysctl_fuse_max_pages_limit = 65535; + +/* + * fuse_init_out request timeouts are u16. + * This goes up to ~18 hours, which is plenty for a timeout. 
+ */ +static unsigned int sysctl_fuse_req_timeout_limit = 65535; + +static const struct ctl_table fuse_sysctl_table[] = { + { + .procname = "max_pages_limit", + .data = &fuse_max_pages_limit, + .maxlen = sizeof(fuse_max_pages_limit), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &sysctl_fuse_max_pages_limit, + }, + { + .procname = "default_request_timeout", + .data = &fuse_default_req_timeout, + .maxlen = sizeof(fuse_default_req_timeout), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_fuse_req_timeout_limit, + }, + { + .procname = "max_request_timeout", + .data = &fuse_max_req_timeout, + .maxlen = sizeof(fuse_max_req_timeout), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_fuse_req_timeout_limit, + }, +}; + +int fuse_sysctl_register(void) +{ + fuse_table_header = register_sysctl("fs/fuse", fuse_sysctl_table); + if (!fuse_table_header) + return -ENOMEM; + return 0; +} + +void fuse_sysctl_unregister(void) +{ + unregister_sysctl_table(fuse_table_header); + fuse_table_header = NULL; +} diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 322af827a232..53c2626e90e7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -7,6 +7,8 @@ #include <linux/fs.h> #include <linux/dax.h> #include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/group_cpus.h> #include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/module.h> @@ -49,17 +51,19 @@ struct virtio_fs_vq { struct work_struct done_work; struct list_head queued_reqs; struct list_head end_reqs; /* End these requests */ - struct delayed_work dispatch_work; + struct work_struct dispatch_work; struct fuse_dev *fud; bool connected; long in_flight; struct completion in_flight_zero; /* No inflight requests */ + struct kobject *kobj; char name[VQ_NAME_LEN]; } ____cacheline_aligned_in_smp; /* A virtio-fs device instance */ struct virtio_fs { struct kobject kobj; + struct kobject *mqs_kobj; struct list_head list; /* on virtio_fs_instances */ char *tag; struct virtio_fs_vq *vqs; @@ -67,6 +71,8 @@ struct virtio_fs { unsigned int num_request_queues; /* number of request queues */ struct dax_device *dax_dev; + unsigned int *mq_map; /* index = cpu id, value = request vq id */ + /* DAX memory window where file contents are mapped */ void *window_kaddr; phys_addr_t window_phys_addr; @@ -91,7 +97,8 @@ struct virtio_fs_req_work { }; static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req, bool in_flight); + struct fuse_req *req, bool in_flight, + gfp_t gfp); static const struct constant_table dax_param_enums[] = { {"always", FUSE_DAX_ALWAYS }, @@ -170,7 +177,7 @@ static ssize_t tag_show(struct kobject *kobj, { struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); - return sysfs_emit(buf, fs->tag); + return sysfs_emit(buf, "%s\n", fs->tag); } static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag); @@ -185,6 +192,7 @@ static void virtio_fs_ktype_release(struct kobject *kobj) { struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj); + kfree(vfs->mq_map); kfree(vfs->vqs); kfree(vfs); } @@ -195,19 +203,94 @@ static const struct kobj_type virtio_fs_ktype = { .default_groups = virtio_fs_groups, }; +static struct virtio_fs_vq *virtio_fs_kobj_to_vq(struct virtio_fs *fs, + struct kobject *kobj) +{ + int i; + + for (i = 0; i < fs->nvqs; i++) { + if (kobj == fs->vqs[i].kobj) + return &fs->vqs[i]; + } + return NULL; +} + 
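/*
 * Editor's note: illustrative sketch, not part of the patch. It reads the
 * per-virtqueue "name" and "cpu_list" attributes added just below, assuming
 * the virtiofs kset appears under /sys/fs/virtiofs/<dev>/mqs/<vq>/ alongside
 * the existing "tag" attribute; the device index "0" and queue index "1"
 * are placeholders.
 */
#include <stdio.h>

static void dump_vq_attr(const char *attr)
{
	char path[128], buf[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/virtiofs/0/mqs/1/%s", attr);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", attr, buf);	/* attribute output already ends in '\n' */
	fclose(f);
}

/* Usage: dump_vq_attr("name"); dump_vq_attr("cpu_list"); */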
+static ssize_t name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj); + struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj); + + if (!fsvq) + return -EINVAL; + return sysfs_emit(buf, "%s\n", fsvq->name); +} + +static struct kobj_attribute virtio_fs_vq_name_attr = __ATTR_RO(name); + +static ssize_t cpu_list_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj); + struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj); + unsigned int cpu, qid; + const size_t size = PAGE_SIZE - 1; + bool first = true; + int ret = 0, pos = 0; + + if (!fsvq) + return -EINVAL; + + qid = fsvq->vq->index; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (qid < VQ_REQUEST || (fs->mq_map[cpu] == qid)) { + if (first) + ret = snprintf(buf + pos, size - pos, "%u", cpu); + else + ret = snprintf(buf + pos, size - pos, ", %u", cpu); + + if (ret >= size - pos) + break; + first = false; + pos += ret; + } + } + ret = snprintf(buf + pos, size + 1 - pos, "\n"); + return pos + ret; +} + +static struct kobj_attribute virtio_fs_vq_cpu_list_attr = __ATTR_RO(cpu_list); + +static struct attribute *virtio_fs_vq_attrs[] = { + &virtio_fs_vq_name_attr.attr, + &virtio_fs_vq_cpu_list_attr.attr, + NULL +}; + +static struct attribute_group virtio_fs_vq_attr_group = { + .attrs = virtio_fs_vq_attrs, +}; + /* Make sure virtiofs_mutex is held */ -static void virtio_fs_put(struct virtio_fs *fs) +static void virtio_fs_put_locked(struct virtio_fs *fs) { + lockdep_assert_held(&virtio_fs_mutex); + kobject_put(&fs->kobj); } +static void virtio_fs_put(struct virtio_fs *fs) +{ + mutex_lock(&virtio_fs_mutex); + virtio_fs_put_locked(fs); + mutex_unlock(&virtio_fs_mutex); +} + static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) { struct virtio_fs *vfs = fiq->priv; - mutex_lock(&virtio_fs_mutex); virtio_fs_put(vfs); - mutex_unlock(&virtio_fs_mutex); } static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) @@ -228,7 +311,7 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) } flush_work(&fsvq->done_work); - flush_delayed_work(&fsvq->dispatch_work); + flush_work(&fsvq->dispatch_work); } static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs) @@ -268,6 +351,50 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs) } } +static void virtio_fs_delete_queues_sysfs(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + kobject_put(fsvq->kobj); + } +} + +static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + char buff[12]; + int i, j, ret; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + + sprintf(buff, "%d", i); + fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); + if (!fs->mqs_kobj) { + ret = -ENOMEM; + goto out_del; + } + + ret = sysfs_create_group(fsvq->kobj, &virtio_fs_vq_attr_group); + if (ret) { + kobject_put(fsvq->kobj); + goto out_del; + } + } + + return 0; + +out_del: + for (j = 0; j < i; j++) { + fsvq = &fs->vqs[j]; + kobject_put(fsvq->kobj); + } + return ret; +} + /* Add a new instance to the list or return -EEXIST if tag name exists*/ static int virtio_fs_add_instance(struct virtio_device *vdev, struct virtio_fs *fs) @@ -291,17 +418,22 @@ static int virtio_fs_add_instance(struct virtio_device *vdev, */ fs->kobj.kset = virtio_fs_kset; ret = kobject_add(&fs->kobj, 
NULL, "%d", vdev->index); - if (ret < 0) { - mutex_unlock(&virtio_fs_mutex); - return ret; + if (ret < 0) + goto out_unlock; + + fs->mqs_kobj = kobject_create_and_add("mqs", &fs->kobj); + if (!fs->mqs_kobj) { + ret = -ENOMEM; + goto out_del; } ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device"); - if (ret < 0) { - kobject_del(&fs->kobj); - mutex_unlock(&virtio_fs_mutex); - return ret; - } + if (ret < 0) + goto out_put; + + ret = virtio_fs_add_queues_sysfs(fs); + if (ret) + goto out_remove; list_add_tail(&fs->list, &virtio_fs_instances); @@ -310,6 +442,16 @@ static int virtio_fs_add_instance(struct virtio_device *vdev, kobject_uevent(&fs->kobj, KOBJ_ADD); return 0; + +out_remove: + sysfs_remove_link(&fs->kobj, "device"); +out_put: + kobject_put(fs->mqs_kobj); +out_del: + kobject_del(&fs->kobj); +out_unlock: + mutex_unlock(&virtio_fs_mutex); + return ret; } /* Return the virtio_fs with a given tag, or NULL */ @@ -380,6 +522,7 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) return -EINVAL; } + dev_info(&vdev->dev, "discovered new tag: %s\n", fs->tag); return 0; } @@ -403,6 +546,10 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work) dec_in_flight_req(fsvq); } } while (!virtqueue_enable_cb(vq)); + + if (!list_empty(&fsvq->queued_reqs)) + schedule_work(&fsvq->dispatch_work); + spin_unlock(&fsvq->lock); } @@ -410,7 +557,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) { struct fuse_req *req; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, - dispatch_work.work); + dispatch_work); int ret; pr_debug("virtio-fs: worker %s called.\n", __func__); @@ -430,6 +577,8 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) /* Dispatch pending requests */ while (1) { + unsigned int flags; + spin_lock(&fsvq->lock); req = list_first_entry_or_null(&fsvq->queued_reqs, struct fuse_req, list); @@ -440,13 +589,13 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) list_del_init(&req->list); spin_unlock(&fsvq->lock); - ret = virtio_fs_enqueue_req(fsvq, req, true); + flags = memalloc_nofs_save(); + ret = virtio_fs_enqueue_req(fsvq, req, true, GFP_KERNEL); + memalloc_nofs_restore(flags); if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { + if (ret == -ENOSPC) { spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); spin_unlock(&fsvq->lock); return; } @@ -489,12 +638,10 @@ static int send_forget_request(struct virtio_fs_vq *fsvq, ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC); if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { + if (ret == -ENOSPC) { pr_debug("virtio-fs: Could not queue FORGET: err=%d. 
Will try later\n", ret); list_add_tail(&forget->list, &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); if (!in_flight) inc_in_flight_req(fsvq); /* Queue is full */ @@ -526,7 +673,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) { struct virtio_fs_forget *forget; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, - dispatch_work.work); + dispatch_work); pr_debug("virtio-fs: worker %s called.\n", __func__); while (1) { spin_lock(&fsvq->lock); @@ -545,7 +692,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) } /* Allocate and copy args into req->argbuf */ -static int copy_args_to_argbuf(struct fuse_req *req) +static int copy_args_to_argbuf(struct fuse_req *req, gfp_t gfp) { struct fuse_args *args = req->args; unsigned int offset = 0; @@ -559,7 +706,7 @@ static int copy_args_to_argbuf(struct fuse_req *req) len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + fuse_len_args(num_out, args->out_args); - req->argbuf = kmalloc(len, GFP_ATOMIC); + req->argbuf = kmalloc(len, gfp); if (!req->argbuf) return -ENOMEM; @@ -619,7 +766,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; - struct page *page; + struct folio *folio; /* * TODO verify that server properly follows FUSE protocol @@ -631,12 +778,12 @@ static void virtio_fs_request_complete(struct fuse_req *req, if (args->out_pages && args->page_zeroing) { len = args->out_args[args->out_numargs - 1].size; ap = container_of(args, typeof(*ap), args); - for (i = 0; i < ap->num_pages; i++) { + for (i = 0; i < ap->num_folios; i++) { thislen = ap->descs[i].length; if (len < thislen) { WARN_ON(ap->descs[i].offset); - page = ap->pages[i]; - zero_user_segment(page, len, thislen); + folio = ap->folios[i]; + folio_zero_segment(folio, len, thislen); len = 0; } else { len -= thislen; @@ -704,6 +851,50 @@ static void virtio_fs_requests_done_work(struct work_struct *work) virtio_fs_request_complete(req, fsvq); } } + + /* Try to push previously queued requests, as the queue might no longer be full */ + spin_lock(&fsvq->lock); + if (!list_empty(&fsvq->queued_reqs)) + schedule_work(&fsvq->dispatch_work); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs) +{ + const struct cpumask *mask, *masks; + unsigned int q, cpu; + + /* First attempt to map using existing transport layer affinities + * e.g. 
PCIe MSI-X + */ + if (!vdev->config->get_vq_affinity) + goto fallback; + + for (q = 0; q < fs->num_request_queues; q++) { + mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + fs->mq_map[cpu] = q + VQ_REQUEST; + } + + return; +fallback: + /* Attempt to map evenly in groups over the CPUs */ + masks = group_cpus_evenly(fs->num_request_queues); + /* If even this fails we default to all CPUs use first request queue */ + if (!masks) { + for_each_possible_cpu(cpu) + fs->mq_map[cpu] = VQ_REQUEST; + return; + } + + for (q = 0; q < fs->num_request_queues; q++) { + for_each_cpu(cpu, &masks[q]) + fs->mq_map[cpu] = q + VQ_REQUEST; + } + kfree(masks); } /* Virtqueue interrupt handler */ @@ -727,12 +918,12 @@ static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, if (vq_type == VQ_REQUEST) { INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work); - INIT_DELAYED_WORK(&fsvq->dispatch_work, - virtio_fs_request_dispatch_work); + INIT_WORK(&fsvq->dispatch_work, + virtio_fs_request_dispatch_work); } else { INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work); - INIT_DELAYED_WORK(&fsvq->dispatch_work, - virtio_fs_hiprio_dispatch_work); + INIT_WORK(&fsvq->dispatch_work, + virtio_fs_hiprio_dispatch_work); } } @@ -740,9 +931,13 @@ static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, static int virtio_fs_setup_vqs(struct virtio_device *vdev, struct virtio_fs *fs) { + struct virtqueue_info *vqs_info; struct virtqueue **vqs; - vq_callback_t **callbacks; - const char **names; + /* Specify pre_vectors to ensure that the queues before the + * request queues (e.g. hiprio) don't claim any of the CPUs in + * the multi-queue mapping and interrupt affinities + */ + struct irq_affinity desc = { .pre_vectors = VQ_REQUEST }; unsigned int i; int ret = 0; @@ -751,24 +946,27 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, if (fs->num_request_queues == 0) return -EINVAL; + /* Truncate nr of request queues to nr_cpu_id */ + fs->num_request_queues = min_t(unsigned int, fs->num_request_queues, + nr_cpu_ids); fs->nvqs = VQ_REQUEST + fs->num_request_queues; fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); if (!fs->vqs) return -ENOMEM; vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); - callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), - GFP_KERNEL); - names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); - if (!vqs || !callbacks || !names) { + fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL, + dev_to_node(&vdev->dev)); + vqs_info = kcalloc(fs->nvqs, sizeof(*vqs_info), GFP_KERNEL); + if (!vqs || !vqs_info || !fs->mq_map) { ret = -ENOMEM; goto out; } /* Initialize the hiprio/forget request virtqueue */ - callbacks[VQ_HIPRIO] = virtio_fs_vq_done; + vqs_info[VQ_HIPRIO].callback = virtio_fs_vq_done; virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO); - names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; + vqs_info[VQ_HIPRIO].name = fs->vqs[VQ_HIPRIO].name; /* Initialize the requests virtqueues */ for (i = VQ_REQUEST; i < fs->nvqs; i++) { @@ -776,11 +974,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST); virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST); - callbacks[i] = virtio_fs_vq_done; - names[i] = fs->vqs[i].name; + vqs_info[i].callback = virtio_fs_vq_done; + vqs_info[i].name = fs->vqs[i].name; } - ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, 
names, NULL); + ret = virtio_find_vqs(vdev, fs->nvqs, vqs, vqs_info, &desc); if (ret < 0) goto out; @@ -789,11 +987,12 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, virtio_fs_start_all_queues(fs); out: - kfree(names); - kfree(callbacks); + kfree(vqs_info); kfree(vqs); - if (ret) + if (ret) { kfree(fs->vqs); + kfree(fs->mq_map); + } return ret; } @@ -818,8 +1017,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, - PFN_DEV | PFN_MAP); + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0); return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } @@ -939,7 +1137,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) if (ret < 0) goto out; - /* TODO vq affinity */ + virtio_fs_map_queues(vdev, fs); ret = virtio_fs_setup_dax(vdev, fs); if (ret < 0) @@ -986,7 +1184,9 @@ static void virtio_fs_remove(struct virtio_device *vdev) mutex_lock(&virtio_fs_mutex); /* This device is going away. No one should get new reference */ list_del_init(&fs->list); + virtio_fs_delete_queues_sysfs(fs); sysfs_remove_link(&fs->kobj, "device"); + kobject_put(fs->mqs_kobj); kobject_del(&fs->kobj); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); @@ -995,7 +1195,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) vdev->priv = NULL; /* Put device reference on virtio_fs object */ - virtio_fs_put(fs); + virtio_fs_put_locked(fs); mutex_unlock(&virtio_fs_mutex); } @@ -1023,7 +1223,6 @@ static const unsigned int feature_table[] = {}; static struct virtio_driver virtio_fs_driver = { .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .feature_table = feature_table, .feature_table_size = ARRAY_SIZE(feature_table), @@ -1035,22 +1234,13 @@ static struct virtio_driver virtio_fs_driver = { #endif }; -static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *link) { - struct fuse_forget_link *link; struct virtio_fs_forget *forget; struct virtio_fs_forget_req *req; - struct virtio_fs *fs; - struct virtio_fs_vq *fsvq; - u64 unique; - - link = fuse_dequeue_forget(fiq, 1, NULL); - unique = fuse_get_unique(fiq); - - fs = fiq->priv; - fsvq = &fs->vqs[VQ_HIPRIO]; - spin_unlock(&fiq->lock); + struct virtio_fs *fs = fiq->priv; + struct virtio_fs_vq *fsvq = &fs->vqs[VQ_HIPRIO]; + u64 unique = fuse_get_unique(fiq); /* Allocate a buffer for the request */ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); @@ -1070,8 +1260,7 @@ __releases(fiq->lock) kfree(link); } -static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { /* * TODO interrupts. @@ -1080,19 +1269,18 @@ __releases(fiq->lock) * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) * with shared lock between host and guest. 
*/ - spin_unlock(&fiq->lock); } /* Count number of scatter-gather elements required */ -static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, - unsigned int num_pages, - unsigned int total_len) +static unsigned int sg_count_fuse_folios(struct fuse_folio_desc *folio_descs, + unsigned int num_folios, + unsigned int total_len) { unsigned int i; unsigned int this_len; - for (i = 0; i < num_pages && total_len; i++) { - this_len = min(page_descs[i].length, total_len); + for (i = 0; i < num_folios && total_len; i++) { + this_len = min(folio_descs[i].length, total_len); total_len -= this_len; } @@ -1111,8 +1299,8 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->in_pages) { size = args->in_args[args->in_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, - size); + total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, + size); } if (!test_bit(FR_ISREPLY, &req->flags)) @@ -1125,27 +1313,27 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_pages) { size = args->out_args[args->out_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, - size); + total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, + size); } return total_sgs; } -/* Add pages to scatter-gather list and return number of elements used */ -static unsigned int sg_init_fuse_pages(struct scatterlist *sg, - struct page **pages, - struct fuse_page_desc *page_descs, - unsigned int num_pages, - unsigned int total_len) +/* Add folios to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_folios(struct scatterlist *sg, + struct folio **folios, + struct fuse_folio_desc *folio_descs, + unsigned int num_folios, + unsigned int total_len) { unsigned int i; unsigned int this_len; - for (i = 0; i < num_pages && total_len; i++) { + for (i = 0; i < num_folios && total_len; i++) { sg_init_table(&sg[i], 1); - this_len = min(page_descs[i].length, total_len); - sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset); + this_len = min(folio_descs[i].length, total_len); + sg_set_folio(&sg[i], folios[i], this_len, folio_descs[i].offset); total_len -= this_len; } @@ -1170,10 +1358,10 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, sg_init_one(&sg[total_sgs++], argbuf, len); if (argpages) - total_sgs += sg_init_fuse_pages(&sg[total_sgs], - ap->pages, ap->descs, - ap->num_pages, - args[numargs - 1].size); + total_sgs += sg_init_fuse_folios(&sg[total_sgs], + ap->folios, ap->descs, + ap->num_folios, + args[numargs - 1].size); if (len_used) *len_used = len; @@ -1183,7 +1371,8 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, /* Add a request to a virtqueue and kick the device */ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req, bool in_flight) + struct fuse_req *req, bool in_flight, + gfp_t gfp) { /* requests need at least 4 elements */ struct scatterlist *stack_sgs[6]; @@ -1204,8 +1393,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, /* Does the sglist fit on the stack? 
*/ total_sgs = sg_count_fuse_req(req); if (total_sgs > ARRAY_SIZE(stack_sgs)) { - sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); - sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); + sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); if (!sgs || !sg) { ret = -ENOMEM; goto out; @@ -1213,7 +1402,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, } /* Use a bounce buffer since stack args cannot be mapped */ - ret = copy_args_to_argbuf(req); + ret = copy_args_to_argbuf(req, gfp); if (ret < 0) goto out; @@ -1285,33 +1474,31 @@ out: return ret; } -static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) { - unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ + unsigned int queue_id; struct virtio_fs *fs; - struct fuse_req *req; struct virtio_fs_vq *fsvq; int ret; - WARN_ON(list_empty(&fiq->pending)); - req = list_last_entry(&fiq->pending, struct fuse_req, list); + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + clear_bit(FR_PENDING, &req->flags); - list_del_init(&req->list); - WARN_ON(!list_empty(&fiq->pending)); - spin_unlock(&fiq->lock); fs = fiq->priv; + queue_id = fs->mq_map[raw_smp_processor_id()]; - pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", - __func__, req->in.h.opcode, req->in.h.unique, + pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n", + __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid, req->in.h.len, - fuse_len_args(req->args->out_numargs, req->args->out_args)); + fuse_len_args(req->args->out_numargs, req->args->out_args), + queue_id); fsvq = &fs->vqs[queue_id]; - ret = virtio_fs_enqueue_req(fsvq, req, false); + ret = virtio_fs_enqueue_req(fsvq, req, false, GFP_ATOMIC); if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { + if (ret == -ENOSPC) { /* * Virtqueue full. Retry submission from worker * context as we might be holding fc->bg_lock. @@ -1319,8 +1506,6 @@ __releases(fiq->lock) spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->queued_reqs); inc_in_flight_req(fsvq); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); spin_unlock(&fsvq->lock); return; } @@ -1330,17 +1515,17 @@ __releases(fiq->lock) /* Can't end request in submission context. Use a worker */ spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->end_reqs); - schedule_delayed_work(&fsvq->dispatch_work, 0); + schedule_work(&fsvq->dispatch_work); spin_unlock(&fsvq->lock); return; } } static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { - .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, - .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, - .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, - .release = virtio_fs_fiq_release, + .send_forget = virtio_fs_send_forget, + .send_interrupt = virtio_fs_send_interrupt, + .send_req = virtio_fs_send_req, + .release = virtio_fs_fiq_release, }; static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) @@ -1484,6 +1669,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc) unsigned int virtqueue_size; int err = -EIO; + if (!fsc->source) + return invalf(fsc, "No source specified"); + /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() * to drop the reference to this object. 
@@ -1512,6 +1700,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; fc->sync_fs = true; + fc->use_pages_for_kvec_io = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, @@ -1540,9 +1729,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) out_err: kfree(fc); - mutex_lock(&virtio_fs_mutex); virtio_fs_put(fs); - mutex_unlock(&virtio_fs_mutex); return err; } @@ -1572,6 +1759,7 @@ static struct file_system_type virtio_fs_type = { .name = "virtiofs", .init_fs_context = virtio_fs_init_fs_context, .kill_sb = virtio_kill_sb, + .fs_flags = FS_ALLOW_IDMAP, }; static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 5b423fdbb13f..93dfb06b6cea 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -81,7 +81,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; @@ -143,7 +143,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + ret = min_t(size_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { @@ -164,9 +164,10 @@ int fuse_removexattr(struct inode *inode, const char *name) args.opcode = FUSE_REMOVEXATTR; args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = strlen(name) + 1; - args.in_args[0].value = name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_removexattr = 1; |
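As a usage illustration of the request-timeout plumbing introduced above, the following is a minimal userspace model (not kernel code) of how init_server_timeout() combines the server-supplied FUSE_REQUEST_TIMEOUT value with the fs.fuse.default_request_timeout and fs.fuse.max_request_timeout sysctls, all in seconds, with 0 meaning "unset". The kernel additionally rounds very small values up to its timer frequency, which this sketch only notes in a comment.

#include <stdio.h>

static unsigned int effective_timeout(unsigned int server,
				      unsigned int sysctl_default,
				      unsigned int sysctl_max)
{
	unsigned int t = server ? server : sysctl_default;

	if (sysctl_max)
		t = t ? (t < sysctl_max ? t : sysctl_max) : sysctl_max;
	if (!t)
		return 0;	/* no timeout enforced */
	/* the kernel also clamps t up to FUSE_TIMEOUT_TIMER_FREQ here */
	return t;
}

int main(void)
{
	/* server asks for 600s, admin caps requests at 300s -> 300s wins */
	printf("%u\n", effective_timeout(600, 0, 300));
	/* server sets nothing, admin default of 60s -> 60s */
	printf("%u\n", effective_timeout(0, 60, 0));
	/* nothing configured anywhere -> timeouts stay disabled */
	printf("%u\n", effective_timeout(0, 0, 0));
	return 0;
}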