summaryrefslogtreecommitdiff
path: root/fs/fuse
diff options
context:
space:
mode:
Diffstat (limited to 'fs/fuse')
-rw-r--r--fs/fuse/Kconfig12
-rw-r--r--fs/fuse/Makefile5
-rw-r--r--fs/fuse/acl.c10
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c33
-rw-r--r--fs/fuse/dax.c52
-rw-r--r--fs/fuse/dev.c598
-rw-r--r--fs/fuse/dev_uring.c1352
-rw-r--r--fs/fuse/dev_uring_i.h211
-rw-r--r--fs/fuse/dir.c333
-rw-r--r--fs/fuse/file.c670
-rw-r--r--fs/fuse/fuse_dev_i.h70
-rw-r--r--fs/fuse/fuse_i.h196
-rw-r--r--fs/fuse/fuse_trace.h132
-rw-r--r--fs/fuse/inode.c175
-rw-r--r--fs/fuse/ioctl.c93
-rw-r--r--fs/fuse/iomode.c60
-rw-r--r--fs/fuse/passthrough.c44
-rw-r--r--fs/fuse/readdir.c37
-rw-r--r--fs/fuse/sysctl.c64
-rw-r--r--fs/fuse/virtio_fs.c424
-rw-r--r--fs/fuse/xattr.c11
22 files changed, 3633 insertions, 953 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 8674dbfbe59d..ca215a3cba3e 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH
to be performed directly on a backing file.
If you want to allow passthrough operations, answer Y.
+
+config FUSE_IO_URING
+ bool "FUSE communication over io-uring"
+ default y
+ depends on FUSE_FS
+ depends on IO_URING
+ help
+ This allows sending FUSE requests over the io-uring interface and
+ also adds request core affinity.
+
+ If you want to allow fuse server/client communication through io-uring,
+ answer Y
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 6e0228c6d0cb..3f0f312a31c1 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -3,6 +3,9 @@
# Makefile for the FUSE filesystem.
#
+# Needed for trace events
+ccflags-y = -I$(src)
+
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
@@ -11,5 +14,7 @@ fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
+fuse-$(CONFIG_SYSCTL) += sysctl.o
+fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 3d192b80a561..8f484b105f13 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -12,7 +12,6 @@
#include <linux/posix_acl_xattr.h>
static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc,
- struct mnt_idmap *idmap,
struct inode *inode, int type, bool rcu)
{
int size;
@@ -74,7 +73,7 @@ struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap,
if (fuse_no_acl(fc, inode))
return ERR_PTR(-EOPNOTSUPP);
- return __fuse_get_acl(fc, idmap, inode, type, false);
+ return __fuse_get_acl(fc, inode, type, false);
}
struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu)
@@ -90,8 +89,7 @@ struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu)
*/
if (!fc->posix_acl)
return NULL;
-
- return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu);
+ return __fuse_get_acl(fc, inode, type, rcu);
}
int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -146,8 +144,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
* be stripped.
*/
if (fc->posix_acl &&
- !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) &&
- !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID))
+ !in_group_or_capable(idmap, inode,
+ i_gid_into_vfsgid(idmap, inode)))
extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 97ac994ff78f..2a730d88cc3b 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -183,27 +183,23 @@ out:
static const struct file_operations fuse_ctl_abort_ops = {
.open = nonseekable_open,
.write = fuse_conn_abort_write,
- .llseek = no_llseek,
};
static const struct file_operations fuse_ctl_waiting_ops = {
.open = nonseekable_open,
.read = fuse_conn_waiting_read,
- .llseek = no_llseek,
};
static const struct file_operations fuse_conn_max_background_ops = {
.open = nonseekable_open,
.read = fuse_conn_max_background_read,
.write = fuse_conn_max_background_write,
- .llseek = no_llseek,
};
static const struct file_operations fuse_conn_congestion_threshold_ops = {
.open = nonseekable_open,
.read = fuse_conn_congestion_threshold_read,
.write = fuse_conn_congestion_threshold_write,
- .llseek = no_llseek,
};
static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b6cad106c37e..b39844d75a80 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -303,13 +303,17 @@ struct cuse_init_args {
struct fuse_args_pages ap;
struct cuse_init_in in;
struct cuse_init_out out;
- struct page *page;
- struct fuse_page_desc desc;
+ struct folio *folio;
+ struct fuse_folio_desc desc;
};
/**
* cuse_process_init_reply - finish initializing CUSE channel
*
+ * @fm: The fuse mount information containing the CUSE connection.
+ * @args: The arguments passed to the init reply.
+ * @error: The error code signifying if any error occurred during the process.
+ *
* This function creates the character device and sets up all the
* required data structures for it. Please read the comment at the
* top of this file for high level overview.
@@ -322,7 +326,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm,
struct fuse_args_pages *ap = &ia->ap;
struct cuse_conn *cc = fc_to_cc(fc), *pos;
struct cuse_init_out *arg = &ia->out;
- struct page *page = ap->pages[0];
+ struct folio *folio = ap->folios[0];
struct cuse_devinfo devinfo = { };
struct device *dev;
struct cdev *cdev;
@@ -339,7 +343,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm,
/* parse init reply */
cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
- rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size,
+ rc = cuse_parse_devinfo(folio_address(folio), ap->args.out_args[1].size,
&devinfo);
if (rc)
goto err;
@@ -407,7 +411,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm,
kobject_uevent(&dev->kobj, KOBJ_ADD);
out:
kfree(ia);
- __free_page(page);
+ folio_put(folio);
return;
err_cdev:
@@ -425,7 +429,7 @@ err:
static int cuse_send_init(struct cuse_conn *cc)
{
int rc;
- struct page *page;
+ struct folio *folio;
struct fuse_mount *fm = &cc->fm;
struct cuse_init_args *ia;
struct fuse_args_pages *ap;
@@ -433,13 +437,14 @@ static int cuse_send_init(struct cuse_conn *cc)
BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
rc = -ENOMEM;
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page)
+
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+ if (!folio)
goto err;
ia = kzalloc(sizeof(*ia), GFP_KERNEL);
if (!ia)
- goto err_free_page;
+ goto err_free_folio;
ap = &ia->ap;
ia->in.major = FUSE_KERNEL_VERSION;
@@ -455,18 +460,18 @@ static int cuse_send_init(struct cuse_conn *cc)
ap->args.out_args[1].size = CUSE_INIT_INFO_MAX;
ap->args.out_argvar = true;
ap->args.out_pages = true;
- ap->num_pages = 1;
- ap->pages = &ia->page;
+ ap->num_folios = 1;
+ ap->folios = &ia->folio;
ap->descs = &ia->desc;
- ia->page = page;
+ ia->folio = folio;
ia->desc.length = ap->args.out_args[1].size;
ap->args.end = cuse_process_init_reply;
rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (rc) {
kfree(ia);
-err_free_page:
- __free_page(page);
+err_free_folio:
+ folio_put(folio);
}
err:
return rc;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 12ef91d170bb..0502bf3cdf6a 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -240,11 +240,12 @@ static int fuse_send_removemapping(struct inode *inode,
args.opcode = FUSE_REMOVEMAPPING;
args.nodeid = fi->nodeid;
- args.in_numargs = 2;
- args.in_args[0].size = sizeof(*inargp);
- args.in_args[0].value = inargp;
- args.in_args[1].size = inargp->count * sizeof(*remove_one);
- args.in_args[1].value = remove_one;
+ args.in_numargs = 3;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = sizeof(*inargp);
+ args.in_args[1].value = inargp;
+ args.in_args[2].size = inargp->count * sizeof(*remove_one);
+ args.in_args[2].value = remove_one;
return fuse_simple_request(fm, &args);
}
@@ -665,36 +666,12 @@ static void fuse_wait_dax_page(struct inode *inode)
filemap_invalidate_lock(inode->i_mapping);
}
-/* Should be called with mapping->invalidate_lock held exclusively */
-static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
- loff_t start, loff_t end)
-{
- struct page *page;
-
- page = dax_layout_busy_page_range(inode->i_mapping, start, end);
- if (!page)
- return 0;
-
- *retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, fuse_wait_dax_page(inode));
-}
-
-/* dmap_end == 0 leads to unmapping of whole file */
+/* Should be called with mapping->invalidate_lock held exclusively. */
int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
u64 dmap_end)
{
- bool retry;
- int ret;
-
- do {
- retry = false;
- ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
- dmap_end);
- } while (ret == 0 && retry);
-
- return ret;
+ return dax_break_layout(inode, dmap_start, dmap_end,
+ fuse_wait_dax_page);
}
ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -774,16 +751,6 @@ out:
return ret;
}
-static int fuse_dax_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
-
- struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
-
- return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
-}
-
static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
bool write)
{
@@ -1323,7 +1290,6 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
}
static const struct address_space_operations fuse_dax_file_aops = {
- .writepages = fuse_dax_writepages,
.direct_IO = noop_direct_IO,
.dirty_folio = noop_dirty_folio,
};
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 3ec8bb5e68ff..6dcbaa218b7a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -6,7 +6,9 @@
See the file COPYING.
*/
+#include "dev_uring_i.h"
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#include <linux/init.h>
#include <linux/module.h>
@@ -22,22 +24,106 @@
#include <linux/splice.h>
#include <linux/sched.h>
+#define CREATE_TRACE_POINTS
+#include "fuse_trace.h"
+
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
-/* Ordinary requests have even IDs, while interrupts IDs are odd */
-#define FUSE_INT_REQ_BIT (1ULL << 0)
-#define FUSE_REQ_ID_STEP (1ULL << 1)
-
static struct kmem_cache *fuse_req_cachep;
-static struct fuse_dev *fuse_get_dev(struct file *file)
+const unsigned long fuse_timeout_timer_freq =
+ secs_to_jiffies(FUSE_TIMEOUT_TIMER_FREQ);
+
+bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list)
{
- /*
- * Lockless access is OK, because file->private data is set
- * once during mount and is valid until the file is released.
- */
- return READ_ONCE(file->private_data);
+ struct fuse_req *req;
+
+ req = list_first_entry_or_null(list, struct fuse_req, list);
+ if (!req)
+ return false;
+ return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout);
+}
+
+bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing)
+{
+ int i;
+
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ if (fuse_request_expired(fc, &processing[i]))
+ return true;
+
+ return false;
+}
+
+/*
+ * Check if any requests aren't being completed by the time the request timeout
+ * elapses. To do so, we:
+ * - check the fiq pending list
+ * - check the bg queue
+ * - check the fpq io and processing lists
+ *
+ * To make this fast, we only check against the head request on each list since
+ * these are generally queued in order of creation time (eg newer requests get
+ * queued to the tail). We might miss a few edge cases (eg requests transitioning
+ * between lists, re-sent requests at the head of the pending list having a
+ * later creation time than other requests on that list, etc.) but that is fine
+ * since if the request never gets fulfilled, it will eventually be caught.
+ */
+void fuse_check_timeout(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct fuse_conn *fc = container_of(dwork, struct fuse_conn,
+ timeout.work);
+ struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_dev *fud;
+ struct fuse_pqueue *fpq;
+ bool expired = false;
+
+ if (!atomic_read(&fc->num_waiting))
+ goto out;
+
+ spin_lock(&fiq->lock);
+ expired = fuse_request_expired(fc, &fiq->pending);
+ spin_unlock(&fiq->lock);
+ if (expired)
+ goto abort_conn;
+
+ spin_lock(&fc->bg_lock);
+ expired = fuse_request_expired(fc, &fc->bg_queue);
+ spin_unlock(&fc->bg_lock);
+ if (expired)
+ goto abort_conn;
+
+ spin_lock(&fc->lock);
+ if (!fc->connected) {
+ spin_unlock(&fc->lock);
+ return;
+ }
+ list_for_each_entry(fud, &fc->devices, entry) {
+ fpq = &fud->pq;
+ spin_lock(&fpq->lock);
+ if (fuse_request_expired(fc, &fpq->io) ||
+ fuse_fpq_processing_expired(fc, fpq->processing)) {
+ spin_unlock(&fpq->lock);
+ spin_unlock(&fc->lock);
+ goto abort_conn;
+ }
+
+ spin_unlock(&fpq->lock);
+ }
+ spin_unlock(&fc->lock);
+
+ if (fuse_uring_request_expired(fc))
+ goto abort_conn;
+
+out:
+ queue_delayed_work(system_wq, &fc->timeout.work,
+ fuse_timeout_timer_freq);
+ return;
+
+abort_conn:
+ fuse_abort_conn(fc);
}
static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
@@ -48,6 +134,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
refcount_set(&req->count, 1);
__set_bit(FR_PENDING, &req->flags);
req->fm = fm;
+ req->create_time = jiffies;
}
static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
@@ -84,7 +171,8 @@ void fuse_set_initialized(struct fuse_conn *fc)
static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
{
- return !fc->initialized || (for_background && fc->blocked);
+ return !fc->initialized || (for_background && fc->blocked) ||
+ (fc->io_uring && fc->connected && !fuse_uring_ready(fc));
}
static void fuse_drop_waiting(struct fuse_conn *fc)
@@ -103,11 +191,17 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
static void fuse_put_request(struct fuse_req *req);
-static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
+static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap,
+ struct fuse_mount *fm,
+ bool for_background)
{
struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
+ bool no_idmap = !fm->sb || (fm->sb->s_iflags & SB_I_NOIDMAP);
+ kuid_t fsuid;
+ kgid_t fsgid;
int err;
+
atomic_inc(&fc->num_waiting);
if (fuse_block_alloc(fc, for_background)) {
@@ -135,19 +229,32 @@ static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
goto out;
}
- req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
- req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
__set_bit(FR_WAITING, &req->flags);
if (for_background)
__set_bit(FR_BACKGROUND, &req->flags);
- if (unlikely(req->in.h.uid == ((uid_t)-1) ||
- req->in.h.gid == ((gid_t)-1))) {
+ /*
+ * Keep the old behavior when idmappings support was not
+ * declared by a FUSE server.
+ *
+ * For those FUSE servers who support idmapped mounts,
+ * we send UID/GID only along with "inode creation"
+ * fuse requests, otherwise idmap == &invalid_mnt_idmap and
+ * req->in.h.{u,g}id will be equal to FUSE_INVALID_UIDGID.
+ */
+ fsuid = no_idmap ? current_fsuid() : mapped_fsuid(idmap, fc->user_ns);
+ fsgid = no_idmap ? current_fsgid() : mapped_fsgid(idmap, fc->user_ns);
+ req->in.h.uid = from_kuid(fc->user_ns, fsuid);
+ req->in.h.gid = from_kgid(fc->user_ns, fsgid);
+
+ if (no_idmap && unlikely(req->in.h.uid == ((uid_t)-1) ||
+ req->in.h.gid == ((gid_t)-1))) {
fuse_put_request(req);
return ERR_PTR(-EOVERFLOW);
}
+
return req;
out:
@@ -192,14 +299,25 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
}
EXPORT_SYMBOL_GPL(fuse_len_args);
-u64 fuse_get_unique(struct fuse_iqueue *fiq)
+static u64 fuse_get_unique_locked(struct fuse_iqueue *fiq)
{
fiq->reqctr += FUSE_REQ_ID_STEP;
return fiq->reqctr;
}
+
+u64 fuse_get_unique(struct fuse_iqueue *fiq)
+{
+ u64 ret;
+
+ spin_lock(&fiq->lock);
+ ret = fuse_get_unique_locked(fiq);
+ spin_unlock(&fiq->lock);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(fuse_get_unique);
-static unsigned int fuse_req_hash(u64 unique)
+unsigned int fuse_req_hash(u64 unique)
{
return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
@@ -215,22 +333,71 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
}
+void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget)
+{
+ spin_lock(&fiq->lock);
+ if (fiq->connected) {
+ fiq->forget_list_tail->next = forget;
+ fiq->forget_list_tail = forget;
+ fuse_dev_wake_and_unlock(fiq);
+ } else {
+ kfree(forget);
+ spin_unlock(&fiq->lock);
+ }
+}
+
+void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+ spin_lock(&fiq->lock);
+ if (list_empty(&req->intr_entry)) {
+ list_add_tail(&req->intr_entry, &fiq->interrupts);
+ /*
+ * Pairs with smp_mb() implied by test_and_set_bit()
+ * from fuse_request_end().
+ */
+ smp_mb();
+ if (test_bit(FR_FINISHED, &req->flags)) {
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->lock);
+ } else {
+ fuse_dev_wake_and_unlock(fiq);
+ }
+ } else {
+ spin_unlock(&fiq->lock);
+ }
+}
+
+static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+ spin_lock(&fiq->lock);
+ if (fiq->connected) {
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique_locked(fiq);
+ list_add_tail(&req->list, &fiq->pending);
+ fuse_dev_wake_and_unlock(fiq);
+ } else {
+ spin_unlock(&fiq->lock);
+ req->out.h.error = -ENOTCONN;
+ clear_bit(FR_PENDING, &req->flags);
+ fuse_request_end(req);
+ }
+}
+
const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
- .wake_forget_and_unlock = fuse_dev_wake_and_unlock,
- .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock,
- .wake_pending_and_unlock = fuse_dev_wake_and_unlock,
+ .send_forget = fuse_dev_queue_forget,
+ .send_interrupt = fuse_dev_queue_interrupt,
+ .send_req = fuse_dev_queue_req,
};
EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
-static void queue_request_and_unlock(struct fuse_iqueue *fiq,
- struct fuse_req *req)
-__releases(fiq->lock)
+static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
{
req->in.h.len = sizeof(struct fuse_in_header) +
fuse_len_args(req->args->in_numargs,
(struct fuse_arg *) req->args->in_args);
- list_add_tail(&req->list, &fiq->pending);
- fiq->ops->wake_pending_and_unlock(fiq);
+ trace_fuse_request_send(req);
+ fiq->ops->send_req(fiq, req);
}
void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
@@ -241,15 +408,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
forget->forget_one.nodeid = nodeid;
forget->forget_one.nlookup = nlookup;
- spin_lock(&fiq->lock);
- if (fiq->connected) {
- fiq->forget_list_tail->next = forget;
- fiq->forget_list_tail = forget;
- fiq->ops->wake_forget_and_unlock(fiq);
- } else {
- kfree(forget);
- spin_unlock(&fiq->lock);
- }
+ fiq->ops->send_forget(fiq, forget);
}
static void flush_bg_queue(struct fuse_conn *fc)
@@ -263,9 +422,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
list_del(&req->list);
fc->active_background++;
- spin_lock(&fiq->lock);
- req->in.h.unique = fuse_get_unique(fiq);
- queue_request_and_unlock(fiq, req);
+ fuse_send_one(fiq, req);
}
}
@@ -286,6 +443,7 @@ void fuse_request_end(struct fuse_req *req)
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
+ trace_fuse_request_end(req);
/*
* test_and_set_bit() implies smp_mb() between bit
* changing and below FR_INTERRUPTED check. Pairs with
@@ -335,30 +493,31 @@ static int queue_interrupt(struct fuse_req *req)
{
struct fuse_iqueue *fiq = &req->fm->fc->iq;
- spin_lock(&fiq->lock);
/* Check for we've sent request to interrupt this req */
- if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
- spin_unlock(&fiq->lock);
+ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags)))
return -EINVAL;
- }
- if (list_empty(&req->intr_entry)) {
- list_add_tail(&req->intr_entry, &fiq->interrupts);
+ fiq->ops->send_interrupt(fiq, req);
+
+ return 0;
+}
+
+bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (test_bit(FR_PENDING, &req->flags)) {
/*
- * Pairs with smp_mb() implied by test_and_set_bit()
- * from fuse_request_end().
+ * FR_PENDING does not get cleared as the request will end
+ * up in destruction anyway.
*/
- smp_mb();
- if (test_bit(FR_FINISHED, &req->flags)) {
- list_del_init(&req->intr_entry);
- spin_unlock(&fiq->lock);
- return 0;
- }
- fiq->ops->wake_interrupt_and_unlock(fiq);
- } else {
- spin_unlock(&fiq->lock);
+ list_del(&req->list);
+ spin_unlock(lock);
+ __fuse_put_request(req);
+ req->out.h.error = -EINTR;
+ return true;
}
- return 0;
+ spin_unlock(lock);
+ return false;
}
static void request_wait_answer(struct fuse_req *req)
@@ -382,22 +541,20 @@ static void request_wait_answer(struct fuse_req *req)
}
if (!test_bit(FR_FORCE, &req->flags)) {
+ bool removed;
+
/* Only fatal signals may interrupt this */
err = wait_event_killable(req->waitq,
test_bit(FR_FINISHED, &req->flags));
if (!err)
return;
- spin_lock(&fiq->lock);
- /* Request is not yet in userspace, bail out */
- if (test_bit(FR_PENDING, &req->flags)) {
- list_del(&req->list);
- spin_unlock(&fiq->lock);
- __fuse_put_request(req);
- req->out.h.error = -EINTR;
+ if (test_bit(FR_URING, &req->flags))
+ removed = fuse_uring_remove_pending_req(req);
+ else
+ removed = fuse_remove_pending_req(req, &fiq->lock);
+ if (removed)
return;
- }
- spin_unlock(&fiq->lock);
}
/*
@@ -412,21 +569,15 @@ static void __fuse_request_send(struct fuse_req *req)
struct fuse_iqueue *fiq = &req->fm->fc->iq;
BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
- spin_lock(&fiq->lock);
- if (!fiq->connected) {
- spin_unlock(&fiq->lock);
- req->out.h.error = -ENOTCONN;
- } else {
- req->in.h.unique = fuse_get_unique(fiq);
- /* acquire extra reference, since request is still needed
- after fuse_request_end() */
- __fuse_get_request(req);
- queue_request_and_unlock(fiq, req);
- request_wait_answer(req);
- /* Pairs with smp_wmb() in fuse_request_end() */
- smp_rmb();
- }
+ /* acquire extra reference, since request is still needed after
+ fuse_request_end() */
+ __fuse_get_request(req);
+ fuse_send_one(fiq, req);
+
+ request_wait_answer(req);
+ /* Pairs with smp_wmb() in fuse_request_end() */
+ smp_rmb();
}
static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
@@ -466,8 +617,14 @@ static void fuse_force_creds(struct fuse_req *req)
{
struct fuse_conn *fc = req->fm->fc;
- req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
- req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
+ if (!req->fm->sb || req->fm->sb->s_iflags & SB_I_NOIDMAP) {
+ req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
+ req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
+ } else {
+ req->in.h.uid = FUSE_INVALID_UIDGID;
+ req->in.h.gid = FUSE_INVALID_UIDGID;
+ }
+
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
}
@@ -482,7 +639,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
__set_bit(FR_ASYNC, &req->flags);
}
-ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
+ssize_t __fuse_simple_request(struct mnt_idmap *idmap,
+ struct fuse_mount *fm,
+ struct fuse_args *args)
{
struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
@@ -499,7 +658,7 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
__set_bit(FR_FORCE, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fm, false);
+ req = fuse_get_req(idmap, fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
}
@@ -521,7 +680,25 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
return ret;
}
-static bool fuse_request_queue_background(struct fuse_req *req)
+#ifdef CONFIG_FUSE_IO_URING
+static bool fuse_request_queue_background_uring(struct fuse_conn *fc,
+ struct fuse_req *req)
+{
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ req->in.h.unique = fuse_get_unique(fiq);
+ req->in.h.len = sizeof(struct fuse_in_header) +
+ fuse_len_args(req->args->in_numargs,
+ (struct fuse_arg *) req->args->in_args);
+
+ return fuse_uring_queue_bq_req(req);
+}
+#endif
+
+/*
+ * @return true if queued
+ */
+static int fuse_request_queue_background(struct fuse_req *req)
{
struct fuse_mount *fm = req->fm;
struct fuse_conn *fc = fm->fc;
@@ -533,6 +710,12 @@ static bool fuse_request_queue_background(struct fuse_req *req)
atomic_inc(&fc->num_waiting);
}
__set_bit(FR_ISREPLY, &req->flags);
+
+#ifdef CONFIG_FUSE_IO_URING
+ if (fuse_uring_ready(fc))
+ return fuse_request_queue_background_uring(fc, req);
+#endif
+
spin_lock(&fc->bg_lock);
if (likely(fc->connected)) {
fc->num_background++;
@@ -560,7 +743,7 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
__set_bit(FR_BACKGROUND, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fm, true);
+ req = fuse_get_req(&invalid_mnt_idmap, fm, true);
if (IS_ERR(req))
return PTR_ERR(req);
}
@@ -581,9 +764,8 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm,
{
struct fuse_req *req;
struct fuse_iqueue *fiq = &fm->fc->iq;
- int err = 0;
- req = fuse_get_req(fm, false);
+ req = fuse_get_req(&invalid_mnt_idmap, fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -592,16 +774,9 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm,
fuse_args_to_req(req, args);
- spin_lock(&fiq->lock);
- if (fiq->connected) {
- queue_request_and_unlock(fiq, req);
- } else {
- err = -ENODEV;
- spin_unlock(&fiq->lock);
- fuse_put_request(req);
- }
+ fuse_send_one(fiq, req);
- return err;
+ return 0;
}
/*
@@ -641,22 +816,8 @@ static int unlock_request(struct fuse_req *req)
return err;
}
-struct fuse_copy_state {
- int write;
- struct fuse_req *req;
- struct iov_iter *iter;
- struct pipe_buffer *pipebufs;
- struct pipe_buffer *currbuf;
- struct pipe_inode_info *pipe;
- unsigned long nr_segs;
- struct page *pg;
- unsigned len;
- unsigned offset;
- unsigned move_pages:1;
-};
-
-static void fuse_copy_init(struct fuse_copy_state *cs, int write,
- struct iov_iter *iter)
+void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct iov_iter *iter)
{
memset(cs, 0, sizeof(*cs));
cs->write = write;
@@ -763,6 +924,9 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
*size -= ncpy;
cs->len -= ncpy;
cs->offset += ncpy;
+ if (cs->is_uring)
+ cs->ring.copied_sz += ncpy;
+
return ncpy;
}
@@ -773,7 +937,6 @@ static int fuse_check_folio(struct folio *folio)
(folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
- 1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
1 << PG_workingset |
@@ -786,6 +949,12 @@ static int fuse_check_folio(struct folio *folio)
return 0;
}
+/*
+ * Attempt to steal a page from the splice() pipe and move it into the
+ * pagecache. If successful, the pointer in @pagep will be updated. The
+ * folio that was originally in @pagep will lose a reference and the new
+ * folio returned in @pagep will carry a reference.
+ */
static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
{
int err;
@@ -818,9 +987,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
newfolio = page_folio(buf->page);
- if (!folio_test_uptodate(newfolio))
- folio_mark_uptodate(newfolio);
-
+ folio_clear_uptodate(newfolio);
folio_clear_mappedtodisk(newfolio);
if (fuse_check_folio(newfolio) != 0)
@@ -980,17 +1147,27 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
struct fuse_req *req = cs->req;
struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
-
- for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) {
+ for (i = 0; i < ap->num_folios && (nbytes || zeroing); i++) {
int err;
unsigned int offset = ap->descs[i].offset;
unsigned int count = min(nbytes, ap->descs[i].length);
+ struct page *orig, *pagep;
- err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing);
+ orig = pagep = &ap->folios[i]->page;
+
+ err = fuse_copy_page(cs, &pagep, offset, count, zeroing);
if (err)
return err;
nbytes -= count;
+
+ /*
+ * fuse_copy_page may have moved a page from a pipe instead of
+ * copying into our given page, so update the folios if it was
+ * replaced.
+ */
+ if (pagep != orig)
+ ap->folios[i] = page_folio(pagep);
}
return 0;
}
@@ -1010,9 +1187,9 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
}
/* Copy request arguments to/from userspace buffer */
-static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
- unsigned argpages, struct fuse_arg *args,
- int zeroing)
+int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+ unsigned argpages, struct fuse_arg *args,
+ int zeroing)
{
int err = 0;
unsigned i;
@@ -1076,9 +1253,9 @@ __releases(fiq->lock)
return err ? err : reqsize;
}
-struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
- unsigned int max,
- unsigned int *countp)
+static struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+ unsigned int max,
+ unsigned int *countp)
{
struct fuse_forget_link *head = fiq->forget_list_head.next;
struct fuse_forget_link **newhead = &head;
@@ -1097,7 +1274,6 @@ struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
return head;
}
-EXPORT_SYMBOL(fuse_dequeue_forget);
static int fuse_read_single_forget(struct fuse_iqueue *fiq,
struct fuse_copy_state *cs,
@@ -1112,7 +1288,7 @@ __releases(fiq->lock)
struct fuse_in_header ih = {
.opcode = FUSE_FORGET,
.nodeid = forget->forget_one.nodeid,
- .unique = fuse_get_unique(fiq),
+ .unique = fuse_get_unique_locked(fiq),
.len = sizeof(ih) + sizeof(arg),
};
@@ -1143,7 +1319,7 @@ __releases(fiq->lock)
struct fuse_batch_forget_in arg = { .count = 0 };
struct fuse_in_header ih = {
.opcode = FUSE_BATCH_FORGET,
- .unique = fuse_get_unique(fiq),
+ .unique = fuse_get_unique_locked(fiq),
.len = sizeof(ih) + sizeof(arg),
};
@@ -1392,7 +1568,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (ret < 0)
goto out;
- if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) {
+ if (pipe_buf_usage(pipe) + cs.nr_segs > pipe->max_usage) {
ret = -EIO;
goto out;
}
@@ -1468,14 +1644,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_inval_entry_out outarg;
- int err = -ENOMEM;
- char *buf;
+ int err;
+ char *buf = NULL;
struct qstr name;
- buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
- if (!buf)
- goto err;
-
err = -EINVAL;
if (size < sizeof(outarg))
goto err;
@@ -1485,13 +1657,18 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
goto err;
err = -ENAMETOOLONG;
- if (outarg.namelen > FUSE_NAME_MAX)
+ if (outarg.namelen > fc->name_max)
goto err;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
goto err;
+ err = -ENOMEM;
+ buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
name.name = buf;
name.len = outarg.namelen;
err = fuse_copy_one(cs, buf, outarg.namelen + 1);
@@ -1516,14 +1693,10 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_delete_out outarg;
- int err = -ENOMEM;
- char *buf;
+ int err;
+ char *buf = NULL;
struct qstr name;
- buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
- if (!buf)
- goto err;
-
err = -EINVAL;
if (size < sizeof(outarg))
goto err;
@@ -1533,13 +1706,18 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
goto err;
err = -ENAMETOOLONG;
- if (outarg.namelen > FUSE_NAME_MAX)
+ if (outarg.namelen > fc->name_max)
goto err;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
goto err;
+ err = -ENOMEM;
+ buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
name.name = buf;
name.len = outarg.namelen;
err = fuse_copy_one(cs, buf, outarg.namelen + 1);
@@ -1607,22 +1785,25 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
num = outarg.size;
while (num) {
+ struct folio *folio;
struct page *page;
unsigned int this_num;
- err = -ENOMEM;
- page = find_or_create_page(mapping, index,
- mapping_gfp_mask(mapping));
- if (!page)
+ folio = filemap_grab_folio(mapping, index);
+ err = PTR_ERR(folio);
+ if (IS_ERR(folio))
goto out_iput;
- this_num = min_t(unsigned, num, PAGE_SIZE - offset);
+ page = &folio->page;
+ this_num = min_t(unsigned, num, folio_size(folio) - offset);
err = fuse_copy_page(cs, &page, offset, this_num, 0);
- if (!err && offset == 0 &&
- (this_num == PAGE_SIZE || file_size == end))
- SetPageUptodate(page);
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio) && !err && offset == 0 &&
+ (this_num == folio_size(folio) || file_size == end)) {
+ folio_zero_segment(folio, this_num, folio_size(folio));
+ folio_mark_uptodate(folio);
+ }
+ folio_unlock(folio);
+ folio_put(folio);
if (err)
goto out_iput;
@@ -1654,7 +1835,7 @@ static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_retrieve_args *ra =
container_of(args, typeof(*ra), ap.args);
- release_pages(ra->ap.pages, ra->ap.num_pages);
+ release_pages(ra->ap.folios, ra->ap.num_folios);
kfree(ra);
}
@@ -1668,7 +1849,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
unsigned int num;
unsigned int offset;
size_t total_len = 0;
- unsigned int num_pages;
+ unsigned int num_pages, cur_pages = 0;
struct fuse_conn *fc = fm->fc;
struct fuse_retrieve_args *ra;
size_t args_size = sizeof(*ra);
@@ -1687,38 +1868,39 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
num_pages = min(num_pages, fc->max_pages);
- args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0]));
+ args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0]));
ra = kzalloc(args_size, GFP_KERNEL);
if (!ra)
return -ENOMEM;
ap = &ra->ap;
- ap->pages = (void *) (ra + 1);
- ap->descs = (void *) (ap->pages + num_pages);
+ ap->folios = (void *) (ra + 1);
+ ap->descs = (void *) (ap->folios + num_pages);
args = &ap->args;
args->nodeid = outarg->nodeid;
args->opcode = FUSE_NOTIFY_REPLY;
- args->in_numargs = 2;
+ args->in_numargs = 3;
args->in_pages = true;
args->end = fuse_retrieve_end;
index = outarg->offset >> PAGE_SHIFT;
- while (num && ap->num_pages < num_pages) {
- struct page *page;
+ while (num && cur_pages < num_pages) {
+ struct folio *folio;
unsigned int this_num;
- page = find_get_page(mapping, index);
- if (!page)
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio))
break;
this_num = min_t(unsigned, num, PAGE_SIZE - offset);
- ap->pages[ap->num_pages] = page;
- ap->descs[ap->num_pages].offset = offset;
- ap->descs[ap->num_pages].length = this_num;
- ap->num_pages++;
+ ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].offset = offset;
+ ap->descs[ap->num_folios].length = this_num;
+ ap->num_folios++;
+ cur_pages++;
offset = 0;
num -= this_num;
@@ -1727,9 +1909,10 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
}
ra->inarg.offset = outarg->offset;
ra->inarg.size = total_len;
- args->in_args[0].size = sizeof(ra->inarg);
- args->in_args[0].value = &ra->inarg;
- args->in_args[1].size = total_len;
+ fuse_set_zero_arg0(args);
+ args->in_args[1].size = sizeof(ra->inarg);
+ args->in_args[1].value = &ra->inarg;
+ args->in_args[2].size = total_len;
err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
if (err)
@@ -1813,15 +1996,23 @@ static void fuse_resend(struct fuse_conn *fc)
spin_unlock(&fc->lock);
list_for_each_entry_safe(req, next, &to_queue, list) {
- __set_bit(FR_PENDING, &req->flags);
+ set_bit(FR_PENDING, &req->flags);
+ clear_bit(FR_SENT, &req->flags);
/* mark the request as resend request */
req->in.h.unique |= FUSE_UNIQUE_RESEND;
}
spin_lock(&fiq->lock);
+ if (!fiq->connected) {
+ spin_unlock(&fiq->lock);
+ list_for_each_entry(req, &to_queue, list)
+ clear_bit(FR_PENDING, &req->flags);
+ fuse_dev_end_requests(&to_queue);
+ return;
+ }
/* iq and pq requests are both oldest to newest */
list_splice(&to_queue, &fiq->pending);
- fiq->ops->wake_pending_and_unlock(fiq);
+ fuse_dev_wake_and_unlock(fiq);
}
static int fuse_notify_resend(struct fuse_conn *fc)
@@ -1865,7 +2056,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
}
/* Look up request on processing list by unique ID */
-static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
+struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique)
{
unsigned int hash = fuse_req_hash(unique);
struct fuse_req *req;
@@ -1877,10 +2068,17 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
return NULL;
}
-static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
- unsigned nbytes)
+int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
+ unsigned nbytes)
{
- unsigned reqsize = sizeof(struct fuse_out_header);
+
+ unsigned int reqsize = 0;
+
+ /*
+ * Uring has all headers separated from args - args is payload only
+ */
+ if (!cs->is_uring)
+ reqsize = sizeof(struct fuse_out_header);
reqsize += fuse_len_args(args->out_numargs, args->out_args);
@@ -1942,7 +2140,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
spin_lock(&fpq->lock);
req = NULL;
if (fpq->connected)
- req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
+ req = fuse_request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
err = -ENOENT;
if (!req) {
@@ -1980,7 +2178,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
if (oh.error)
err = nbytes != sizeof(oh) ? -EINVAL : 0;
else
- err = copy_out_args(cs, req->args, nbytes);
+ err = fuse_copy_out_args(cs, req->args, nbytes);
fuse_copy_finish(cs);
spin_lock(&fpq->lock);
@@ -2022,7 +2220,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
{
- unsigned int head, tail, mask, count;
+ unsigned int head, tail, count;
unsigned nbuf;
unsigned idx;
struct pipe_buffer *bufs;
@@ -2039,8 +2237,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
- count = head - tail;
+ count = pipe_occupancy(head, tail);
bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
if (!bufs) {
@@ -2050,8 +2247,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
nbuf = 0;
rem = 0;
- for (idx = tail; idx != head && rem < len; idx++)
- rem += pipe->bufs[idx & mask].len;
+ for (idx = tail; !pipe_empty(head, idx) && rem < len; idx++)
+ rem += pipe_buf(pipe, idx)->len;
ret = -EINVAL;
if (rem < len)
@@ -2062,10 +2259,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;
- if (WARN_ON(nbuf >= count || tail == head))
+ if (WARN_ON(nbuf >= count || pipe_empty(head, tail)))
goto out_free;
- ibuf = &pipe->bufs[tail & mask];
+ ibuf = pipe_buf(pipe, tail);
obuf = &bufs[nbuf];
if (rem >= ibuf->len) {
@@ -2135,7 +2332,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
}
/* Abort all requests on the given list (pending or processing) */
-static void end_requests(struct list_head *head)
+void fuse_dev_end_requests(struct list_head *head)
{
while (!list_empty(head)) {
struct fuse_req *req;
@@ -2191,6 +2388,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
LIST_HEAD(to_end);
unsigned int i;
+ if (fc->timeout.req_timeout)
+ cancel_delayed_work(&fc->timeout.work);
+
/* Background queuing checks fc->connected under bg_lock */
spin_lock(&fc->bg_lock);
fc->connected = 0;
@@ -2238,7 +2438,13 @@ void fuse_abort_conn(struct fuse_conn *fc)
wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
- end_requests(&to_end);
+ fuse_dev_end_requests(&to_end);
+
+ /*
+ * fc->lock must not be taken to avoid conflicts with io-uring
+ * locks
+ */
+ fuse_uring_abort(fc);
} else {
spin_unlock(&fc->lock);
}
@@ -2250,6 +2456,8 @@ void fuse_wait_aborted(struct fuse_conn *fc)
/* matches implicit memory barrier in fuse_drop_waiting() */
smp_mb();
wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0);
+
+ fuse_uring_wait_stopped_queues(fc);
}
int fuse_dev_release(struct inode *inode, struct file *file)
@@ -2268,7 +2476,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
- end_requests(&to_end);
+ fuse_dev_end_requests(&to_end);
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
@@ -2314,21 +2522,20 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
int res;
int oldfd;
struct fuse_dev *fud = NULL;
- struct fd f;
if (get_user(oldfd, argp))
return -EFAULT;
- f = fdget(oldfd);
- if (!f.file)
+ CLASS(fd, f)(oldfd);
+ if (fd_empty(f))
return -EINVAL;
/*
* Check against file->f_op because CUSE
* uses the same ioctl handler.
*/
- if (f.file->f_op == file->f_op)
- fud = fuse_get_dev(f.file);
+ if (fd_file(f)->f_op == file->f_op)
+ fud = fuse_get_dev(fd_file(f));
res = -EINVAL;
if (fud) {
@@ -2337,7 +2544,6 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
mutex_unlock(&fuse_mutex);
}
- fdput(f);
return res;
}
@@ -2399,7 +2605,6 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
.open = fuse_dev_open,
- .llseek = no_llseek,
.read_iter = fuse_dev_read,
.splice_read = fuse_dev_splice_read,
.write_iter = fuse_dev_write,
@@ -2409,6 +2614,9 @@ const struct file_operations fuse_dev_operations = {
.fasync = fuse_dev_fasync,
.unlocked_ioctl = fuse_dev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
+#ifdef CONFIG_FUSE_IO_URING
+ .uring_cmd = fuse_uring_cmd,
+#endif
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
new file mode 100644
index 000000000000..accdce2977c5
--- /dev/null
+++ b/fs/fuse/dev_uring.c
@@ -0,0 +1,1352 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (c) 2023-2024 DataDirect Networks.
+ */
+
+#include "fuse_i.h"
+#include "dev_uring_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/fs.h>
+#include <linux/io_uring/cmd.h>
+
+static bool __read_mostly enable_uring;
+module_param(enable_uring, bool, 0644);
+MODULE_PARM_DESC(enable_uring,
+ "Enable userspace communication through io-uring");
+
+#define FUSE_URING_IOV_SEGS 2 /* header and payload */
+
+
+bool fuse_uring_enabled(void)
+{
+ return enable_uring;
+}
+
+struct fuse_uring_pdu {
+ struct fuse_ring_ent *ent;
+};
+
+static const struct fuse_iqueue_ops fuse_io_uring_ops;
+
+static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
+ struct fuse_ring_ent *ring_ent)
+{
+ struct fuse_uring_pdu *pdu =
+ io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);
+
+ pdu->ent = ring_ent;
+}
+
+static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd)
+{
+ struct fuse_uring_pdu *pdu =
+ io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);
+
+ return pdu->ent;
+}
+
+static void fuse_uring_flush_bg(struct fuse_ring_queue *queue)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+
+ lockdep_assert_held(&queue->lock);
+ lockdep_assert_held(&fc->bg_lock);
+
+ /*
+ * Allow one bg request per queue, ignoring global fc limits.
+ * This prevents a single queue from consuming all resources and
+ * eliminates the need for remote queue wake-ups when global
+ * limits are met but this queue has no more waiting requests.
+ */
+ while ((fc->active_background < fc->max_background ||
+ !queue->active_background) &&
+ (!list_empty(&queue->fuse_req_bg_queue))) {
+ struct fuse_req *req;
+
+ req = list_first_entry(&queue->fuse_req_bg_queue,
+ struct fuse_req, list);
+ fc->active_background++;
+ queue->active_background++;
+
+ list_move_tail(&req->list, &queue->fuse_req_queue);
+ }
+}
+
+static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
+ int error)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+
+ lockdep_assert_not_held(&queue->lock);
+ spin_lock(&queue->lock);
+ ent->fuse_req = NULL;
+ if (test_bit(FR_BACKGROUND, &req->flags)) {
+ queue->active_background--;
+ spin_lock(&fc->bg_lock);
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+ }
+
+ spin_unlock(&queue->lock);
+
+ if (error)
+ req->out.h.error = error;
+
+ clear_bit(FR_SENT, &req->flags);
+ fuse_request_end(req);
+}
+
+/* Abort all list queued request on the given ring queue */
+static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue)
+{
+ struct fuse_req *req;
+ LIST_HEAD(req_list);
+
+ spin_lock(&queue->lock);
+ list_for_each_entry(req, &queue->fuse_req_queue, list)
+ clear_bit(FR_PENDING, &req->flags);
+ list_splice_init(&queue->fuse_req_queue, &req_list);
+ spin_unlock(&queue->lock);
+
+ /* must not hold queue lock to avoid order issues with fi->lock */
+ fuse_dev_end_requests(&req_list);
+}
+
+void fuse_uring_abort_end_requests(struct fuse_ring *ring)
+{
+ int qid;
+ struct fuse_ring_queue *queue;
+ struct fuse_conn *fc = ring->fc;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ queue = READ_ONCE(ring->queues[qid]);
+ if (!queue)
+ continue;
+
+ queue->stopped = true;
+
+ WARN_ON_ONCE(ring->fc->max_background != UINT_MAX);
+ spin_lock(&queue->lock);
+ spin_lock(&fc->bg_lock);
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+ spin_unlock(&queue->lock);
+ fuse_uring_abort_end_queue_requests(queue);
+ }
+}
+
+bool fuse_uring_request_expired(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ int qid;
+
+ if (!ring)
+ return false;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ queue = READ_ONCE(ring->queues[qid]);
+ if (!queue)
+ continue;
+
+ spin_lock(&queue->lock);
+ if (fuse_request_expired(fc, &queue->fuse_req_queue) ||
+ fuse_request_expired(fc, &queue->fuse_req_bg_queue) ||
+ fuse_fpq_processing_expired(fc, queue->fpq.processing)) {
+ spin_unlock(&queue->lock);
+ return true;
+ }
+ spin_unlock(&queue->lock);
+ }
+
+ return false;
+}
+
+void fuse_uring_destruct(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+ int qid;
+
+ if (!ring)
+ return;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = ring->queues[qid];
+ struct fuse_ring_ent *ent, *next;
+
+ if (!queue)
+ continue;
+
+ WARN_ON(!list_empty(&queue->ent_avail_queue));
+ WARN_ON(!list_empty(&queue->ent_w_req_queue));
+ WARN_ON(!list_empty(&queue->ent_commit_queue));
+ WARN_ON(!list_empty(&queue->ent_in_userspace));
+
+ list_for_each_entry_safe(ent, next, &queue->ent_released,
+ list) {
+ list_del_init(&ent->list);
+ kfree(ent);
+ }
+
+ kfree(queue->fpq.processing);
+ kfree(queue);
+ ring->queues[qid] = NULL;
+ }
+
+ kfree(ring->queues);
+ kfree(ring);
+ fc->ring = NULL;
+}
+
+/*
+ * Basic ring setup for this connection based on the provided configuration
+ */
+static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring;
+ size_t nr_queues = num_possible_cpus();
+ struct fuse_ring *res = NULL;
+ size_t max_payload_size;
+
+ ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
+ if (!ring)
+ return NULL;
+
+ ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *),
+ GFP_KERNEL_ACCOUNT);
+ if (!ring->queues)
+ goto out_err;
+
+ max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write);
+ max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE);
+
+ spin_lock(&fc->lock);
+ if (fc->ring) {
+ /* race, another thread created the ring in the meantime */
+ spin_unlock(&fc->lock);
+ res = fc->ring;
+ goto out_err;
+ }
+
+ init_waitqueue_head(&ring->stop_waitq);
+
+ ring->nr_queues = nr_queues;
+ ring->fc = fc;
+ ring->max_payload_sz = max_payload_size;
+ smp_store_release(&fc->ring, ring);
+
+ spin_unlock(&fc->lock);
+ return ring;
+
+out_err:
+ kfree(ring->queues);
+ kfree(ring);
+ return res;
+}
+
+static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
+ int qid)
+{
+ struct fuse_conn *fc = ring->fc;
+ struct fuse_ring_queue *queue;
+ struct list_head *pq;
+
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
+ if (!queue)
+ return NULL;
+ pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
+ if (!pq) {
+ kfree(queue);
+ return NULL;
+ }
+
+ queue->qid = qid;
+ queue->ring = ring;
+ spin_lock_init(&queue->lock);
+
+ INIT_LIST_HEAD(&queue->ent_avail_queue);
+ INIT_LIST_HEAD(&queue->ent_commit_queue);
+ INIT_LIST_HEAD(&queue->ent_w_req_queue);
+ INIT_LIST_HEAD(&queue->ent_in_userspace);
+ INIT_LIST_HEAD(&queue->fuse_req_queue);
+ INIT_LIST_HEAD(&queue->fuse_req_bg_queue);
+ INIT_LIST_HEAD(&queue->ent_released);
+
+ queue->fpq.processing = pq;
+ fuse_pqueue_init(&queue->fpq);
+
+ spin_lock(&fc->lock);
+ if (ring->queues[qid]) {
+ spin_unlock(&fc->lock);
+ kfree(queue->fpq.processing);
+ kfree(queue);
+ return ring->queues[qid];
+ }
+
+ /*
+ * write_once and lock as the caller mostly doesn't take the lock at all
+ */
+ WRITE_ONCE(ring->queues[qid], queue);
+ spin_unlock(&fc->lock);
+
+ return queue;
+}
+
+static void fuse_uring_stop_fuse_req_end(struct fuse_req *req)
+{
+ clear_bit(FR_SENT, &req->flags);
+ req->out.h.error = -ECONNABORTED;
+ fuse_request_end(req);
+}
+
+/*
+ * Release a request/entry on connection tear down
+ */
+static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent)
+{
+ struct fuse_req *req;
+ struct io_uring_cmd *cmd;
+
+ struct fuse_ring_queue *queue = ent->queue;
+
+ spin_lock(&queue->lock);
+ cmd = ent->cmd;
+ ent->cmd = NULL;
+ req = ent->fuse_req;
+ ent->fuse_req = NULL;
+ if (req) {
+ /* remove entry from queue->fpq->processing */
+ list_del_init(&req->list);
+ }
+
+ /*
+ * The entry must not be freed immediately, due to access of direct
+ * pointer access of entries through IO_URING_F_CANCEL - there is a risk
+ * of race between daemon termination (which triggers IO_URING_F_CANCEL
+ * and accesses entries without checking the list state first
+ */
+ list_move(&ent->list, &queue->ent_released);
+ ent->state = FRRS_RELEASED;
+ spin_unlock(&queue->lock);
+
+ if (cmd)
+ io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED);
+
+ if (req)
+ fuse_uring_stop_fuse_req_end(req);
+}
+
+static void fuse_uring_stop_list_entries(struct list_head *head,
+ struct fuse_ring_queue *queue,
+ enum fuse_ring_req_state exp_state)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_ring_ent *ent, *next;
+ ssize_t queue_refs = SSIZE_MAX;
+ LIST_HEAD(to_teardown);
+
+ spin_lock(&queue->lock);
+ list_for_each_entry_safe(ent, next, head, list) {
+ if (ent->state != exp_state) {
+ pr_warn("entry teardown qid=%d state=%d expected=%d",
+ queue->qid, ent->state, exp_state);
+ continue;
+ }
+
+ ent->state = FRRS_TEARDOWN;
+ list_move(&ent->list, &to_teardown);
+ }
+ spin_unlock(&queue->lock);
+
+ /* no queue lock to avoid lock order issues */
+ list_for_each_entry_safe(ent, next, &to_teardown, list) {
+ fuse_uring_entry_teardown(ent);
+ queue_refs = atomic_dec_return(&ring->queue_refs);
+ WARN_ON_ONCE(queue_refs < 0);
+ }
+}
+
+static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue)
+{
+ fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue,
+ FRRS_USERSPACE);
+ fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue,
+ FRRS_AVAILABLE);
+}
+
+/*
+ * Log state debug info
+ */
+static void fuse_uring_log_ent_state(struct fuse_ring *ring)
+{
+ int qid;
+ struct fuse_ring_ent *ent;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = ring->queues[qid];
+
+ if (!queue)
+ continue;
+
+ spin_lock(&queue->lock);
+ /*
+ * Log entries from the intermediate queue, the other queues
+ * should be empty
+ */
+ list_for_each_entry(ent, &queue->ent_w_req_queue, list) {
+ pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n",
+ ring, qid, ent, ent->state);
+ }
+ list_for_each_entry(ent, &queue->ent_commit_queue, list) {
+ pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n",
+ ring, qid, ent, ent->state);
+ }
+ spin_unlock(&queue->lock);
+ }
+ ring->stop_debug_log = 1;
+}
+
+static void fuse_uring_async_stop_queues(struct work_struct *work)
+{
+ int qid;
+ struct fuse_ring *ring =
+ container_of(work, struct fuse_ring, async_teardown_work.work);
+
+ /* XXX code dup */
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
+
+ if (!queue)
+ continue;
+
+ fuse_uring_teardown_entries(queue);
+ }
+
+ /*
+ * Some ring entries might be in the middle of IO operations,
+ * i.e. in process to get handled by file_operations::uring_cmd
+ * or on the way to userspace - we could handle that with conditions in
+ * run time code, but easier/cleaner to have an async tear down handler
+ * If there are still queue references left
+ */
+ if (atomic_read(&ring->queue_refs) > 0) {
+ if (time_after(jiffies,
+ ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT))
+ fuse_uring_log_ent_state(ring);
+
+ schedule_delayed_work(&ring->async_teardown_work,
+ FUSE_URING_TEARDOWN_INTERVAL);
+ } else {
+ wake_up_all(&ring->stop_waitq);
+ }
+}
+
+/*
+ * Stop the ring queues
+ */
+void fuse_uring_stop_queues(struct fuse_ring *ring)
+{
+ int qid;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
+
+ if (!queue)
+ continue;
+
+ fuse_uring_teardown_entries(queue);
+ }
+
+ if (atomic_read(&ring->queue_refs) > 0) {
+ ring->teardown_time = jiffies;
+ INIT_DELAYED_WORK(&ring->async_teardown_work,
+ fuse_uring_async_stop_queues);
+ schedule_delayed_work(&ring->async_teardown_work,
+ FUSE_URING_TEARDOWN_INTERVAL);
+ } else {
+ wake_up_all(&ring->stop_waitq);
+ }
+}
+
+/*
+ * Handle IO_URING_F_CANCEL, typically should come on daemon termination.
+ *
+ * Releasing the last entry should trigger fuse_dev_release() if
+ * the daemon was terminated
+ */
+static void fuse_uring_cancel(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
+ struct fuse_ring_queue *queue;
+ bool need_cmd_done = false;
+
+ /*
+ * direct access on ent - it must not be destructed as long as
+ * IO_URING_F_CANCEL might come up
+ */
+ queue = ent->queue;
+ spin_lock(&queue->lock);
+ if (ent->state == FRRS_AVAILABLE) {
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ need_cmd_done = true;
+ ent->cmd = NULL;
+ }
+ spin_unlock(&queue->lock);
+
+ if (need_cmd_done) {
+ /* no queue lock to avoid lock order issues */
+ io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
+ }
+}
+
+static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
+ struct fuse_ring_ent *ring_ent)
+{
+ uring_cmd_set_ring_ent(cmd, ring_ent);
+ io_uring_cmd_mark_cancelable(cmd, issue_flags);
+}
+
+/*
+ * Checks for errors and stores it into the request
+ */
+static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
+ struct fuse_req *req,
+ struct fuse_conn *fc)
+{
+ int err;
+
+ err = -EINVAL;
+ if (oh->unique == 0) {
+ /* Not supported through io-uring yet */
+ pr_warn_once("notify through fuse-io-uring not supported\n");
+ goto err;
+ }
+
+ if (oh->error <= -ERESTARTSYS || oh->error > 0)
+ goto err;
+
+ if (oh->error) {
+ err = oh->error;
+ goto err;
+ }
+
+ err = -ENOENT;
+ if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) {
+ pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n",
+ req->in.h.unique,
+ oh->unique & ~FUSE_INT_REQ_BIT);
+ goto err;
+ }
+
+ /*
+ * Is it an interrupt reply ID?
+ * XXX: Not supported through fuse-io-uring yet, it should not even
+ * find the request - should not happen.
+ */
+ WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT);
+
+ err = 0;
+err:
+ return err;
+}
+
+static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
+ struct fuse_req *req,
+ struct fuse_ring_ent *ent)
+{
+ struct fuse_copy_state cs;
+ struct fuse_args *args = req->args;
+ struct iov_iter iter;
+ int err;
+ struct fuse_uring_ent_in_out ring_in_out;
+
+ err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out,
+ sizeof(ring_in_out));
+ if (err)
+ return -EFAULT;
+
+ err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz,
+ &iter);
+ if (err)
+ return err;
+
+ fuse_copy_init(&cs, 0, &iter);
+ cs.is_uring = 1;
+ cs.req = req;
+
+ return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
+}
+
+ /*
+ * Copy data from the req to the ring buffer
+ */
+static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
+ struct fuse_ring_ent *ent)
+{
+ struct fuse_copy_state cs;
+ struct fuse_args *args = req->args;
+ struct fuse_in_arg *in_args = args->in_args;
+ int num_args = args->in_numargs;
+ int err;
+ struct iov_iter iter;
+ struct fuse_uring_ent_in_out ent_in_out = {
+ .flags = 0,
+ .commit_id = req->in.h.unique,
+ };
+
+ err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter);
+ if (err) {
+ pr_info_ratelimited("fuse: Import of user buffer failed\n");
+ return err;
+ }
+
+ fuse_copy_init(&cs, 1, &iter);
+ cs.is_uring = 1;
+ cs.req = req;
+
+ if (num_args > 0) {
+ /*
+ * Expectation is that the first argument is the per op header.
+ * Some op code have that as zero size.
+ */
+ if (args->in_args[0].size > 0) {
+ err = copy_to_user(&ent->headers->op_in, in_args->value,
+ in_args->size);
+ if (err) {
+ pr_info_ratelimited(
+ "Copying the header failed.\n");
+ return -EFAULT;
+ }
+ }
+ in_args++;
+ num_args--;
+ }
+
+ /* copy the payload */
+ err = fuse_copy_args(&cs, num_args, args->in_pages,
+ (struct fuse_arg *)in_args, 0);
+ if (err) {
+ pr_info_ratelimited("%s fuse_copy_args failed\n", __func__);
+ return err;
+ }
+
+ ent_in_out.payload_sz = cs.ring.copied_sz;
+ err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out,
+ sizeof(ent_in_out));
+ return err ? -EFAULT : 0;
+}
+
+static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ int err;
+
+ err = -EIO;
+ if (WARN_ON(ent->state != FRRS_FUSE_REQ)) {
+ pr_err("qid=%d ring-req=%p invalid state %d on send\n",
+ queue->qid, ent, ent->state);
+ return err;
+ }
+
+ err = -EINVAL;
+ if (WARN_ON(req->in.h.unique == 0))
+ return err;
+
+ /* copy the request */
+ err = fuse_uring_args_to_ring(ring, req, ent);
+ if (unlikely(err)) {
+ pr_info_ratelimited("Copy to ring failed: %d\n", err);
+ return err;
+ }
+
+ /* copy fuse_in_header */
+ err = copy_to_user(&ent->headers->in_out, &req->in.h,
+ sizeof(req->in.h));
+ if (err) {
+ err = -EFAULT;
+ return err;
+ }
+
+ return 0;
+}
+
+static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ int err;
+
+ err = fuse_uring_copy_to_ring(ent, req);
+ if (!err)
+ set_bit(FR_SENT, &req->flags);
+ else
+ fuse_uring_req_end(ent, req, err);
+
+ return err;
+}
+
+/*
+ * Write data to the ring buffer and send the request to userspace,
+ * userspace will read it
+ * This is comparable with classical read(/dev/fuse)
+ */
+static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
+ struct fuse_req *req,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ int err;
+ struct io_uring_cmd *cmd;
+
+ err = fuse_uring_prepare_send(ent, req);
+ if (err)
+ return err;
+
+ spin_lock(&queue->lock);
+ cmd = ent->cmd;
+ ent->cmd = NULL;
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ spin_unlock(&queue->lock);
+
+ io_uring_cmd_done(cmd, 0, 0, issue_flags);
+ return 0;
+}
+
+/*
+ * Make a ring entry available for fuse_req assignment
+ */
+static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
+ struct fuse_ring_queue *queue)
+{
+ WARN_ON_ONCE(!ent->cmd);
+ list_move(&ent->list, &queue->ent_avail_queue);
+ ent->state = FRRS_AVAILABLE;
+}
+
+/* Used to find the request on SQE commit */
+static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_pqueue *fpq = &queue->fpq;
+ unsigned int hash;
+
+ req->ring_entry = ent;
+ hash = fuse_req_hash(req->in.h.unique);
+ list_move_tail(&req->list, &fpq->processing[hash]);
+}
+
+/*
+ * Assign a fuse queue entry to the given entry
+ */
+static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ lockdep_assert_held(&queue->lock);
+
+ if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE &&
+ ent->state != FRRS_COMMIT)) {
+ pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid,
+ ent->state);
+ }
+
+ clear_bit(FR_PENDING, &req->flags);
+ ent->fuse_req = req;
+ ent->state = FRRS_FUSE_REQ;
+ list_move(&ent->list, &queue->ent_w_req_queue);
+ fuse_uring_add_to_pq(ent, req);
+}
+
+/* Fetch the next fuse request if available */
+static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent)
+ __must_hold(&queue->lock)
+{
+ struct fuse_req *req;
+ struct fuse_ring_queue *queue = ent->queue;
+ struct list_head *req_queue = &queue->fuse_req_queue;
+
+ lockdep_assert_held(&queue->lock);
+
+ /* get and assign the next entry while it is still holding the lock */
+ req = list_first_entry_or_null(req_queue, struct fuse_req, list);
+ if (req)
+ fuse_uring_add_req_to_ring_ent(ent, req);
+
+ return req;
+}
+
+/*
+ * Read data from the ring buffer, which user space has written to
+ * This is comparible with handling of classical write(/dev/fuse).
+ * Also make the ring request available again for new fuse requests.
+ */
+static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
+ unsigned int issue_flags)
+{
+ struct fuse_ring *ring = ent->queue->ring;
+ struct fuse_conn *fc = ring->fc;
+ ssize_t err = 0;
+
+ err = copy_from_user(&req->out.h, &ent->headers->in_out,
+ sizeof(req->out.h));
+ if (err) {
+ req->out.h.error = -EFAULT;
+ goto out;
+ }
+
+ err = fuse_uring_out_header_has_err(&req->out.h, req, fc);
+ if (err) {
+ /* req->out.h.error already set */
+ goto out;
+ }
+
+ err = fuse_uring_copy_from_ring(ring, req, ent);
+out:
+ fuse_uring_req_end(ent, req, err);
+}
+
+/*
+ * Get the next fuse req and send it
+ */
+static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent,
+ struct fuse_ring_queue *queue,
+ unsigned int issue_flags)
+{
+ int err;
+ struct fuse_req *req;
+
+retry:
+ spin_lock(&queue->lock);
+ fuse_uring_ent_avail(ent, queue);
+ req = fuse_uring_ent_assign_req(ent);
+ spin_unlock(&queue->lock);
+
+ if (req) {
+ err = fuse_uring_send_next_to_ring(ent, req, issue_flags);
+ if (err)
+ goto retry;
+ }
+}
+
+static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ lockdep_assert_held(&queue->lock);
+
+ if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE))
+ return -EIO;
+
+ ent->state = FRRS_COMMIT;
+ list_move(&ent->list, &queue->ent_commit_queue);
+
+ return 0;
+}
+
+/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */
+static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
+ struct fuse_conn *fc)
+{
+ const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ struct fuse_ring_ent *ent;
+ int err;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ uint64_t commit_id = READ_ONCE(cmd_req->commit_id);
+ unsigned int qid = READ_ONCE(cmd_req->qid);
+ struct fuse_pqueue *fpq;
+ struct fuse_req *req;
+
+ err = -ENOTCONN;
+ if (!ring)
+ return err;
+
+ if (qid >= ring->nr_queues)
+ return -EINVAL;
+
+ queue = ring->queues[qid];
+ if (!queue)
+ return err;
+ fpq = &queue->fpq;
+
+ if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped))
+ return err;
+
+ spin_lock(&queue->lock);
+ /* Find a request based on the unique ID of the fuse request
+ * This should get revised, as it needs a hash calculation and list
+ * search. And full struct fuse_pqueue is needed (memory overhead).
+ * As well as the link from req to ring_ent.
+ */
+ req = fuse_request_find(fpq, commit_id);
+ err = -ENOENT;
+ if (!req) {
+ pr_info("qid=%d commit_id %llu not found\n", queue->qid,
+ commit_id);
+ spin_unlock(&queue->lock);
+ return err;
+ }
+ list_del_init(&req->list);
+ ent = req->ring_entry;
+ req->ring_entry = NULL;
+
+ err = fuse_ring_ent_set_commit(ent);
+ if (err != 0) {
+ pr_info_ratelimited("qid=%d commit_id %llu state %d",
+ queue->qid, commit_id, ent->state);
+ spin_unlock(&queue->lock);
+ req->out.h.error = err;
+ clear_bit(FR_SENT, &req->flags);
+ fuse_request_end(req);
+ return err;
+ }
+
+ ent->cmd = cmd;
+ spin_unlock(&queue->lock);
+
+ /* without the queue lock, as other locks are taken */
+ fuse_uring_prepare_cancel(cmd, issue_flags, ent);
+ fuse_uring_commit(ent, req, issue_flags);
+
+ /*
+ * Fetching the next request is absolutely required as queued
+ * fuse requests would otherwise not get processed - committing
+ * and fetching is done in one step vs legacy fuse, which has separated
+ * read (fetch request) and write (commit result).
+ */
+ fuse_uring_next_fuse_req(ent, queue, issue_flags);
+ return 0;
+}
+
+static bool is_ring_ready(struct fuse_ring *ring, int current_qid)
+{
+ int qid;
+ struct fuse_ring_queue *queue;
+ bool ready = true;
+
+ for (qid = 0; qid < ring->nr_queues && ready; qid++) {
+ if (current_qid == qid)
+ continue;
+
+ queue = ring->queues[qid];
+ if (!queue) {
+ ready = false;
+ break;
+ }
+
+ spin_lock(&queue->lock);
+ if (list_empty(&queue->ent_avail_queue))
+ ready = false;
+ spin_unlock(&queue->lock);
+ }
+
+ return ready;
+}
+
+/*
+ * fuse_uring_req_fetch command handling
+ */
+static void fuse_uring_do_register(struct fuse_ring_ent *ent,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ fuse_uring_prepare_cancel(cmd, issue_flags, ent);
+
+ spin_lock(&queue->lock);
+ ent->cmd = cmd;
+ fuse_uring_ent_avail(ent, queue);
+ spin_unlock(&queue->lock);
+
+ if (!ring->ready) {
+ bool ready = is_ring_ready(ring, queue->qid);
+
+ if (ready) {
+ WRITE_ONCE(fiq->ops, &fuse_io_uring_ops);
+ WRITE_ONCE(ring->ready, true);
+ wake_up_all(&fc->blocked_waitq);
+ }
+ }
+}
+
+/*
+ * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1]
+ * the payload
+ */
+static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
+ struct iovec iov[FUSE_URING_IOV_SEGS])
+{
+ struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ struct iov_iter iter;
+ ssize_t ret;
+
+ if (sqe->len != FUSE_URING_IOV_SEGS)
+ return -EINVAL;
+
+ /*
+ * Direction for buffer access will actually be READ and WRITE,
+ * using write for the import should include READ access as well.
+ */
+ ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS,
+ FUSE_URING_IOV_SEGS, &iov, &iter);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static struct fuse_ring_ent *
+fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
+ struct fuse_ring_queue *queue)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_ring_ent *ent;
+ size_t payload_size;
+ struct iovec iov[FUSE_URING_IOV_SEGS];
+ int err;
+
+ err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov);
+ if (err) {
+ pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n",
+ err);
+ return ERR_PTR(err);
+ }
+
+ err = -EINVAL;
+ if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) {
+ pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len);
+ return ERR_PTR(err);
+ }
+
+ payload_size = iov[1].iov_len;
+ if (payload_size < ring->max_payload_sz) {
+ pr_info_ratelimited("Invalid req payload len %zu\n",
+ payload_size);
+ return ERR_PTR(err);
+ }
+
+ err = -ENOMEM;
+ ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
+ if (!ent)
+ return ERR_PTR(err);
+
+ INIT_LIST_HEAD(&ent->list);
+
+ ent->queue = queue;
+ ent->headers = iov[0].iov_base;
+ ent->payload = iov[1].iov_base;
+
+ atomic_inc(&ring->queue_refs);
+ return ent;
+}
+
+/*
+ * Register header and payload buffer with the kernel and puts the
+ * entry as "ready to get fuse requests" on the queue
+ */
+static int fuse_uring_register(struct io_uring_cmd *cmd,
+ unsigned int issue_flags, struct fuse_conn *fc)
+{
+ const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ struct fuse_ring *ring = smp_load_acquire(&fc->ring);
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent;
+ int err;
+ unsigned int qid = READ_ONCE(cmd_req->qid);
+
+ err = -ENOMEM;
+ if (!ring) {
+ ring = fuse_uring_create(fc);
+ if (!ring)
+ return err;
+ }
+
+ if (qid >= ring->nr_queues) {
+ pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid);
+ return -EINVAL;
+ }
+
+ queue = ring->queues[qid];
+ if (!queue) {
+ queue = fuse_uring_create_queue(ring, qid);
+ if (!queue)
+ return err;
+ }
+
+ /*
+ * The created queue above does not need to be destructed in
+ * case of entry errors below, will be done at ring destruction time.
+ */
+
+ ent = fuse_uring_create_ring_ent(cmd, queue);
+ if (IS_ERR(ent))
+ return PTR_ERR(ent);
+
+ fuse_uring_do_register(ent, cmd, issue_flags);
+
+ return 0;
+}
+
+/*
+ * Entry function from io_uring to handle the given passthrough command
+ * (op code IORING_OP_URING_CMD)
+ */
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ u32 cmd_op = cmd->cmd_op;
+ int err;
+
+ if ((unlikely(issue_flags & IO_URING_F_CANCEL))) {
+ fuse_uring_cancel(cmd, issue_flags);
+ return 0;
+ }
+
+ /* This extra SQE size holds struct fuse_uring_cmd_req */
+ if (!(issue_flags & IO_URING_F_SQE128))
+ return -EINVAL;
+
+ fud = fuse_get_dev(cmd->file);
+ if (!fud) {
+ pr_info_ratelimited("No fuse device found\n");
+ return -ENOTCONN;
+ }
+ fc = fud->fc;
+
+ /* Once a connection has io-uring enabled on it, it can't be disabled */
+ if (!enable_uring && !fc->io_uring) {
+ pr_info_ratelimited("fuse-io-uring is disabled\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (fc->aborted)
+ return -ECONNABORTED;
+ if (!fc->connected)
+ return -ENOTCONN;
+
+ /*
+ * fuse_uring_register() needs the ring to be initialized,
+ * we need to know the max payload size
+ */
+ if (!fc->initialized)
+ return -EAGAIN;
+
+ switch (cmd_op) {
+ case FUSE_IO_URING_CMD_REGISTER:
+ err = fuse_uring_register(cmd, issue_flags, fc);
+ if (err) {
+ pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n",
+ err);
+ fc->io_uring = 0;
+ wake_up_all(&fc->blocked_waitq);
+ return err;
+ }
+ break;
+ case FUSE_IO_URING_CMD_COMMIT_AND_FETCH:
+ err = fuse_uring_commit_fetch(cmd, issue_flags, fc);
+ if (err) {
+ pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n",
+ err);
+ return err;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return -EIOCBQUEUED;
+}
+
+static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
+ ssize_t ret, unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ spin_lock(&queue->lock);
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ ent->cmd = NULL;
+ spin_unlock(&queue->lock);
+
+ io_uring_cmd_done(cmd, ret, 0, issue_flags);
+}
+
+/*
+ * This prepares and sends the ring request in fuse-uring task context.
+ * User buffers are not mapped yet - the application does not have permission
+ * to write to it - this has to be executed in ring task context.
+ */
+static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
+ struct fuse_ring_queue *queue = ent->queue;
+ int err;
+
+ if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
+ err = fuse_uring_prepare_send(ent, ent->fuse_req);
+ if (err) {
+ fuse_uring_next_fuse_req(ent, queue, issue_flags);
+ return;
+ }
+ } else {
+ err = -ECANCELED;
+ }
+
+ fuse_uring_send(ent, cmd, err, issue_flags);
+}
+
+static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
+{
+ unsigned int qid;
+ struct fuse_ring_queue *queue;
+
+ qid = task_cpu(current);
+
+ if (WARN_ONCE(qid >= ring->nr_queues,
+ "Core number (%u) exceeds nr queues (%zu)\n", qid,
+ ring->nr_queues))
+ qid = 0;
+
+ queue = ring->queues[qid];
+ WARN_ONCE(!queue, "Missing queue for qid %d\n", qid);
+
+ return queue;
+}
+
+static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent)
+{
+ struct io_uring_cmd *cmd = ent->cmd;
+
+ uring_cmd_set_ring_ent(cmd, ent);
+ io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task);
+}
+
+/* queue a fuse request and send it if a ring entry is available */
+void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent = NULL;
+ int err;
+
+ err = -EINVAL;
+ queue = fuse_uring_task_to_queue(ring);
+ if (!queue)
+ goto err;
+
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique(fiq);
+
+ spin_lock(&queue->lock);
+ err = -ENOTCONN;
+ if (unlikely(queue->stopped))
+ goto err_unlock;
+
+ set_bit(FR_URING, &req->flags);
+ req->ring_queue = queue;
+ ent = list_first_entry_or_null(&queue->ent_avail_queue,
+ struct fuse_ring_ent, list);
+ if (ent)
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ else
+ list_add_tail(&req->list, &queue->fuse_req_queue);
+ spin_unlock(&queue->lock);
+
+ if (ent)
+ fuse_uring_dispatch_ent(ent);
+
+ return;
+
+err_unlock:
+ spin_unlock(&queue->lock);
+err:
+ req->out.h.error = err;
+ clear_bit(FR_PENDING, &req->flags);
+ fuse_request_end(req);
+}
+
+bool fuse_uring_queue_bq_req(struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent = NULL;
+
+ queue = fuse_uring_task_to_queue(ring);
+ if (!queue)
+ return false;
+
+ spin_lock(&queue->lock);
+ if (unlikely(queue->stopped)) {
+ spin_unlock(&queue->lock);
+ return false;
+ }
+
+ set_bit(FR_URING, &req->flags);
+ req->ring_queue = queue;
+ list_add_tail(&req->list, &queue->fuse_req_bg_queue);
+
+ ent = list_first_entry_or_null(&queue->ent_avail_queue,
+ struct fuse_ring_ent, list);
+ spin_lock(&fc->bg_lock);
+ fc->num_background++;
+ if (fc->num_background == fc->max_background)
+ fc->blocked = 1;
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+
+ /*
+ * Due to bg_queue flush limits there might be other bg requests
+ * in the queue that need to be handled first. Or no further req
+ * might be available.
+ */
+ req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req,
+ list);
+ if (ent && req) {
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ spin_unlock(&queue->lock);
+
+ fuse_uring_dispatch_ent(ent);
+ } else {
+ spin_unlock(&queue->lock);
+ }
+
+ return true;
+}
+
+bool fuse_uring_remove_pending_req(struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = req->ring_queue;
+
+ return fuse_remove_pending_req(req, &queue->lock);
+}
+
+static const struct fuse_iqueue_ops fuse_io_uring_ops = {
+ /* should be send over io-uring as enhancement */
+ .send_forget = fuse_dev_queue_forget,
+
+ /*
+ * could be send over io-uring, but interrupts should be rare,
+ * no need to make the code complex
+ */
+ .send_interrupt = fuse_dev_queue_interrupt,
+ .send_req = fuse_uring_queue_fuse_req,
+};
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
new file mode 100644
index 000000000000..51a563922ce1
--- /dev/null
+++ b/fs/fuse/dev_uring_i.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * FUSE: Filesystem in Userspace
+ * Copyright (c) 2023-2024 DataDirect Networks.
+ */
+
+#ifndef _FS_FUSE_DEV_URING_I_H
+#define _FS_FUSE_DEV_URING_I_H
+
+#include "fuse_i.h"
+
+#ifdef CONFIG_FUSE_IO_URING
+
+#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ)
+#define FUSE_URING_TEARDOWN_INTERVAL (HZ/20)
+
+enum fuse_ring_req_state {
+ FRRS_INVALID = 0,
+
+ /* The ring entry received from userspace and it is being processed */
+ FRRS_COMMIT,
+
+ /* The ring entry is waiting for new fuse requests */
+ FRRS_AVAILABLE,
+
+ /* The ring entry got assigned a fuse req */
+ FRRS_FUSE_REQ,
+
+ /* The ring entry is in or on the way to user space */
+ FRRS_USERSPACE,
+
+ /* The ring entry is in teardown */
+ FRRS_TEARDOWN,
+
+ /* The ring entry is released, but not freed yet */
+ FRRS_RELEASED,
+};
+
+/** A fuse ring entry, part of the ring queue */
+struct fuse_ring_ent {
+ /* userspace buffer */
+ struct fuse_uring_req_header __user *headers;
+ void __user *payload;
+
+ /* the ring queue that owns the request */
+ struct fuse_ring_queue *queue;
+
+ /* fields below are protected by queue->lock */
+
+ struct io_uring_cmd *cmd;
+
+ struct list_head list;
+
+ enum fuse_ring_req_state state;
+
+ struct fuse_req *fuse_req;
+};
+
+struct fuse_ring_queue {
+ /*
+ * back pointer to the main fuse uring structure that holds this
+ * queue
+ */
+ struct fuse_ring *ring;
+
+ /* queue id, corresponds to the cpu core */
+ unsigned int qid;
+
+ /*
+ * queue lock, taken when any value in the queue changes _and_ also
+ * a ring entry state changes.
+ */
+ spinlock_t lock;
+
+ /* available ring entries (struct fuse_ring_ent) */
+ struct list_head ent_avail_queue;
+
+ /*
+ * entries in the process of being committed or in the process
+ * to be sent to userspace
+ */
+ struct list_head ent_w_req_queue;
+ struct list_head ent_commit_queue;
+
+ /* entries in userspace */
+ struct list_head ent_in_userspace;
+
+ /* entries that are released */
+ struct list_head ent_released;
+
+ /* fuse requests waiting for an entry slot */
+ struct list_head fuse_req_queue;
+
+ /* background fuse requests */
+ struct list_head fuse_req_bg_queue;
+
+ struct fuse_pqueue fpq;
+
+ unsigned int active_background;
+
+ bool stopped;
+};
+
+/**
+ * Describes if uring is for communication and holds alls the data needed
+ * for uring communication
+ */
+struct fuse_ring {
+ /* back pointer */
+ struct fuse_conn *fc;
+
+ /* number of ring queues */
+ size_t nr_queues;
+
+ /* maximum payload/arg size */
+ size_t max_payload_sz;
+
+ struct fuse_ring_queue **queues;
+
+ /*
+ * Log ring entry states on stop when entries cannot be released
+ */
+ unsigned int stop_debug_log : 1;
+
+ wait_queue_head_t stop_waitq;
+
+ /* async tear down */
+ struct delayed_work async_teardown_work;
+
+ /* log */
+ unsigned long teardown_time;
+
+ atomic_t queue_refs;
+
+ bool ready;
+};
+
+bool fuse_uring_enabled(void);
+void fuse_uring_destruct(struct fuse_conn *fc);
+void fuse_uring_stop_queues(struct fuse_ring *ring);
+void fuse_uring_abort_end_requests(struct fuse_ring *ring);
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
+bool fuse_uring_queue_bq_req(struct fuse_req *req);
+bool fuse_uring_remove_pending_req(struct fuse_req *req);
+bool fuse_uring_request_expired(struct fuse_conn *fc);
+
+static inline void fuse_uring_abort(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+
+ if (ring == NULL)
+ return;
+
+ if (atomic_read(&ring->queue_refs) > 0) {
+ fuse_uring_abort_end_requests(ring);
+ fuse_uring_stop_queues(ring);
+ }
+}
+
+static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+
+ if (ring)
+ wait_event(ring->stop_waitq,
+ atomic_read(&ring->queue_refs) == 0);
+}
+
+static inline bool fuse_uring_ready(struct fuse_conn *fc)
+{
+ return fc->ring && fc->ring->ready;
+}
+
+#else /* CONFIG_FUSE_IO_URING */
+
+static inline void fuse_uring_destruct(struct fuse_conn *fc)
+{
+}
+
+static inline bool fuse_uring_enabled(void)
+{
+ return false;
+}
+
+static inline void fuse_uring_abort(struct fuse_conn *fc)
+{
+}
+
+static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc)
+{
+}
+
+static inline bool fuse_uring_ready(struct fuse_conn *fc)
+{
+ return false;
+}
+
+static inline bool fuse_uring_remove_pending_req(struct fuse_req *req)
+{
+ return false;
+}
+
+static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
+{
+ return false;
+}
+
+#endif /* CONFIG_FUSE_IO_URING */
+
+#endif /* _FS_FUSE_DEV_URING_I_H */
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4a6df591add6..7d7ed45cb3e9 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -175,9 +175,12 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
memset(outarg, 0, sizeof(struct fuse_entry_out));
args->opcode = FUSE_LOOKUP;
args->nodeid = nodeid;
- args->in_numargs = 1;
- args->in_args[0].size = name->len + 1;
- args->in_args[0].value = name->name;
+ args->in_numargs = 3;
+ fuse_set_zero_arg0(args);
+ args->in_args[1].size = name->len;
+ args->in_args[1].value = name->name;
+ args->in_args[2].size = 1;
+ args->in_args[2].value = "";
args->out_numargs = 1;
args->out_args[0].size = sizeof(struct fuse_entry_out);
args->out_args[0].value = outarg;
@@ -192,10 +195,10 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
* the lookup once more. If the lookup results in the same inode,
* then refresh the attributes, timeouts and mark the dentry valid.
*/
-static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
+static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *entry, unsigned int flags)
{
struct inode *inode;
- struct dentry *parent;
struct fuse_mount *fm;
struct fuse_inode *fi;
int ret;
@@ -227,11 +230,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
attr_version = fuse_get_attr_version(fm->fc);
- parent = dget_parent(entry);
- fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)),
- &entry->d_name, &outarg);
+ fuse_lookup_init(fm->fc, &args, get_node_id(dir),
+ name, &outarg);
ret = fuse_simple_request(fm, &args);
- dput(parent);
/* Zero nodeid is same as -ENOENT */
if (!ret && !outarg.nodeid)
ret = -ENOENT;
@@ -265,9 +266,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state))
return -ECHILD;
} else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
- parent = dget_parent(entry);
- fuse_advise_use_readdirplus(d_inode(parent));
- dput(parent);
+ fuse_advise_use_readdirplus(dir);
}
}
ret = 1;
@@ -320,9 +319,6 @@ static struct vfsmount *fuse_dentry_automount(struct path *path)
/* Create the submount */
mnt = fc_mount(fsc);
- if (!IS_ERR(mnt))
- mntget(mnt);
-
put_fs_context(fsc);
return mnt;
}
@@ -366,12 +362,12 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
- u64 attr_version;
+ u64 attr_version, evict_ctr;
int err;
*inode = NULL;
err = -ENAMETOOLONG;
- if (name->len > FUSE_NAME_MAX)
+ if (name->len > fm->fc->name_max)
goto out;
@@ -381,6 +377,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
goto out;
attr_version = fuse_get_attr_version(fm->fc);
+ evict_ctr = fuse_get_evict_ctr(fm->fc);
fuse_lookup_init(fm->fc, &args, nodeid, name, outarg);
err = fuse_simple_request(fm, &args);
@@ -398,7 +395,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
&outarg->attr, ATTR_TIMEOUT(outarg),
- attr_version);
+ attr_version, evict_ctr);
err = -ENOMEM;
if (!*inode) {
fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
@@ -466,29 +463,29 @@ static int get_security_context(struct dentry *entry, umode_t mode,
{
struct fuse_secctx *fctx;
struct fuse_secctx_header *header;
- void *ctx = NULL, *ptr;
- u32 ctxlen, total_len = sizeof(*header);
+ struct lsm_context lsmctx = { };
+ void *ptr;
+ u32 total_len = sizeof(*header);
int err, nr_ctx = 0;
- const char *name;
+ const char *name = NULL;
size_t namelen;
err = security_dentry_init_security(entry, mode, &entry->d_name,
- &name, &ctx, &ctxlen);
- if (err) {
- if (err != -EOPNOTSUPP)
- goto out_err;
- /* No LSM is supporting this security hook. Ignore error */
- ctxlen = 0;
- ctx = NULL;
- }
+ &name, &lsmctx);
+
+ /* If no LSM is supporting this security hook ignore error */
+ if (err && err != -EOPNOTSUPP)
+ goto out_err;
- if (ctxlen) {
+ if (lsmctx.len) {
nr_ctx = 1;
namelen = strlen(name) + 1;
err = -EIO;
- if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX))
+ if (WARN_ON(namelen > XATTR_NAME_MAX + 1 ||
+ lsmctx.len > S32_MAX))
goto out_err;
- total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen);
+ total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen +
+ lsmctx.len);
}
err = -ENOMEM;
@@ -501,19 +498,20 @@ static int get_security_context(struct dentry *entry, umode_t mode,
ptr += sizeof(*header);
if (nr_ctx) {
fctx = ptr;
- fctx->size = ctxlen;
+ fctx->size = lsmctx.len;
ptr += sizeof(*fctx);
strcpy(ptr, name);
ptr += namelen;
- memcpy(ptr, ctx, ctxlen);
+ memcpy(ptr, lsmctx.context, lsmctx.len);
}
ext->size = total_len;
ext->value = header;
err = 0;
out_err:
- kfree(ctx);
+ if (nr_ctx)
+ security_release_secctx(&lsmctx);
return err;
}
@@ -545,17 +543,21 @@ static u32 fuse_ext_size(size_t size)
/*
* This adds just a single supplementary group that matches the parent's group.
*/
-static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext)
+static int get_create_supp_group(struct mnt_idmap *idmap,
+ struct inode *dir,
+ struct fuse_in_arg *ext)
{
struct fuse_conn *fc = get_fuse_conn(dir);
struct fuse_ext_header *xh;
struct fuse_supp_groups *sg;
kgid_t kgid = dir->i_gid;
+ vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, kgid);
gid_t parent_gid = from_kgid(fc->user_ns, kgid);
+
u32 sg_len = fuse_ext_size(sizeof(*sg) + sizeof(sg->groups[0]));
- if (parent_gid == (gid_t) -1 || gid_eq(kgid, current_fsgid()) ||
- !in_group_p(kgid))
+ if (parent_gid == (gid_t) -1 || vfsgid_eq_kgid(vfsgid, current_fsgid()) ||
+ !vfsgid_in_group_p(vfsgid))
return 0;
xh = extend_arg(ext, sg_len);
@@ -572,7 +574,8 @@ static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext)
return 0;
}
-static int get_create_ext(struct fuse_args *args,
+static int get_create_ext(struct mnt_idmap *idmap,
+ struct fuse_args *args,
struct inode *dir, struct dentry *dentry,
umode_t mode)
{
@@ -583,7 +586,7 @@ static int get_create_ext(struct fuse_args *args,
if (fc->init_security)
err = get_security_context(dentry, mode, &ext);
if (!err && fc->create_supp_group)
- err = get_create_supp_group(dir, &ext);
+ err = get_create_supp_group(idmap, dir, &ext);
if (!err && ext.size) {
WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args));
@@ -609,9 +612,9 @@ static void free_ext_value(struct fuse_args *args)
* If the filesystem doesn't support this, then fall back to separate
* 'mknod' + 'open' requests.
*/
-static int fuse_create_open(struct inode *dir, struct dentry *entry,
- struct file *file, unsigned int flags,
- umode_t mode, u32 opcode)
+static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *entry, struct file *file,
+ unsigned int flags, umode_t mode, u32 opcode)
{
int err;
struct inode *inode;
@@ -668,11 +671,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.out_args[1].size = sizeof(*outopenp);
args.out_args[1].value = outopenp;
- err = get_create_ext(&args, dir, entry, mode);
+ err = get_create_ext(idmap, &args, dir, entry, mode);
if (err)
- goto out_put_forget_req;
+ goto out_free_ff;
- err = fuse_simple_request(fm, &args);
+ err = fuse_simple_idmap_request(idmap, fm, &args);
free_ext_value(&args);
if (err)
goto out_free_ff;
@@ -686,7 +689,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
ff->nodeid = outentry.nodeid;
ff->open_flags = outopenp->open_flags;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
- &outentry.attr, ATTR_TIMEOUT(&outentry), 0);
+ &outentry.attr, ATTR_TIMEOUT(&outentry), 0, 0);
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
fuse_sync_release(NULL, ff, flags);
@@ -729,6 +732,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
umode_t mode)
{
int err;
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
struct fuse_conn *fc = get_fuse_conn(dir);
struct dentry *res = NULL;
@@ -753,7 +757,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
if (fc->no_create)
goto mknod;
- err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE);
+ err = fuse_create_open(idmap, dir, entry, file, flags, mode, FUSE_CREATE);
if (err == -ENOSYS) {
fc->no_create = 1;
goto mknod;
@@ -764,7 +768,7 @@ out_dput:
return err;
mknod:
- err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0);
+ err = fuse_mknod(idmap, dir, entry, mode, 0);
if (err)
goto out_dput;
no_open:
@@ -774,9 +778,9 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
- struct inode *dir, struct dentry *entry,
- umode_t mode)
+static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_entry_out outarg;
struct inode *inode;
@@ -785,11 +789,11 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_forget_link *forget;
if (fuse_is_bad(dir))
- return -EIO;
+ return ERR_PTR(-EIO);
forget = fuse_alloc_forget();
if (!forget)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
memset(&outarg, 0, sizeof(outarg));
args->nodeid = get_node_id(dir);
@@ -798,12 +802,12 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
args->out_args[0].value = &outarg;
if (args->opcode != FUSE_LINK) {
- err = get_create_ext(args, dir, entry, mode);
+ err = get_create_ext(idmap, args, dir, entry, mode);
if (err)
goto out_put_forget_req;
}
- err = fuse_simple_request(fm, args);
+ err = fuse_simple_idmap_request(idmap, fm, args);
free_ext_value(args);
if (err)
goto out_put_forget_req;
@@ -816,32 +820,46 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
goto out_put_forget_req;
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
- &outarg.attr, ATTR_TIMEOUT(&outarg), 0);
+ &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0);
if (!inode) {
fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
kfree(forget);
d_drop(entry);
d = d_splice_alias(inode, entry);
if (IS_ERR(d))
- return PTR_ERR(d);
+ return d;
- if (d) {
+ if (d)
fuse_change_entry_timeout(d, &outarg);
- dput(d);
- } else {
+ else
fuse_change_entry_timeout(entry, &outarg);
- }
fuse_dir_changed(dir);
- return 0;
+ return d;
out_put_forget_req:
if (err == -EEXIST)
fuse_invalidate_entry(entry);
kfree(forget);
- return err;
+ return ERR_PTR(err);
+}
+
+static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
+{
+ /*
+ * Note that when creating anything other than a directory we
+ * can be sure create_new_entry() will NOT return an alternate
+ * dentry as d_splice_alias() only returns an alternate dentry
+ * for directories. So we don't need to check for that case
+ * when passing back the result.
+ */
+ WARN_ON_ONCE(S_ISDIR(mode));
+
+ return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode));
}
static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -864,13 +882,13 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fm, &args, dir, entry, mode);
+ return create_new_nondir(idmap, fm, &args, dir, entry, mode);
}
static int fuse_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *entry, umode_t mode, bool excl)
{
- return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0);
+ return fuse_mknod(idmap, dir, entry, mode, 0);
}
static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
@@ -882,7 +900,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
if (fc->no_tmpfile)
return -EOPNOTSUPP;
- err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE);
+ err = fuse_create_open(idmap, dir, file->f_path.dentry, file,
+ file->f_flags, mode, FUSE_TMPFILE);
if (err == -ENOSYS) {
fc->no_tmpfile = 1;
err = -EOPNOTSUPP;
@@ -890,8 +909,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
return err;
}
-static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *entry, umode_t mode)
+static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
struct fuse_mount *fm = get_fuse_mount(dir);
@@ -909,7 +928,7 @@ static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fm, &args, dir, entry, S_IFDIR);
+ return create_new_entry(idmap, fm, &args, dir, entry, S_IFDIR);
}
static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
@@ -920,12 +939,13 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
FUSE_ARGS(args);
args.opcode = FUSE_SYMLINK;
- args.in_numargs = 2;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
- args.in_args[1].size = len;
- args.in_args[1].value = link;
- return create_new_entry(fm, &args, dir, entry, S_IFLNK);
+ args.in_numargs = 3;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
+ args.in_args[2].size = len;
+ args.in_args[2].value = link;
+ return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK);
}
void fuse_flush_time_update(struct inode *inode)
@@ -984,9 +1004,10 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.opcode = FUSE_UNLINK;
args.nodeid = get_node_id(dir);
- args.in_numargs = 1;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
fuse_dir_changed(dir);
@@ -1007,9 +1028,10 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.opcode = FUSE_RMDIR;
args.nodeid = get_node_id(dir);
- args.in_numargs = 1;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
fuse_dir_changed(dir);
@@ -1019,7 +1041,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
return err;
}
-static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+static int fuse_rename_common(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent,
struct inode *newdir, struct dentry *newent,
unsigned int flags, int opcode, size_t argsize)
{
@@ -1040,7 +1062,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
args.in_args[1].value = oldent->d_name.name;
args.in_args[2].size = newent->d_name.len + 1;
args.in_args[2].value = newent->d_name.name;
- err = fuse_simple_request(fm, &args);
+ err = fuse_simple_idmap_request(idmap, fm, &args);
if (!err) {
/* ctime changes */
fuse_update_ctime(d_inode(oldent));
@@ -1086,7 +1108,8 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir,
if (fc->no_rename2 || fc->minor < 23)
return -EINVAL;
- err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+ err = fuse_rename_common((flags & RENAME_WHITEOUT) ? idmap : &invalid_mnt_idmap,
+ olddir, oldent, newdir, newent, flags,
FUSE_RENAME2,
sizeof(struct fuse_rename2_in));
if (err == -ENOSYS) {
@@ -1094,7 +1117,7 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir,
err = -EINVAL;
}
} else {
- err = fuse_rename_common(olddir, oldent, newdir, newent, 0,
+ err = fuse_rename_common(&invalid_mnt_idmap, olddir, oldent, newdir, newent, 0,
FUSE_RENAME,
sizeof(struct fuse_rename_in));
}
@@ -1111,6 +1134,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
+ if (fm->fc->no_link)
+ goto out;
+
memset(&inarg, 0, sizeof(inarg));
inarg.oldnodeid = get_node_id(inode);
args.opcode = FUSE_LINK;
@@ -1119,27 +1145,37 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
+ err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode);
if (!err)
fuse_update_ctime_in_cache(inode);
else if (err == -EINTR)
fuse_invalidate_attr(inode);
+ if (err == -ENOSYS)
+ fm->fc->no_link = 1;
+out:
+ if (fm->fc->no_link)
+ return -EPERM;
+
return err;
}
-static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
- struct kstat *stat)
+static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct fuse_attr *attr, struct kstat *stat)
{
unsigned int blkbits;
struct fuse_conn *fc = get_fuse_conn(inode);
+ vfsuid_t vfsuid = make_vfsuid(idmap, fc->user_ns,
+ make_kuid(fc->user_ns, attr->uid));
+ vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns,
+ make_kgid(fc->user_ns, attr->gid));
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
stat->nlink = attr->nlink;
- stat->uid = make_kuid(fc->user_ns, attr->uid);
- stat->gid = make_kgid(fc->user_ns, attr->gid);
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->i_rdev;
stat->atime.tv_sec = attr->atime;
stat->atime.tv_nsec = attr->atimensec;
@@ -1178,8 +1214,8 @@ static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr)
attr->blksize = sx->blksize;
}
-static int fuse_do_statx(struct inode *inode, struct file *file,
- struct kstat *stat)
+static int fuse_do_statx(struct mnt_idmap *idmap, struct inode *inode,
+ struct file *file, struct kstat *stat)
{
int err;
struct fuse_attr attr;
@@ -1232,15 +1268,15 @@ static int fuse_do_statx(struct inode *inode, struct file *file,
stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME);
stat->btime.tv_sec = sx->btime.tv_sec;
stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1);
- fuse_fillattr(inode, &attr, stat);
+ fuse_fillattr(idmap, inode, &attr, stat);
stat->result_mask |= STATX_TYPE;
}
return 0;
}
-static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
- struct file *file)
+static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct kstat *stat, struct file *file)
{
int err;
struct fuse_getattr_in inarg;
@@ -1279,15 +1315,15 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
ATTR_TIMEOUT(&outarg),
attr_version);
if (stat)
- fuse_fillattr(inode, &outarg.attr, stat);
+ fuse_fillattr(idmap, inode, &outarg.attr, stat);
}
}
return err;
}
-static int fuse_update_get_attr(struct inode *inode, struct file *file,
- struct kstat *stat, u32 request_mask,
- unsigned int flags)
+static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode,
+ struct file *file, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1318,16 +1354,17 @@ retry:
forget_all_cached_acls(inode);
/* Try statx if BTIME is requested */
if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) {
- err = fuse_do_statx(inode, file, stat);
+ err = fuse_do_statx(idmap, inode, file, stat);
if (err == -ENOSYS) {
fc->no_statx = 1;
+ err = 0;
goto retry;
}
} else {
- err = fuse_do_getattr(inode, stat, file);
+ err = fuse_do_getattr(idmap, inode, stat, file);
}
} else if (stat) {
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
if (test_bit(FUSE_I_BTIME, &fi->state)) {
@@ -1341,7 +1378,7 @@ retry:
int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
{
- return fuse_update_get_attr(inode, file, NULL, mask, 0);
+ return fuse_update_get_attr(&nop_mnt_idmap, inode, file, NULL, mask, 0);
}
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
@@ -1461,6 +1498,14 @@ static int fuse_access(struct inode *inode, int mask)
BUG_ON(mask & MAY_NOT_BLOCK);
+ /*
+ * We should not send FUSE_ACCESS to the userspace
+ * when idmapped mounts are enabled as for this case
+ * we have fc->default_permissions = 1 and access
+ * permission checks are done on the kernel side.
+ */
+ WARN_ON_ONCE(!(fm->sb->s_iflags & SB_I_NOIDMAP));
+
if (fm->fc->no_access)
return 0;
@@ -1485,7 +1530,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
return -ECHILD;
forget_all_cached_acls(inode);
- return fuse_do_getattr(inode, NULL, NULL);
+ return fuse_do_getattr(&nop_mnt_idmap, inode, NULL, NULL);
}
/*
@@ -1533,7 +1578,7 @@ static int fuse_permission(struct mnt_idmap *idmap,
}
if (fc->default_permissions) {
- err = generic_permission(&nop_mnt_idmap, inode, mask);
+ err = generic_permission(idmap, inode, mask);
/* If permission is denied, try to refresh file
attributes. This is also needed, because the root
@@ -1541,7 +1586,7 @@ static int fuse_permission(struct mnt_idmap *idmap,
if (err == -EACCES && !refreshed) {
err = fuse_perm_getattr(inode, mask);
if (!err)
- err = generic_permission(&nop_mnt_idmap,
+ err = generic_permission(idmap,
inode, mask);
}
@@ -1564,13 +1609,13 @@ static int fuse_permission(struct mnt_idmap *idmap,
return err;
}
-static int fuse_readlink_page(struct inode *inode, struct page *page)
+static int fuse_readlink_page(struct inode *inode, struct folio *folio)
{
struct fuse_mount *fm = get_fuse_mount(inode);
- struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
+ struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 };
struct fuse_args_pages ap = {
- .num_pages = 1,
- .pages = &page,
+ .num_folios = 1,
+ .folios = &folio,
.descs = &desc,
};
char *link;
@@ -1593,7 +1638,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page)
if (WARN_ON(res >= PAGE_SIZE))
return -EIO;
- link = page_address(page);
+ link = folio_address(folio);
link[res] = '\0';
return 0;
@@ -1603,7 +1648,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
struct fuse_conn *fc = get_fuse_conn(inode);
- struct page *page;
+ struct folio *folio;
int err;
err = -EIO;
@@ -1611,26 +1656,26 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
goto out_err;
if (fc->cache_symlinks)
- return page_get_link(dentry, inode, callback);
+ return page_get_link_raw(dentry, inode, callback);
err = -ECHILD;
if (!dentry)
goto out_err;
- page = alloc_page(GFP_KERNEL);
+ folio = folio_alloc(GFP_KERNEL, 0);
err = -ENOMEM;
- if (!page)
+ if (!folio)
goto out_err;
- err = fuse_readlink_page(inode, page);
+ err = fuse_readlink_page(inode, folio);
if (err) {
- __free_page(page);
+ folio_put(folio);
goto out_err;
}
- set_delayed_call(callback, page_put_link, page);
+ set_delayed_call(callback, page_put_link, folio);
- return page_address(page);
+ return folio_address(folio);
out_err:
return ERR_PTR(err);
@@ -1659,6 +1704,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file)
*/
if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE))
nonseekable_open(inode, file);
+ if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
}
return err;
@@ -1737,17 +1784,29 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
return true;
}
-static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
- struct fuse_setattr_in *arg, bool trust_local_cmtime)
+static void iattr_to_fattr(struct mnt_idmap *idmap, struct fuse_conn *fc,
+ struct iattr *iattr, struct fuse_setattr_in *arg,
+ bool trust_local_cmtime)
{
unsigned ivalid = iattr->ia_valid;
if (ivalid & ATTR_MODE)
arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
- if (ivalid & ATTR_UID)
- arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
- if (ivalid & ATTR_GID)
- arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
+
+ if (ivalid & ATTR_UID) {
+ kuid_t fsuid = from_vfsuid(idmap, fc->user_ns, iattr->ia_vfsuid);
+
+ arg->valid |= FATTR_UID;
+ arg->uid = from_kuid(fc->user_ns, fsuid);
+ }
+
+ if (ivalid & ATTR_GID) {
+ kgid_t fsgid = from_vfsgid(idmap, fc->user_ns, iattr->ia_vfsgid);
+
+ arg->valid |= FATTR_GID;
+ arg->gid = from_kgid(fc->user_ns, fsgid);
+ }
+
if (ivalid & ATTR_SIZE)
arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
if (ivalid & ATTR_ATIME) {
@@ -1867,8 +1926,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
* vmtruncate() doesn't allow for this case, so do the rlimit checking
* and the actual truncation by hand.
*/
-int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
- struct file *file)
+int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr, struct file *file)
{
struct inode *inode = d_inode(dentry);
struct fuse_mount *fm = get_fuse_mount(inode);
@@ -1888,7 +1947,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
- err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+ err = setattr_prepare(idmap, dentry, attr);
if (err)
return err;
@@ -1901,7 +1960,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (FUSE_IS_DAX(inode) && is_truncate) {
filemap_invalidate_lock(mapping);
fault_blocked = true;
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err) {
filemap_invalidate_unlock(mapping);
return err;
@@ -1947,7 +2006,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
+ iattr_to_fattr(idmap, fc, attr, &inarg, trust_local_cmtime);
if (file) {
struct fuse_file *ff = file->private_data;
inarg.valid |= FATTR_FH;
@@ -1995,7 +2054,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
fuse_change_attributes_common(inode, &outarg.attr, NULL,
ATTR_TIMEOUT(&outarg),
- fuse_get_cache_mask(inode));
+ fuse_get_cache_mask(inode), 0);
oldsize = inode->i_size;
/* see the comment in fuse_change_attributes() */
if (!is_wb || is_truncate)
@@ -2064,7 +2123,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry,
* ia_mode calculation may have used stale i_mode.
* Refresh and recalculate.
*/
- ret = fuse_do_getattr(inode, NULL, file);
+ ret = fuse_do_getattr(idmap, inode, NULL, file);
if (ret)
return ret;
@@ -2082,7 +2141,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry,
if (!attr->ia_valid)
return 0;
- ret = fuse_do_setattr(entry, attr, file);
+ ret = fuse_do_setattr(idmap, entry, attr, file);
if (!ret) {
/*
* If filesystem supports acls it may have updated acl xattrs in
@@ -2121,7 +2180,7 @@ static int fuse_getattr(struct mnt_idmap *idmap,
return -EACCES;
}
- return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
+ return fuse_update_get_attr(idmap, inode, NULL, stat, request_mask, flags);
}
static const struct inode_operations fuse_dir_inode_operations = {
@@ -2198,7 +2257,7 @@ void fuse_init_dir(struct inode *inode)
static int fuse_symlink_read_folio(struct file *null, struct folio *folio)
{
- int err = fuse_readlink_page(folio->mapping->host, &folio->page);
+ int err = fuse_readlink_page(folio->mapping->host, folio);
if (!err)
folio_mark_uptodate(folio);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a56e7bffd000..6f19a4daa559 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -253,7 +253,7 @@ static int fuse_open(struct inode *inode, struct file *file)
if (dax_truncate) {
filemap_invalidate_lock(inode->i_mapping);
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err)
goto out_inode_unlock;
}
@@ -436,7 +436,7 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
WARN_ON(get_fuse_inode(wpa->inode) != fi);
curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
- if (idx_from >= curr_index + wpa->ia.ap.num_pages)
+ if (idx_from >= curr_index + wpa->ia.ap.num_folios)
n = n->rb_right;
else if (idx_to < curr_index)
n = n->rb_left;
@@ -448,9 +448,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
/*
* Check if any page in a range is under writeback
- *
- * This is currently done by walking the list of writepage requests
- * for the inode, which can be pretty inefficient.
*/
static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
pgoff_t idx_to)
@@ -458,6 +455,9 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
struct fuse_inode *fi = get_fuse_inode(inode);
bool found;
+ if (RB_EMPTY_ROOT(&fi->writepages))
+ return false;
+
spin_lock(&fi->lock);
found = fuse_find_writeback(fi, idx_from, idx_to);
spin_unlock(&fi->lock);
@@ -483,6 +483,21 @@ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
}
+static inline bool fuse_folio_is_writeback(struct inode *inode,
+ struct folio *folio)
+{
+ pgoff_t last = folio_next_index(folio) - 1;
+ return fuse_range_is_writeback(inode, folio->index, last);
+}
+
+static void fuse_wait_on_folio_writeback(struct inode *inode,
+ struct folio *folio)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio));
+}
+
/*
* Wait for all pending writepages on the inode to finish.
*
@@ -645,17 +660,20 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
args->out_args[0].size = count;
}
-static void fuse_release_user_pages(struct fuse_args_pages *ap,
+static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres,
bool should_dirty)
{
unsigned int i;
- for (i = 0; i < ap->num_pages; i++) {
+ for (i = 0; i < ap->num_folios; i++) {
if (should_dirty)
- set_page_dirty_lock(ap->pages[i]);
+ folio_mark_dirty_lock(ap->folios[i]);
if (ap->args.is_pinned)
- unpin_user_page(ap->pages[i]);
+ unpin_folio(ap->folios[i]);
}
+
+ if (nres > 0 && ap->args.invalidate_vmap)
+ invalidate_kernel_vmap_range(ap->args.vmap_base, nres);
}
static void fuse_io_release(struct kref *kref)
@@ -725,16 +743,16 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
}
static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
- unsigned int npages)
+ unsigned int nfolios)
{
struct fuse_io_args *ia;
ia = kzalloc(sizeof(*ia), GFP_KERNEL);
if (ia) {
ia->io = io;
- ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
- &ia->ap.descs);
- if (!ia->ap.pages) {
+ ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL,
+ &ia->ap.descs);
+ if (!ia->ap.folios) {
kfree(ia);
ia = NULL;
}
@@ -744,7 +762,7 @@ static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
static void fuse_io_free(struct fuse_io_args *ia)
{
- kfree(ia->ap.pages);
+ kfree(ia->ap.folios);
kfree(ia);
}
@@ -754,25 +772,29 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
struct fuse_io_priv *io = ia->io;
ssize_t pos = -1;
-
- fuse_release_user_pages(&ia->ap, io->should_dirty);
+ size_t nres;
if (err) {
/* Nothing */
} else if (io->write) {
if (ia->write.out.size > ia->write.in.size) {
err = -EIO;
- } else if (ia->write.in.size != ia->write.out.size) {
- pos = ia->write.in.offset - io->offset +
- ia->write.out.size;
+ } else {
+ nres = ia->write.out.size;
+ if (ia->write.in.size != ia->write.out.size)
+ pos = ia->write.in.offset - io->offset +
+ ia->write.out.size;
}
} else {
u32 outsize = args->out_args[0].size;
+ nres = outsize;
if (ia->read.in.size != outsize)
pos = ia->read.in.offset - io->offset + outsize;
}
+ fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty);
+
fuse_aio_complete(io, err, pos);
fuse_io_free(ia);
}
@@ -843,33 +865,33 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
* reached the client fs yet. So the hole is not present there.
*/
if (!fc->writeback_cache) {
- loff_t pos = page_offset(ap->pages[0]) + num_read;
+ loff_t pos = folio_pos(ap->folios[0]) + num_read;
fuse_read_update_size(inode, pos, attr_ver);
}
}
-static int fuse_do_readpage(struct file *file, struct page *page)
+static int fuse_do_readfolio(struct file *file, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
- loff_t pos = page_offset(page);
- struct fuse_page_desc desc = { .length = PAGE_SIZE };
+ loff_t pos = folio_pos(folio);
+ struct fuse_folio_desc desc = { .length = PAGE_SIZE };
struct fuse_io_args ia = {
.ap.args.page_zeroing = true,
.ap.args.out_pages = true,
- .ap.num_pages = 1,
- .ap.pages = &page,
+ .ap.num_folios = 1,
+ .ap.folios = &folio,
.ap.descs = &desc,
};
ssize_t res;
u64 attr_ver;
/*
- * Page writeback can extend beyond the lifetime of the
- * page-cache page, so make sure we read a properly synced
- * page.
+ * With the temporary pages that are used to complete writeback, we can
+ * have writeback that extends beyond the lifetime of the folio. So
+ * make sure we read a properly synced folio.
*/
- fuse_wait_on_page_writeback(inode, page->index);
+ fuse_wait_on_folio_writeback(inode, folio);
attr_ver = fuse_get_attr_version(fm->fc);
@@ -887,25 +909,24 @@ static int fuse_do_readpage(struct file *file, struct page *page)
if (res < desc.length)
fuse_short_read(inode, attr_ver, res, &ia.ap);
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
return 0;
}
static int fuse_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
int err;
err = -EIO;
if (fuse_is_bad(inode))
goto out;
- err = fuse_do_readpage(file, page);
+ err = fuse_do_readfolio(file, folio);
fuse_invalidate_atime(inode);
out:
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
@@ -919,8 +940,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
size_t num_read = args->out_args[0].size;
struct address_space *mapping = NULL;
- for (i = 0; mapping == NULL && i < ap->num_pages; i++)
- mapping = ap->pages[i]->mapping;
+ for (i = 0; mapping == NULL && i < ap->num_folios; i++)
+ mapping = ap->folios[i]->mapping;
if (mapping) {
struct inode *inode = mapping->host;
@@ -934,15 +955,9 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_invalidate_atime(inode);
}
- for (i = 0; i < ap->num_pages; i++) {
- struct page *page = ap->pages[i];
-
- if (!err)
- SetPageUptodate(page);
- else
- SetPageError(page);
- unlock_page(page);
- put_page(page);
+ for (i = 0; i < ap->num_folios; i++) {
+ folio_end_read(ap->folios[i], !err);
+ folio_put(ap->folios[i]);
}
if (ia->ff)
fuse_file_put(ia->ff, false);
@@ -955,8 +970,9 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
struct fuse_args_pages *ap = &ia->ap;
- loff_t pos = page_offset(ap->pages[0]);
- size_t count = ap->num_pages << PAGE_SHIFT;
+ loff_t pos = folio_pos(ap->folios[0]);
+ /* Currently, all folios in FUSE are one page */
+ size_t count = ap->num_folios << PAGE_SHIFT;
ssize_t res;
int err;
@@ -967,7 +983,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
/* Don't overflow end offset */
if (pos + (count - 1) == LLONG_MAX) {
count--;
- ap->descs[ap->num_pages - 1].length--;
+ ap->descs[ap->num_folios - 1].length--;
}
WARN_ON((loff_t) (pos + count) < 0);
@@ -989,18 +1005,36 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
static void fuse_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
- unsigned int i, max_pages, nr_pages = 0;
+ unsigned int max_pages, nr_pages;
+ pgoff_t first = readahead_index(rac);
+ pgoff_t last = first + readahead_count(rac) - 1;
if (fuse_is_bad(inode))
return;
+ wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last));
+
max_pages = min_t(unsigned int, fc->max_pages,
fc->max_read / PAGE_SIZE);
- for (;;) {
+ /*
+ * This is only accurate the first time through, since readahead_folio()
+ * doesn't update readahead_count() from the previous folio until the
+ * next call. Grab nr_pages here so we know how many pages we're going
+ * to have to process. This means that we will exit here with
+ * readahead_count() == folio_nr_pages(last_folio), but we will have
+ * consumed all of the folios, and read_pages() will call
+ * readahead_folio() again which will clean up the rac.
+ */
+ nr_pages = readahead_count(rac);
+
+ while (nr_pages) {
struct fuse_io_args *ia;
struct fuse_args_pages *ap;
+ struct folio *folio;
+ unsigned cur_pages = min(max_pages, nr_pages);
if (fc->num_background >= fc->congestion_threshold &&
rac->ra->async_size >= readahead_count(rac))
@@ -1010,23 +1044,26 @@ static void fuse_readahead(struct readahead_control *rac)
*/
break;
- nr_pages = readahead_count(rac) - nr_pages;
- if (nr_pages > max_pages)
- nr_pages = max_pages;
- if (nr_pages == 0)
- break;
- ia = fuse_io_alloc(NULL, nr_pages);
+ ia = fuse_io_alloc(NULL, cur_pages);
if (!ia)
return;
ap = &ia->ap;
- nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
- for (i = 0; i < nr_pages; i++) {
- fuse_wait_on_page_writeback(inode,
- readahead_index(rac) + i);
- ap->descs[i].length = PAGE_SIZE;
+
+ while (ap->num_folios < cur_pages) {
+ /*
+ * This returns a folio with a ref held on it.
+ * The ref needs to be held until the request is
+ * completed, since the splice case (see
+ * fuse_try_move_page()) drops the ref after it's
+ * replaced in the page cache.
+ */
+ folio = __readahead_folio(rac);
+ ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].length = folio_size(folio);
+ ap->num_folios++;
}
- ap->num_pages = nr_pages;
fuse_send_readpages(ia, rac->file);
+ nr_pages -= cur_pages;
}
}
@@ -1143,8 +1180,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
bool short_write;
int err;
- for (i = 0; i < ap->num_pages; i++)
- fuse_wait_on_page_writeback(inode, ap->pages[i]->index);
+ for (i = 0; i < ap->num_folios; i++)
+ fuse_wait_on_folio_writeback(inode, ap->folios[i]);
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
@@ -1158,24 +1195,24 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
short_write = ia->write.out.size < count;
offset = ap->descs[0].offset;
count = ia->write.out.size;
- for (i = 0; i < ap->num_pages; i++) {
- struct page *page = ap->pages[i];
+ for (i = 0; i < ap->num_folios; i++) {
+ struct folio *folio = ap->folios[i];
if (err) {
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
} else {
- if (count >= PAGE_SIZE - offset)
- count -= PAGE_SIZE - offset;
+ if (count >= folio_size(folio) - offset)
+ count -= folio_size(folio) - offset;
else {
if (short_write)
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
count = 0;
}
offset = 0;
}
- if (ia->write.page_locked && (i == ap->num_pages - 1))
- unlock_page(page);
- put_page(page);
+ if (ia->write.folio_locked && (i == ap->num_folios - 1))
+ folio_unlock(folio);
+ folio_put(folio);
}
return err;
@@ -1189,6 +1226,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host);
unsigned offset = pos & (PAGE_SIZE - 1);
+ unsigned int nr_pages = 0;
size_t count = 0;
int err;
@@ -1197,7 +1235,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
do {
size_t tmp;
- struct page *page;
+ struct folio *folio;
pgoff_t index = pos >> PAGE_SHIFT;
size_t bytes = min_t(size_t, PAGE_SIZE - offset,
iov_iter_count(ii));
@@ -1209,27 +1247,30 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
if (fault_in_iov_iter_readable(ii, bytes))
break;
- err = -ENOMEM;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
break;
+ }
if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
- tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
- flush_dcache_page(page);
+ tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii);
+ flush_dcache_folio(folio);
if (!tmp) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
err = 0;
- ap->pages[ap->num_pages] = page;
- ap->descs[ap->num_pages].length = tmp;
- ap->num_pages++;
+ ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].length = tmp;
+ ap->num_folios++;
+ nr_pages++;
count += tmp;
pos += tmp;
@@ -1239,18 +1280,18 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
/* If we copied full page, mark it uptodate */
if (tmp == PAGE_SIZE)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
- if (PageUptodate(page)) {
- unlock_page(page);
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
} else {
- ia->write.page_locked = true;
+ ia->write.folio_locked = true;
break;
}
if (!fc->big_writes)
break;
} while (iov_iter_count(ii) && count < fc->max_write &&
- ap->num_pages < max_pages && offset == 0);
+ nr_pages < max_pages && offset == 0);
return count > 0 ? count : err;
}
@@ -1284,8 +1325,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
fc->max_pages);
- ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
- if (!ap->pages) {
+ ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs);
+ if (!ap->folios) {
err = -ENOMEM;
break;
}
@@ -1307,7 +1348,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
err = -EIO;
}
}
- kfree(ap->pages);
+ kfree(ap->folios);
} while (!err && iov_iter_count(ii));
fuse_write_update_attr(inode, pos, res);
@@ -1349,7 +1390,7 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from
/* shared locks are not allowed with parallel page cache IO */
if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
- return false;
+ return true;
/* Parallel dio beyond EOF is not supported, at least for now. */
if (fuse_io_past_eof(iocb, from))
@@ -1362,7 +1403,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
bool *exclusive)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct fuse_file *ff = iocb->ki_filp->private_data;
+ struct fuse_inode *fi = get_fuse_inode(inode);
*exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
if (*exclusive) {
@@ -1377,7 +1418,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
* have raced, so check it again.
*/
if (fuse_io_past_eof(iocb, from) ||
- fuse_file_uncached_io_start(inode, ff, NULL) != 0) {
+ fuse_inode_uncached_io_start(fi, NULL) != 0) {
inode_unlock_shared(inode);
inode_lock(inode);
*exclusive = true;
@@ -1388,13 +1429,13 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct fuse_file *ff = iocb->ki_filp->private_data;
+ struct fuse_inode *fi = get_fuse_inode(inode);
if (exclusive) {
inode_unlock(inode);
} else {
/* Allow opens in caching mode after last parallel dio end */
- fuse_file_uncached_io_end(inode, ff);
+ fuse_inode_uncached_io_end(fi);
inode_unlock_shared(inode);
}
}
@@ -1402,6 +1443,7 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
struct address_space *mapping = file->f_mapping;
ssize_t written = 0;
struct inode *inode = mapping->host;
@@ -1416,7 +1458,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
return err;
if (fc->handle_killpriv_v2 &&
- setattr_should_drop_suidgid(&nop_mnt_idmap,
+ setattr_should_drop_suidgid(idmap,
file_inode(file))) {
goto writethrough;
}
@@ -1433,11 +1475,7 @@ writethrough:
task_io_account_write(count);
- err = file_remove_privs(file);
- if (err)
- goto out;
-
- err = file_update_time(file);
+ err = kiocb_modified(iocb);
if (err)
goto out;
@@ -1471,52 +1509,89 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
size_t *nbytesp, int write,
- unsigned int max_pages)
+ unsigned int max_pages,
+ bool use_pages_for_kvec_io)
{
+ bool flush_or_invalidate = false;
+ unsigned int nr_pages = 0;
size_t nbytes = 0; /* # bytes already packed in req */
ssize_t ret = 0;
- /* Special case for kernel I/O: can copy directly into the buffer */
+ /* Special case for kernel I/O: can copy directly into the buffer.
+ * However if the implementation of fuse_conn requires pages instead of
+ * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead.
+ */
if (iov_iter_is_kvec(ii)) {
- unsigned long user_addr = fuse_get_user_addr(ii);
- size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
+ void *user_addr = (void *)fuse_get_user_addr(ii);
- if (write)
- ap->args.in_args[1].value = (void *) user_addr;
- else
- ap->args.out_args[0].value = (void *) user_addr;
+ if (!use_pages_for_kvec_io) {
+ size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
- iov_iter_advance(ii, frag_size);
- *nbytesp = frag_size;
- return 0;
+ if (write)
+ ap->args.in_args[1].value = user_addr;
+ else
+ ap->args.out_args[0].value = user_addr;
+
+ iov_iter_advance(ii, frag_size);
+ *nbytesp = frag_size;
+ return 0;
+ }
+
+ if (is_vmalloc_addr(user_addr)) {
+ ap->args.vmap_base = user_addr;
+ flush_or_invalidate = true;
+ }
+ }
+
+ /*
+ * Until there is support for iov_iter_extract_folios(), we have to
+ * manually extract pages using iov_iter_extract_pages() and then
+ * copy that to a folios array.
+ */
+ struct page **pages = kzalloc(max_pages * sizeof(struct page *),
+ GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
}
- while (nbytes < *nbytesp && ap->num_pages < max_pages) {
- unsigned npages;
+ while (nbytes < *nbytesp && nr_pages < max_pages) {
+ unsigned nfolios, i;
size_t start;
- struct page **pt_pages;
- pt_pages = &ap->pages[ap->num_pages];
- ret = iov_iter_extract_pages(ii, &pt_pages,
+ ret = iov_iter_extract_pages(ii, &pages,
*nbytesp - nbytes,
- max_pages - ap->num_pages,
+ max_pages - nr_pages,
0, &start);
if (ret < 0)
break;
nbytes += ret;
- ret += start;
- npages = DIV_ROUND_UP(ret, PAGE_SIZE);
+ nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE);
- ap->descs[ap->num_pages].offset = start;
- fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
+ for (i = 0; i < nfolios; i++) {
+ struct folio *folio = page_folio(pages[i]);
+ unsigned int offset = start +
+ (folio_page_idx(folio, pages[i]) << PAGE_SHIFT);
+ unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start);
+
+ ap->descs[ap->num_folios].offset = offset;
+ ap->descs[ap->num_folios].length = len;
+ ap->folios[ap->num_folios] = folio;
+ start = 0;
+ ret -= len;
+ ap->num_folios++;
+ }
- ap->num_pages += npages;
- ap->descs[ap->num_pages - 1].length -=
- (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
+ nr_pages += nfolios;
}
+ kfree(pages);
+ if (write && flush_or_invalidate)
+ flush_kernel_vmap_range(ap->args.vmap_base, nbytes);
+
+ ap->args.invalidate_vmap = !write && flush_or_invalidate;
ap->args.is_pinned = iov_iter_extract_will_pin(ii);
ap->args.user_pages = true;
if (write)
@@ -1524,6 +1599,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
else
ap->args.out_pages = true;
+out:
*nbytesp = nbytes;
return ret < 0 ? ret : 0;
@@ -1585,7 +1661,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
size_t nbytes = min(count, nmax);
err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
- max_pages);
+ max_pages, fc->use_pages_for_kvec_io);
if (err && !nbytes)
break;
@@ -1599,7 +1675,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
}
if (!io->async || nres < 0) {
- fuse_release_user_pages(&ia->ap, io->should_dirty);
+ fuse_release_user_pages(&ia->ap, nres, io->should_dirty);
fuse_io_free(ia);
}
ia = NULL;
@@ -1653,7 +1729,7 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t res;
- if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ if (!is_sync_kiocb(iocb)) {
res = fuse_direct_IO(iocb, to);
} else {
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
@@ -1667,7 +1743,6 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
bool exclusive;
@@ -1675,9 +1750,11 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
res = generic_write_checks(iocb, from);
if (res > 0) {
task_io_account_write(res);
- if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ if (!is_sync_kiocb(iocb)) {
res = fuse_direct_IO(iocb, from);
} else {
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+
res = fuse_direct_io(&io, from, &iocb->ki_pos,
FUSE_DIO_WRITE);
fuse_write_update_attr(inode, iocb->ki_pos, res);
@@ -1763,30 +1840,34 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
if (wpa->bucket)
fuse_sync_bucket_dec(wpa->bucket);
- for (i = 0; i < ap->num_pages; i++)
- __free_page(ap->pages[i]);
+ for (i = 0; i < ap->num_folios; i++)
+ folio_put(ap->folios[i]);
- if (wpa->ia.ff)
- fuse_file_put(wpa->ia.ff, false);
+ fuse_file_put(wpa->ia.ff, false);
- kfree(ap->pages);
+ kfree(ap->folios);
kfree(wpa);
}
-static void fuse_writepage_finish(struct fuse_mount *fm,
- struct fuse_writepage_args *wpa)
+static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
+ node_stat_sub_folio(folio, NR_WRITEBACK_TEMP);
+ wb_writeout_inc(&bdi->wb);
+}
+
+static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- for (i = 0; i < ap->num_pages; i++) {
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
- wb_writeout_inc(&bdi->wb);
- }
+ for (i = 0; i < ap->num_folios; i++)
+ fuse_writepage_finish_stat(inode, ap->folios[i]);
+
wake_up(&fi->page_waitq);
}
@@ -1800,7 +1881,8 @@ __acquires(fi->lock)
struct fuse_inode *fi = get_fuse_inode(wpa->inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
struct fuse_args *args = &wpa->ia.ap.args;
- __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
+ /* Currently, all folios in FUSE are one page */
+ __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE;
int err;
fi->writectr++;
@@ -1833,13 +1915,15 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
rb_erase(&wpa->writepages_entry, &fi->writepages);
- fuse_writepage_finish(fm, wpa);
+ fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
- /* After fuse_writepage_finish() aux request list is private */
+ /* After rb_erase() aux request list is private */
for (aux = wpa->next; aux; aux = next) {
next = aux->next;
aux->next = NULL;
+ fuse_writepage_finish_stat(aux->inode,
+ aux->ia.ap.folios[0]);
fuse_writepage_free(aux);
}
@@ -1874,11 +1958,11 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
struct fuse_writepage_args *wpa)
{
pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
- pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
+ pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1;
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
- WARN_ON(!wpa->ia.ap.num_pages);
+ WARN_ON(!wpa->ia.ap.num_folios);
while (*p) {
struct fuse_writepage_args *curr;
pgoff_t curr_index;
@@ -1889,7 +1973,7 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
WARN_ON(curr->inode != wpa->inode);
curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
- if (idx_from >= curr_index + curr->ia.ap.num_pages)
+ if (idx_from >= curr_index + curr->ia.ap.num_folios)
p = &(*p)->rb_right;
else if (idx_to < curr_index)
p = &(*p)->rb_left;
@@ -1934,7 +2018,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
wpa->next = next->next;
next->next = NULL;
- next->ia.ff = fuse_file_get(wpa->ia.ff);
tree_insert(&fi->writepages, next);
/*
@@ -1963,7 +2046,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_send_writepage(fm, next, inarg->offset + inarg->size);
}
fi->writectr--;
- fuse_writepage_finish(fm, wpa);
+ fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
fuse_writepage_free(wpa);
}
@@ -2022,9 +2105,9 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
if (wpa) {
ap = &wpa->ia.ap;
- ap->num_pages = 0;
- ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
- if (!ap->pages) {
+ ap->num_folios = 0;
+ ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs);
+ if (!ap->folios) {
kfree(wpa);
wpa = NULL;
}
@@ -2047,49 +2130,77 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
rcu_read_unlock();
}
+static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
+ struct folio *tmp_folio, uint32_t folio_index)
+{
+ struct inode *inode = folio->mapping->host;
+ struct fuse_args_pages *ap = &wpa->ia.ap;
+
+ folio_copy(tmp_folio, folio);
+
+ ap->folios[folio_index] = tmp_folio;
+ ap->descs[folio_index].offset = 0;
+ ap->descs[folio_index].length = PAGE_SIZE;
+
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
+ node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
+}
+
+static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
+ struct fuse_file *ff)
+{
+ struct inode *inode = folio->mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_writepage_args *wpa;
+ struct fuse_args_pages *ap;
+
+ wpa = fuse_writepage_args_alloc();
+ if (!wpa)
+ return NULL;
+
+ fuse_writepage_add_to_bucket(fc, wpa);
+ fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
+ wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
+ wpa->inode = inode;
+ wpa->ia.ff = ff;
+
+ ap = &wpa->ia.ap;
+ ap->args.in_pages = true;
+ ap->args.end = fuse_writepage_end;
+
+ return wpa;
+}
+
static int fuse_writepage_locked(struct folio *folio)
{
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_writepage_args *wpa;
struct fuse_args_pages *ap;
struct folio *tmp_folio;
+ struct fuse_file *ff;
int error = -ENOMEM;
- folio_start_writeback(folio);
-
- wpa = fuse_writepage_args_alloc();
- if (!wpa)
- goto err;
- ap = &wpa->ia.ap;
-
tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
if (!tmp_folio)
- goto err_free;
+ goto err;
error = -EIO;
- wpa->ia.ff = fuse_write_file_get(fi);
- if (!wpa->ia.ff)
+ ff = fuse_write_file_get(fi);
+ if (!ff)
goto err_nofile;
- fuse_writepage_add_to_bucket(fc, wpa);
- fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0);
+ wpa = fuse_writepage_args_setup(folio, ff);
+ error = -ENOMEM;
+ if (!wpa)
+ goto err_writepage_args;
- folio_copy(tmp_folio, folio);
- wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
- wpa->next = NULL;
- ap->args.in_pages = true;
- ap->num_pages = 1;
- ap->pages[0] = &tmp_folio->page;
- ap->descs[0].offset = 0;
- ap->descs[0].length = PAGE_SIZE;
- ap->args.end = fuse_writepage_end;
- wpa->inode = inode;
+ ap = &wpa->ia.ap;
+ ap->num_folios = 1;
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
+ folio_start_writeback(folio);
+ fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0);
spin_lock(&fi->lock);
tree_insert(&fi->writepages, wpa);
@@ -2101,13 +2212,12 @@ static int fuse_writepage_locked(struct folio *folio)
return 0;
+err_writepage_args:
+ fuse_file_put(ff, false);
err_nofile:
folio_put(tmp_folio);
-err_free:
- kfree(wpa);
err:
mapping_set_error(folio->mapping, error);
- folio_end_writeback(folio);
return error;
}
@@ -2115,32 +2225,32 @@ struct fuse_fill_wb_data {
struct fuse_writepage_args *wpa;
struct fuse_file *ff;
struct inode *inode;
- struct page **orig_pages;
- unsigned int max_pages;
+ struct folio **orig_folios;
+ unsigned int max_folios;
};
static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
{
struct fuse_args_pages *ap = &data->wpa->ia.ap;
struct fuse_conn *fc = get_fuse_conn(data->inode);
- struct page **pages;
- struct fuse_page_desc *descs;
- unsigned int npages = min_t(unsigned int,
- max_t(unsigned int, data->max_pages * 2,
- FUSE_DEFAULT_MAX_PAGES_PER_REQ),
+ struct folio **folios;
+ struct fuse_folio_desc *descs;
+ unsigned int nfolios = min_t(unsigned int,
+ max_t(unsigned int, data->max_folios * 2,
+ FUSE_DEFAULT_MAX_PAGES_PER_REQ),
fc->max_pages);
- WARN_ON(npages <= data->max_pages);
+ WARN_ON(nfolios <= data->max_folios);
- pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
- if (!pages)
+ folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs);
+ if (!folios)
return false;
- memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
- memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
- kfree(ap->pages);
- ap->pages = pages;
+ memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios);
+ memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios);
+ kfree(ap->folios);
+ ap->folios = folios;
ap->descs = descs;
- data->max_pages = npages;
+ data->max_folios = nfolios;
return true;
}
@@ -2150,17 +2260,16 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
struct fuse_writepage_args *wpa = data->wpa;
struct inode *inode = data->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- int num_pages = wpa->ia.ap.num_pages;
+ int num_folios = wpa->ia.ap.num_folios;
int i;
- wpa->ia.ff = fuse_file_get(data->ff);
spin_lock(&fi->lock);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
- for (i = 0; i < num_pages; i++)
- end_page_writeback(data->orig_pages[i]);
+ for (i = 0; i < num_folios; i++)
+ folio_end_writeback(data->orig_folios[i]);
}
/*
@@ -2171,15 +2280,15 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
* swapping the new temp page with the old one.
*/
static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
- struct page *page)
+ struct folio *folio)
{
struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
struct fuse_writepage_args *tmp;
struct fuse_writepage_args *old_wpa;
struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
- WARN_ON(new_ap->num_pages != 0);
- new_ap->num_pages = 1;
+ WARN_ON(new_ap->num_folios != 0);
+ new_ap->num_folios = 1;
spin_lock(&fi->lock);
old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
@@ -2193,9 +2302,9 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
WARN_ON(tmp->inode != new_wpa->inode);
curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
- if (curr_index == page->index) {
- WARN_ON(tmp->ia.ap.num_pages != 1);
- swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
+ if (curr_index == folio->index) {
+ WARN_ON(tmp->ia.ap.num_folios != 1);
+ swap(tmp->ia.ap.folios[0], new_ap->folios[0]);
break;
}
}
@@ -2208,22 +2317,19 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
spin_unlock(&fi->lock);
if (tmp) {
- struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
-
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
- wb_writeout_inc(&bdi->wb);
+ fuse_writepage_finish_stat(new_wpa->inode,
+ folio);
fuse_writepage_free(new_wpa);
}
return false;
}
-static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
+static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
struct fuse_args_pages *ap,
struct fuse_fill_wb_data *data)
{
- WARN_ON(!ap->num_pages);
+ WARN_ON(!ap->num_folios);
/*
* Being under writeback is unlikely but possible. For example direct
@@ -2231,23 +2337,23 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
* the pages are faulted with get_user_pages(), and then after the read
* completed.
*/
- if (fuse_page_is_writeback(data->inode, page->index))
+ if (fuse_folio_is_writeback(data->inode, folio))
return true;
/* Reached max pages */
- if (ap->num_pages == fc->max_pages)
+ if (ap->num_folios == fc->max_pages)
return true;
/* Reached max write bytes */
- if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
+ if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write)
return true;
/* Discontinuity */
- if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
+ if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio->index)
return true;
/* Need to grow the pages array? If so, did the expansion fail? */
- if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
+ if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data))
return true;
return false;
@@ -2262,7 +2368,7 @@ static int fuse_writepages_fill(struct folio *folio,
struct inode *inode = data->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
- struct page *tmp_page;
+ struct folio *tmp_folio;
int err;
if (!data->ff) {
@@ -2272,14 +2378,14 @@ static int fuse_writepages_fill(struct folio *folio,
goto out_unlock;
}
- if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
+ if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
}
err = -ENOMEM;
- tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!tmp_page)
+ tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
+ if (!tmp_folio)
goto out_unlock;
/*
@@ -2291,40 +2397,25 @@ static int fuse_writepages_fill(struct folio *folio,
* This is ensured by holding the page lock in page_mkwrite() while
* checking fuse_page_is_writeback(). We already hold the page lock
* since clear_page_dirty_for_io() and keep it held until we add the
- * request to the fi->writepages list and increment ap->num_pages.
+ * request to the fi->writepages list and increment ap->num_folios.
* After this fuse_page_is_writeback() will indicate that the page is
* under writeback, so we can release the page lock.
*/
if (data->wpa == NULL) {
err = -ENOMEM;
- wpa = fuse_writepage_args_alloc();
+ wpa = fuse_writepage_args_setup(folio, data->ff);
if (!wpa) {
- __free_page(tmp_page);
+ folio_put(tmp_folio);
goto out_unlock;
}
- fuse_writepage_add_to_bucket(fc, wpa);
-
- data->max_pages = 1;
-
+ fuse_file_get(wpa->ia.ff);
+ data->max_folios = 1;
ap = &wpa->ia.ap;
- fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0);
- wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
- wpa->next = NULL;
- ap->args.in_pages = true;
- ap->args.end = fuse_writepage_end;
- ap->num_pages = 0;
- wpa->inode = inode;
}
folio_start_writeback(folio);
- copy_highpage(tmp_page, &folio->page);
- ap->pages[ap->num_pages] = tmp_page;
- ap->descs[ap->num_pages].offset = 0;
- ap->descs[ap->num_pages].length = PAGE_SIZE;
- data->orig_pages[ap->num_pages] = &folio->page;
-
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+ fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios);
+ data->orig_folios[ap->num_folios] = folio;
err = 0;
if (data->wpa) {
@@ -2333,9 +2424,9 @@ static int fuse_writepages_fill(struct folio *folio,
* fuse_page_is_writeback().
*/
spin_lock(&fi->lock);
- ap->num_pages++;
+ ap->num_folios++;
spin_unlock(&fi->lock);
- } else if (fuse_writepage_add(wpa, &folio->page)) {
+ } else if (fuse_writepage_add(wpa, folio)) {
data->wpa = wpa;
} else {
folio_end_writeback(folio);
@@ -2367,21 +2458,21 @@ static int fuse_writepages(struct address_space *mapping,
data.ff = NULL;
err = -ENOMEM;
- data.orig_pages = kcalloc(fc->max_pages,
- sizeof(struct page *),
- GFP_NOFS);
- if (!data.orig_pages)
+ data.orig_folios = kcalloc(fc->max_pages,
+ sizeof(struct folio *),
+ GFP_NOFS);
+ if (!data.orig_folios)
goto out;
err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
if (data.wpa) {
- WARN_ON(!data.wpa->ia.ap.num_pages);
+ WARN_ON(!data.wpa->ia.ap.num_folios);
fuse_writepages_send(&data);
}
if (data.ff)
fuse_file_put(data.ff, false);
- kfree(data.orig_pages);
+ kfree(data.orig_folios);
out:
return err;
}
@@ -2391,76 +2482,77 @@ out:
* but how to implement it without killing performance need more thinking.
*/
static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
- struct page *page;
+ struct folio *folio;
loff_t fsize;
int err = -ENOMEM;
WARN_ON(!fc->writeback_cache);
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
goto error;
- fuse_wait_on_page_writeback(mapping->host, page->index);
+ fuse_wait_on_page_writeback(mapping->host, folio->index);
- if (PageUptodate(page) || len == PAGE_SIZE)
+ if (folio_test_uptodate(folio) || len >= folio_size(folio))
goto success;
/*
- * Check if the start this page comes after the end of file, in which
- * case the readpage can be optimized away.
+ * Check if the start of this folio comes after the end of file,
+ * in which case the readpage can be optimized away.
*/
fsize = i_size_read(mapping->host);
- if (fsize <= (pos & PAGE_MASK)) {
- size_t off = pos & ~PAGE_MASK;
+ if (fsize <= folio_pos(folio)) {
+ size_t off = offset_in_folio(folio, pos);
if (off)
- zero_user_segment(page, 0, off);
+ folio_zero_segment(folio, 0, off);
goto success;
}
- err = fuse_do_readpage(file, page);
+ err = fuse_do_readfolio(file, folio);
if (err)
goto cleanup;
success:
- *pagep = page;
+ *foliop = folio;
return 0;
cleanup:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
error:
return err;
}
static int fuse_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
/* Haven't copied anything? Skip zeroing, size extending, dirtying. */
if (!copied)
goto unlock;
pos += copied;
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
/* Zero any unwritten bytes at the end of the page */
size_t endoff = pos & ~PAGE_MASK;
if (endoff)
- zero_user_segment(page, endoff, PAGE_SIZE);
- SetPageUptodate(page);
+ folio_zero_segment(folio, endoff, PAGE_SIZE);
+ folio_mark_uptodate(folio);
}
if (pos > inode->i_size)
i_size_write(inode, pos);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -2509,17 +2601,17 @@ static void fuse_vma_close(struct vm_area_struct *vma)
*/
static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vmf->vma->vm_file);
file_update_time(vmf->vma->vm_file);
- lock_page(page);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
return VM_FAULT_NOPAGE;
}
- fuse_wait_on_page_writeback(inode, page->index);
+ fuse_wait_on_folio_writeback(inode, folio);
return VM_FAULT_LOCKED;
}
@@ -2574,8 +2666,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
* First mmap of direct_io file enters caching inode io mode.
* Also waits for parallel dio writers to go into serial mode
* (exclusive instead of shared lock).
+ * After first mmap, the inode stays in caching io mode until
+ * the direct_io file release.
*/
- rc = fuse_file_cached_io_start(inode, ff);
+ rc = fuse_file_cached_io_open(inode, ff);
if (rc)
return rc;
}
@@ -2968,7 +3062,7 @@ static void fuse_do_truncate(struct file *file)
attr.ia_file = file;
attr.ia_valid |= ATTR_FILE;
- fuse_do_setattr(file_dentry(file), &attr, file);
+ fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file);
}
static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
@@ -3111,7 +3205,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
inode_lock(inode);
if (block_faults) {
filemap_invalidate_lock(inode->i_mapping);
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err)
goto out;
}
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
new file mode 100644
index 000000000000..b3c2e32254ba
--- /dev/null
+++ b/fs/fuse/fuse_dev_i.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+ */
+#ifndef _FS_FUSE_DEV_I_H
+#define _FS_FUSE_DEV_I_H
+
+#include <linux/types.h>
+
+/* Ordinary requests have even IDs, while interrupts IDs are odd */
+#define FUSE_INT_REQ_BIT (1ULL << 0)
+#define FUSE_REQ_ID_STEP (1ULL << 1)
+
+struct fuse_arg;
+struct fuse_args;
+struct fuse_pqueue;
+struct fuse_req;
+struct fuse_iqueue;
+struct fuse_forget_link;
+
+struct fuse_copy_state {
+ int write;
+ struct fuse_req *req;
+ struct iov_iter *iter;
+ struct pipe_buffer *pipebufs;
+ struct pipe_buffer *currbuf;
+ struct pipe_inode_info *pipe;
+ unsigned long nr_segs;
+ struct page *pg;
+ unsigned int len;
+ unsigned int offset;
+ unsigned int move_pages:1;
+ unsigned int is_uring:1;
+ struct {
+ unsigned int copied_sz; /* copied size into the user buffer */
+ } ring;
+};
+
+static inline struct fuse_dev *fuse_get_dev(struct file *file)
+{
+ /*
+ * Lockless access is OK, because file->private data is set
+ * once during mount and is valid until the file is released.
+ */
+ return READ_ONCE(file->private_data);
+}
+
+unsigned int fuse_req_hash(u64 unique);
+struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
+
+void fuse_dev_end_requests(struct list_head *head);
+
+void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct iov_iter *iter);
+int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
+ unsigned int argpages, struct fuse_arg *args,
+ int zeroing);
+int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
+ unsigned int nbytes);
+void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget);
+void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req);
+bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock);
+
+bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list);
+bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing);
+
+#endif
+
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b24084b60864..d56d4fd956db 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -35,18 +35,38 @@
/** Default max number of pages that can be used in a single read request */
#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
-/** Maximum of max_pages received in init_out */
-#define FUSE_MAX_MAX_PAGES 256
-
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
-/** It could be as large as PATH_MAX, but would that have any uses? */
-#define FUSE_NAME_MAX 1024
+/** Maximum length of a filename, not including terminating null */
+
+/* maximum, small enough for FUSE_MIN_READ_BUFFER*/
+#define FUSE_NAME_LOW_MAX 1024
+/* maximum, but needs a request buffer > FUSE_MIN_READ_BUFFER */
+#define FUSE_NAME_MAX (PATH_MAX - 1)
/** Number of dentries for each connection in the control filesystem */
#define FUSE_CTL_NUM_DENTRIES 5
+/* Frequency (in seconds) of request timeout checks, if opted into */
+#define FUSE_TIMEOUT_TIMER_FREQ 15
+
+/** Frequency (in jiffies) of request timeout checks, if opted into */
+extern const unsigned long fuse_timeout_timer_freq;
+
+/** Maximum of max_pages received in init_out */
+extern unsigned int fuse_max_pages_limit;
+/*
+ * Default timeout (in seconds) for the server to reply to a request
+ * before the connection is aborted, if no timeout was specified on mount.
+ */
+extern unsigned int fuse_default_req_timeout;
+/*
+ * Max timeout (in seconds) for the server to reply to a request before
+ * the connection is aborted.
+ */
+extern unsigned int fuse_max_req_timeout;
+
/** List of active connections */
extern struct list_head fuse_conn_list;
@@ -285,8 +305,8 @@ struct fuse_arg {
void *value;
};
-/** FUSE page descriptor */
-struct fuse_page_desc {
+/** FUSE folio descriptor */
+struct fuse_folio_desc {
unsigned int length;
unsigned int offset;
};
@@ -309,16 +329,19 @@ struct fuse_args {
bool may_block:1;
bool is_ext:1;
bool is_pinned:1;
- struct fuse_in_arg in_args[3];
+ bool invalidate_vmap:1;
+ struct fuse_in_arg in_args[4];
struct fuse_arg out_args[2];
void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
+ /* Used for kvec iter backed by vmalloc address */
+ void *vmap_base;
};
struct fuse_args_pages {
struct fuse_args args;
- struct page **pages;
- struct fuse_page_desc *descs;
- unsigned int num_pages;
+ struct folio **folios;
+ struct fuse_folio_desc *descs;
+ unsigned int num_folios;
};
struct fuse_release_args {
@@ -375,6 +398,7 @@ struct fuse_io_priv {
* FR_FINISHED: request is finished
* FR_PRIVATE: request is on private list
* FR_ASYNC: request is asynchronous
+ * FR_URING: request is handled through fuse-io-uring
*/
enum fuse_req_flag {
FR_ISREPLY,
@@ -389,6 +413,7 @@ enum fuse_req_flag {
FR_FINISHED,
FR_PRIVATE,
FR_ASYNC,
+ FR_URING,
};
/**
@@ -435,6 +460,13 @@ struct fuse_req {
/** fuse_mount this request belongs to */
struct fuse_mount *fm;
+
+#ifdef CONFIG_FUSE_IO_URING
+ void *ring_entry;
+ void *ring_queue;
+#endif
+ /** When (in jiffies) the request was created */
+ unsigned long create_time;
};
struct fuse_iqueue;
@@ -449,22 +481,19 @@ struct fuse_iqueue;
*/
struct fuse_iqueue_ops {
/**
- * Signal that a forget has been queued
+ * Send one forget
*/
- void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
- __releases(fiq->lock);
+ void (*send_forget)(struct fuse_iqueue *fiq, struct fuse_forget_link *link);
/**
- * Signal that an INTERRUPT request has been queued
+ * Send interrupt for request
*/
- void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
- __releases(fiq->lock);
+ void (*send_interrupt)(struct fuse_iqueue *fiq, struct fuse_req *req);
/**
- * Signal that a request has been queued
+ * Send one request
*/
- void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
- __releases(fiq->lock);
+ void (*send_req)(struct fuse_iqueue *fiq, struct fuse_req *req);
/**
* Clean up when fuse_iqueue is destroyed
@@ -860,6 +889,15 @@ struct fuse_conn {
/** Passthrough support for read/write IO */
unsigned int passthrough:1;
+ /* Use pages instead of pointer for kernel I/O */
+ unsigned int use_pages_for_kvec_io:1;
+
+ /* Is link not implemented by fs? */
+ unsigned int no_link:1;
+
+ /* Use io_uring for communication */
+ unsigned int io_uring;
+
/** Maximum stack depth for passthrough backing files */
int max_stack_depth;
@@ -869,7 +907,7 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Entry on the fuse_mount_list */
+ /** Entry on the fuse_conn_list */
struct list_head entry;
/** Device ID from the root super block */
@@ -887,6 +925,12 @@ struct fuse_conn {
/** Version counter for attribute changes */
atomic64_t attr_version;
+ /** Version counter for evict inode */
+ atomic64_t evict_ctr;
+
+ /* maximum file name length */
+ u32 name_max;
+
/** Called on final put */
void (*release)(struct fuse_conn *);
@@ -917,6 +961,20 @@ struct fuse_conn {
/** IDR for backing files ids */
struct idr backing_files_map;
#endif
+
+#ifdef CONFIG_FUSE_IO_URING
+ /** uring connection information*/
+ struct fuse_ring *ring;
+#endif
+
+ /** Only used if the connection opts into request timeouts */
+ struct {
+ /* Worker for checking if any requests have timed out */
+ struct delayed_work work;
+
+ /* Request timeout (in jiffies). 0 = no timeout */
+ unsigned int req_timeout;
+ } timeout;
};
/*
@@ -941,6 +999,19 @@ struct fuse_mount {
struct rcu_head rcu;
};
+/*
+ * Empty header for FUSE opcodes without specific header needs.
+ * Used as a placeholder in args->in_args[0] for consistency
+ * across all FUSE operations, simplifying request handling.
+ */
+struct fuse_zero_header {};
+
+static inline void fuse_set_zero_arg0(struct fuse_args *args)
+{
+ args->in_args[0].size = sizeof(struct fuse_zero_header);
+ args->in_args[0].value = NULL;
+}
+
static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
{
return sb->s_fs_info;
@@ -981,6 +1052,11 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
return atomic64_read(&fc->attr_version);
}
+static inline u64 fuse_get_evict_ctr(struct fuse_conn *fc)
+{
+ return atomic64_read(&fc->evict_ctr);
+}
+
static inline bool fuse_stale_inode(const struct inode *inode, int generation,
struct fuse_attr *attr)
{
@@ -998,25 +1074,25 @@ static inline bool fuse_is_bad(struct inode *inode)
return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state));
}
-static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
- struct fuse_page_desc **desc)
+static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags,
+ struct fuse_folio_desc **desc)
{
- struct page **pages;
+ struct folio **folios;
- pages = kzalloc(npages * (sizeof(struct page *) +
- sizeof(struct fuse_page_desc)), flags);
- *desc = (void *) (pages + npages);
+ folios = kzalloc(nfolios * (sizeof(struct folio *) +
+ sizeof(struct fuse_folio_desc)), flags);
+ *desc = (void *) (folios + nfolios);
- return pages;
+ return folios;
}
-static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
- unsigned int index,
- unsigned int nr_pages)
+static inline void fuse_folio_descs_length_init(struct fuse_folio_desc *descs,
+ unsigned int index,
+ unsigned int nr_folios)
{
int i;
- for (i = index; i < index + nr_pages; i++)
+ for (i = index; i < index + nr_folios; i++)
descs[i].length = PAGE_SIZE - descs[i].offset;
}
@@ -1040,7 +1116,8 @@ extern const struct dentry_operations fuse_root_dentry_operations;
*/
struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
int generation, struct fuse_attr *attr,
- u64 attr_valid, u64 attr_version);
+ u64 attr_valid, u64 attr_version,
+ u64 evict_ctr);
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode);
@@ -1053,10 +1130,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
struct fuse_forget_link *fuse_alloc_forget(void);
-struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
- unsigned int max,
- unsigned int *countp);
-
/*
* Initialize READ or READDIR request
*/
@@ -1069,7 +1142,7 @@ struct fuse_io_args {
struct {
struct fuse_write_in in;
struct fuse_write_out out;
- bool page_locked;
+ bool folio_locked;
} write;
};
struct fuse_args_pages ap;
@@ -1134,7 +1207,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
struct fuse_statx *sx,
- u64 attr_valid, u32 cache_mask);
+ u64 attr_valid, u32 cache_mask,
+ u64 evict_ctr);
u32 fuse_get_cache_mask(struct inode *inode);
@@ -1154,7 +1228,22 @@ void __exit fuse_ctl_cleanup(void);
/**
* Simple request sending that does request allocation and freeing
*/
-ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args);
+ssize_t __fuse_simple_request(struct mnt_idmap *idmap,
+ struct fuse_mount *fm,
+ struct fuse_args *args);
+
+static inline ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
+{
+ return __fuse_simple_request(&invalid_mnt_idmap, fm, args);
+}
+
+static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap,
+ struct fuse_mount *fm,
+ struct fuse_args *args)
+{
+ return __fuse_simple_request(idmap, fm, args);
+}
+
int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
@@ -1167,6 +1256,9 @@ void fuse_request_end(struct fuse_req *req);
void fuse_abort_conn(struct fuse_conn *fc);
void fuse_wait_aborted(struct fuse_conn *fc);
+/* Check if any requests timed out */
+void fuse_check_timeout(struct work_struct *work);
+
/**
* Invalidate inode attributes
*/
@@ -1196,6 +1288,11 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
+ * Initialize the fuse processing queue
+ */
+void fuse_pqueue_init(struct fuse_pqueue *fpq);
+
+/**
* Initialize fuse_conn
*/
void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
@@ -1330,8 +1427,8 @@ bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written);
int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
-int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
- struct file *file);
+int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr, struct file *file);
void fuse_set_initialized(struct fuse_conn *fc);
@@ -1394,9 +1491,10 @@ int fuse_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
/* iomode.c */
-int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff);
-int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb);
-void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff);
+int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff);
+int fuse_inode_uncached_io_start(struct fuse_inode *fi,
+ struct fuse_backing *fb);
+void fuse_inode_uncached_io_end(struct fuse_inode *fi);
int fuse_file_io_open(struct file *file, struct inode *inode);
void fuse_file_io_release(struct fuse_file *ff, struct inode *inode);
@@ -1471,4 +1569,12 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
size_t len, unsigned int flags);
ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma);
+#ifdef CONFIG_SYSCTL
+extern int fuse_sysctl_register(void);
+extern void fuse_sysctl_unregister(void);
+#else
+#define fuse_sysctl_register() (0)
+#define fuse_sysctl_unregister() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h
new file mode 100644
index 000000000000..bbe9ddd8c716
--- /dev/null
+++ b/fs/fuse/fuse_trace.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fuse
+
+#if !defined(_TRACE_FUSE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FUSE_H
+
+#include <linux/tracepoint.h>
+
+#define OPCODES \
+ EM( FUSE_LOOKUP, "FUSE_LOOKUP") \
+ EM( FUSE_FORGET, "FUSE_FORGET") \
+ EM( FUSE_GETATTR, "FUSE_GETATTR") \
+ EM( FUSE_SETATTR, "FUSE_SETATTR") \
+ EM( FUSE_READLINK, "FUSE_READLINK") \
+ EM( FUSE_SYMLINK, "FUSE_SYMLINK") \
+ EM( FUSE_MKNOD, "FUSE_MKNOD") \
+ EM( FUSE_MKDIR, "FUSE_MKDIR") \
+ EM( FUSE_UNLINK, "FUSE_UNLINK") \
+ EM( FUSE_RMDIR, "FUSE_RMDIR") \
+ EM( FUSE_RENAME, "FUSE_RENAME") \
+ EM( FUSE_LINK, "FUSE_LINK") \
+ EM( FUSE_OPEN, "FUSE_OPEN") \
+ EM( FUSE_READ, "FUSE_READ") \
+ EM( FUSE_WRITE, "FUSE_WRITE") \
+ EM( FUSE_STATFS, "FUSE_STATFS") \
+ EM( FUSE_RELEASE, "FUSE_RELEASE") \
+ EM( FUSE_FSYNC, "FUSE_FSYNC") \
+ EM( FUSE_SETXATTR, "FUSE_SETXATTR") \
+ EM( FUSE_GETXATTR, "FUSE_GETXATTR") \
+ EM( FUSE_LISTXATTR, "FUSE_LISTXATTR") \
+ EM( FUSE_REMOVEXATTR, "FUSE_REMOVEXATTR") \
+ EM( FUSE_FLUSH, "FUSE_FLUSH") \
+ EM( FUSE_INIT, "FUSE_INIT") \
+ EM( FUSE_OPENDIR, "FUSE_OPENDIR") \
+ EM( FUSE_READDIR, "FUSE_READDIR") \
+ EM( FUSE_RELEASEDIR, "FUSE_RELEASEDIR") \
+ EM( FUSE_FSYNCDIR, "FUSE_FSYNCDIR") \
+ EM( FUSE_GETLK, "FUSE_GETLK") \
+ EM( FUSE_SETLK, "FUSE_SETLK") \
+ EM( FUSE_SETLKW, "FUSE_SETLKW") \
+ EM( FUSE_ACCESS, "FUSE_ACCESS") \
+ EM( FUSE_CREATE, "FUSE_CREATE") \
+ EM( FUSE_INTERRUPT, "FUSE_INTERRUPT") \
+ EM( FUSE_BMAP, "FUSE_BMAP") \
+ EM( FUSE_DESTROY, "FUSE_DESTROY") \
+ EM( FUSE_IOCTL, "FUSE_IOCTL") \
+ EM( FUSE_POLL, "FUSE_POLL") \
+ EM( FUSE_NOTIFY_REPLY, "FUSE_NOTIFY_REPLY") \
+ EM( FUSE_BATCH_FORGET, "FUSE_BATCH_FORGET") \
+ EM( FUSE_FALLOCATE, "FUSE_FALLOCATE") \
+ EM( FUSE_READDIRPLUS, "FUSE_READDIRPLUS") \
+ EM( FUSE_RENAME2, "FUSE_RENAME2") \
+ EM( FUSE_LSEEK, "FUSE_LSEEK") \
+ EM( FUSE_COPY_FILE_RANGE, "FUSE_COPY_FILE_RANGE") \
+ EM( FUSE_SETUPMAPPING, "FUSE_SETUPMAPPING") \
+ EM( FUSE_REMOVEMAPPING, "FUSE_REMOVEMAPPING") \
+ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \
+ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \
+ EM( FUSE_STATX, "FUSE_STATX") \
+ EMe(CUSE_INIT, "CUSE_INIT")
+
+/*
+ * This will turn the above table into TRACE_DEFINE_ENUM() for each of the
+ * entries.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+OPCODES
+
+/* Now we redfine it with the table that __print_symbolic needs. */
+#undef EM
+#undef EMe
+#define EM(a, b) {a, b},
+#define EMe(a, b) {a, b}
+
+TRACE_EVENT(fuse_request_send,
+ TP_PROTO(const struct fuse_req *req),
+
+ TP_ARGS(req),
+
+ TP_STRUCT__entry(
+ __field(dev_t, connection)
+ __field(uint64_t, unique)
+ __field(enum fuse_opcode, opcode)
+ __field(uint32_t, len)
+ ),
+
+ TP_fast_assign(
+ __entry->connection = req->fm->fc->dev;
+ __entry->unique = req->in.h.unique;
+ __entry->opcode = req->in.h.opcode;
+ __entry->len = req->in.h.len;
+ ),
+
+ TP_printk("connection %u req %llu opcode %u (%s) len %u ",
+ __entry->connection, __entry->unique, __entry->opcode,
+ __print_symbolic(__entry->opcode, OPCODES), __entry->len)
+);
+
+TRACE_EVENT(fuse_request_end,
+ TP_PROTO(const struct fuse_req *req),
+
+ TP_ARGS(req),
+
+ TP_STRUCT__entry(
+ __field(dev_t, connection)
+ __field(uint64_t, unique)
+ __field(uint32_t, len)
+ __field(int32_t, error)
+ ),
+
+ TP_fast_assign(
+ __entry->connection = req->fm->fc->dev;
+ __entry->unique = req->in.h.unique;
+ __entry->len = req->out.h.len;
+ __entry->error = req->out.h.error;
+ ),
+
+ TP_printk("connection %u req %llu len %u error %d", __entry->connection,
+ __entry->unique, __entry->len, __entry->error)
+);
+
+#endif /* _TRACE_FUSE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE fuse_trace
+#include <trace/define_trace.h>
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3a5d88878335..fd48e8d37f2e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,6 +7,7 @@
*/
#include "fuse_i.h"
+#include "dev_uring_i.h"
#include <linux/pagemap.h>
#include <linux/slab.h>
@@ -35,6 +36,11 @@ DEFINE_MUTEX(fuse_mutex);
static int set_global_limit(const char *val, const struct kernel_param *kp);
+unsigned int fuse_max_pages_limit = 256;
+/* default is no timeout */
+unsigned int fuse_default_req_timeout;
+unsigned int fuse_max_req_timeout;
+
unsigned max_user_bgreq;
module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
&max_user_bgreq, 0644);
@@ -173,8 +179,17 @@ static void fuse_evict_inode(struct inode *inode)
fuse_cleanup_submount_lookup(fc, fi->submount_lookup);
fi->submount_lookup = NULL;
}
+ /*
+ * Evict of non-deleted inode may race with outstanding
+ * LOOKUP/READDIRPLUS requests and result in inconsistency when
+ * the request finishes. Deal with that here by bumping a
+ * counter that can be compared to the starting value.
+ */
+ if (inode->i_nlink > 0)
+ atomic64_inc(&fc->evict_ctr);
}
if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
+ WARN_ON(fi->iocachectr != 0);
WARN_ON(!list_empty(&fi->write_files));
WARN_ON(!list_empty(&fi->queued_writes));
}
@@ -205,17 +220,30 @@ static ino_t fuse_squash_ino(u64 ino64)
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
struct fuse_statx *sx,
- u64 attr_valid, u32 cache_mask)
+ u64 attr_valid, u32 cache_mask,
+ u64 evict_ctr)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
lockdep_assert_held(&fi->lock);
+ /*
+ * Clear basic stats from invalid mask.
+ *
+ * Don't do this if this is coming from a fuse_iget() call and there
+ * might have been a racing evict which would've invalidated the result
+ * if the attr_version would've been preserved.
+ *
+ * !evict_ctr -> this is create
+ * fi->attr_version != 0 -> this is not a new inode
+ * evict_ctr == fuse_get_evict_ctr() -> no evicts while during request
+ */
+ if (!evict_ctr || fi->attr_version || evict_ctr == fuse_get_evict_ctr(fc))
+ set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0);
+
fi->attr_version = atomic64_inc_return(&fc->attr_version);
fi->i_time = attr_valid;
- /* Clear basic stats from invalid mask */
- set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0);
inode->i_ino = fuse_squash_ino(attr->ino);
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -294,9 +322,9 @@ u32 fuse_get_cache_mask(struct inode *inode)
return STATX_MTIME | STATX_CTIME | STATX_SIZE;
}
-void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
- struct fuse_statx *sx,
- u64 attr_valid, u64 attr_version)
+static void fuse_change_attributes_i(struct inode *inode, struct fuse_attr *attr,
+ struct fuse_statx *sx, u64 attr_valid,
+ u64 attr_version, u64 evict_ctr)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -330,7 +358,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
}
old_mtime = inode_get_mtime(inode);
- fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask);
+ fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask,
+ evict_ctr);
oldsize = inode->i_size;
/*
@@ -371,6 +400,13 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
fuse_dax_dontcache(inode, attr->flags);
}
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ struct fuse_statx *sx, u64 attr_valid,
+ u64 attr_version)
+{
+ fuse_change_attributes_i(inode, attr, sx, attr_valid, attr_version, 0);
+}
+
static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl,
u64 nodeid)
{
@@ -425,7 +461,8 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp)
struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
int generation, struct fuse_attr *attr,
- u64 attr_valid, u64 attr_version)
+ u64 attr_valid, u64 attr_version,
+ u64 evict_ctr)
{
struct inode *inode;
struct fuse_inode *fi;
@@ -486,8 +523,8 @@ retry:
fi->nlookup++;
spin_unlock(&fi->lock);
done:
- fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version);
-
+ fuse_change_attributes_i(inode, attr, NULL, attr_valid, attr_version,
+ evict_ctr);
return inode;
}
@@ -739,8 +776,8 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = {
fsparam_string ("source", OPT_SOURCE),
fsparam_u32 ("fd", OPT_FD),
fsparam_u32oct ("rootmode", OPT_ROOTMODE),
- fsparam_u32 ("user_id", OPT_USER_ID),
- fsparam_u32 ("group_id", OPT_GROUP_ID),
+ fsparam_uid ("user_id", OPT_USER_ID),
+ fsparam_gid ("group_id", OPT_GROUP_ID),
fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS),
fsparam_flag ("allow_other", OPT_ALLOW_OTHER),
fsparam_u32 ("max_read", OPT_MAX_READ),
@@ -754,6 +791,8 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
struct fs_parse_result result;
struct fuse_fs_context *ctx = fsc->fs_private;
int opt;
+ kuid_t kuid;
+ kgid_t kgid;
if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
/*
@@ -798,16 +837,26 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
break;
case OPT_USER_ID:
- ctx->user_id = make_kuid(fsc->user_ns, result.uint_32);
- if (!uid_valid(ctx->user_id))
+ kuid = result.uid;
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fsc->user_ns, kuid))
return invalfc(fsc, "Invalid user_id");
+ ctx->user_id = kuid;
ctx->user_id_present = true;
break;
case OPT_GROUP_ID:
- ctx->group_id = make_kgid(fsc->user_ns, result.uint_32);
- if (!gid_valid(ctx->group_id))
+ kgid = result.gid;
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fsc->user_ns, kgid))
return invalfc(fsc, "Invalid group_id");
+ ctx->group_id = kgid;
ctx->group_id_present = true;
break;
@@ -892,7 +941,7 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq,
fiq->priv = priv;
}
-static void fuse_pqueue_init(struct fuse_pqueue *fpq)
+void fuse_pqueue_init(struct fuse_pqueue *fpq)
{
unsigned int i;
@@ -927,11 +976,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
fc->initialized = 0;
fc->connected = 1;
atomic64_set(&fc->attr_version, 1);
+ atomic64_set(&fc->evict_ctr, 1);
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
- fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
+ fc->max_pages_limit = fuse_max_pages_limit;
+ fc->name_max = FUSE_NAME_LOW_MAX;
+ fc->timeout.req_timeout = 0;
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_files_init(fc);
@@ -946,6 +998,8 @@ static void delayed_release(struct rcu_head *p)
{
struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
+ fuse_uring_destruct(fc);
+
put_user_ns(fc->user_ns);
fc->release(fc);
}
@@ -958,6 +1012,8 @@ void fuse_conn_put(struct fuse_conn *fc)
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_conn_free(fc);
+ if (fc->timeout.req_timeout)
+ cancel_delayed_work_sync(&fc->timeout.work);
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
@@ -988,7 +1044,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
attr.mode = mode;
attr.ino = FUSE_ROOT_ID;
attr.nlink = 1;
- return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0);
+ return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0, 0);
}
struct fuse_inode_handle {
@@ -1208,6 +1264,34 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
spin_unlock(&fc->bg_lock);
}
+static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout)
+{
+ fc->timeout.req_timeout = secs_to_jiffies(timeout);
+ INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout);
+ queue_delayed_work(system_wq, &fc->timeout.work,
+ fuse_timeout_timer_freq);
+}
+
+static void init_server_timeout(struct fuse_conn *fc, unsigned int timeout)
+{
+ if (!timeout && !fuse_max_req_timeout && !fuse_default_req_timeout)
+ return;
+
+ if (!timeout)
+ timeout = fuse_default_req_timeout;
+
+ if (fuse_max_req_timeout) {
+ if (timeout)
+ timeout = min(fuse_max_req_timeout, timeout);
+ else
+ timeout = fuse_max_req_timeout;
+ }
+
+ timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout);
+
+ set_request_timeout(fc, timeout);
+}
+
struct fuse_init_args {
struct fuse_args args;
struct fuse_init_in in;
@@ -1226,6 +1310,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
ok = false;
else {
unsigned long ra_pages;
+ unsigned int timeout = 0;
process_init_limits(fc, arg);
@@ -1289,6 +1374,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->max_pages =
min_t(unsigned int, fc->max_pages_limit,
max_t(unsigned int, arg->max_pages, 1));
+
+ /*
+ * PATH_MAX file names might need two pages for
+ * ops like rename
+ */
+ if (fc->max_pages > 1)
+ fc->name_max = FUSE_NAME_MAX;
}
if (IS_ENABLED(CONFIG_FUSE_DAX)) {
if (flags & FUSE_MAP_ALIGNMENT &&
@@ -1319,23 +1411,41 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
* on a stacked fs (e.g. overlayfs) themselves and with
* max_stack_depth == 1, FUSE fs can be stacked as the
* underlying fs of a stacked fs (e.g. overlayfs).
+ *
+ * Also don't allow the combination of FUSE_PASSTHROUGH
+ * and FUSE_WRITEBACK_CACHE, current design doesn't handle
+ * them together.
*/
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) &&
(flags & FUSE_PASSTHROUGH) &&
arg->max_stack_depth > 0 &&
- arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) {
+ arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH &&
+ !(flags & FUSE_WRITEBACK_CACHE)) {
fc->passthrough = 1;
fc->max_stack_depth = arg->max_stack_depth;
fm->sb->s_stack_depth = arg->max_stack_depth;
}
if (flags & FUSE_NO_EXPORT_SUPPORT)
fm->sb->s_export_op = &fuse_export_fid_operations;
+ if (flags & FUSE_ALLOW_IDMAP) {
+ if (fc->default_permissions)
+ fm->sb->s_iflags &= ~SB_I_NOIDMAP;
+ else
+ ok = false;
+ }
+ if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
+ fc->io_uring = 1;
+
+ if (flags & FUSE_REQUEST_TIMEOUT)
+ timeout = arg->request_timeout;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
+ init_server_timeout(fc, timeout);
+
fm->sb->s_bdi->ra_pages =
min(fm->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
@@ -1377,7 +1487,8 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
- FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND;
+ FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP |
+ FUSE_REQUEST_TIMEOUT;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
@@ -1389,6 +1500,13 @@ void fuse_send_init(struct fuse_mount *fm)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
flags |= FUSE_PASSTHROUGH;
+ /*
+ * This is just an information flag for fuse server. No need to check
+ * the reply - server is either sending IORING_OP_URING_CMD or not.
+ */
+ if (fuse_uring_enabled())
+ flags |= FUSE_OVER_IO_URING;
+
ia->in.flags = flags;
ia->in.flags2 = flags >> 32;
@@ -1554,6 +1672,7 @@ static void fuse_sb_defaults(struct super_block *sb)
sb->s_time_gran = 1;
sb->s_export_op = &fuse_export_operations;
sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ sb->s_iflags |= SB_I_NOIDMAP;
if (sb->s_user_ns != &init_user_ns)
sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
@@ -1585,7 +1704,8 @@ static int fuse_fill_super_submount(struct super_block *sb,
return -ENOMEM;
fuse_fill_attr_from_inode(&root_attr, parent_fi);
- root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0);
+ root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0,
+ fuse_get_evict_ctr(fm->fc));
/*
* This inode is just a duplicate, so it is not looked up and
* its nlookup should not be incremented. fuse_iget() does
@@ -1966,7 +2086,7 @@ static void fuse_kill_sb_anon(struct super_block *sb)
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
.name = "fuse",
- .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
+ .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
.init_fs_context = fuse_init_fs_context,
.parameters = fuse_fs_parameters,
.kill_sb = fuse_kill_sb_anon,
@@ -1987,7 +2107,7 @@ static struct file_system_type fuseblk_fs_type = {
.init_fs_context = fuse_init_fs_context,
.parameters = fuse_fs_parameters,
.kill_sb = fuse_kill_sb_blk,
- .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
+ .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("fuseblk");
@@ -2038,8 +2158,14 @@ static int __init fuse_fs_init(void)
if (err)
goto out3;
+ err = fuse_sysctl_register();
+ if (err)
+ goto out4;
+
return 0;
+ out4:
+ unregister_filesystem(&fuse_fs_type);
out3:
unregister_fuseblk();
out2:
@@ -2050,6 +2176,7 @@ static int __init fuse_fs_init(void)
static void fuse_fs_cleanup(void)
{
+ fuse_sysctl_unregister();
unregister_filesystem(&fuse_fs_type);
unregister_fuseblk();
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 726640fa439e..2d9abf48828f 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -8,6 +8,9 @@
#include <linux/uio.h>
#include <linux/compat.h>
#include <linux/fileattr.h>
+#include <linux/fsverity.h>
+
+#define FUSE_VERITY_ENABLE_ARG_MAX_PAGES 256
static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_ioctl_out *outarg)
@@ -117,6 +120,53 @@ static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
return 0;
}
+/* For fs-verity, determine iov lengths from input */
+static int fuse_setup_measure_verity(unsigned long arg, struct iovec *iov)
+{
+ __u16 digest_size;
+ struct fsverity_digest __user *uarg = (void __user *)arg;
+
+ if (copy_from_user(&digest_size, &uarg->digest_size, sizeof(digest_size)))
+ return -EFAULT;
+
+ if (digest_size > SIZE_MAX - sizeof(struct fsverity_digest))
+ return -EINVAL;
+
+ iov->iov_len = sizeof(struct fsverity_digest) + digest_size;
+
+ return 0;
+}
+
+static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov,
+ unsigned int *in_iovs)
+{
+ struct fsverity_enable_arg enable;
+ struct fsverity_enable_arg __user *uarg = (void __user *)arg;
+ const __u32 max_buffer_len = FUSE_VERITY_ENABLE_ARG_MAX_PAGES * PAGE_SIZE;
+
+ if (copy_from_user(&enable, uarg, sizeof(enable)))
+ return -EFAULT;
+
+ if (enable.salt_size > max_buffer_len || enable.sig_size > max_buffer_len)
+ return -ENOMEM;
+
+ if (enable.salt_size > 0) {
+ iov++;
+ (*in_iovs)++;
+
+ iov->iov_base = u64_to_user_ptr(enable.salt_ptr);
+ iov->iov_len = enable.salt_size;
+ }
+
+ if (enable.sig_size > 0) {
+ iov++;
+ (*in_iovs)++;
+
+ iov->iov_base = u64_to_user_ptr(enable.sig_ptr);
+ iov->iov_len = enable.sig_size;
+ }
+ return 0;
+}
/*
* For ioctls, there is no generic way to determine how much memory
@@ -201,12 +251,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
+ ap.folios = fuse_folios_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
- if (!ap.pages || !iov_page)
+ if (!ap.folios || !iov_page)
goto out;
- fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
+ fuse_folio_descs_length_init(ap.descs, 0, fm->fc->max_pages);
/*
* If restricted, initialize IO parameters as encoded in @cmd.
@@ -227,6 +277,18 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
out_iov = iov;
out_iovs = 1;
}
+
+ err = 0;
+ switch (cmd) {
+ case FS_IOC_MEASURE_VERITY:
+ err = fuse_setup_measure_verity(arg, iov);
+ break;
+ case FS_IOC_ENABLE_VERITY:
+ err = fuse_setup_enable_verity(arg, iov, &in_iovs);
+ break;
+ }
+ if (err)
+ goto out;
}
retry:
@@ -244,14 +306,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
err = -ENOMEM;
if (max_pages > fm->fc->max_pages)
goto out;
- while (ap.num_pages < max_pages) {
- ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
- if (!ap.pages[ap.num_pages])
+ while (ap.num_folios < max_pages) {
+ ap.folios[ap.num_folios] = folio_alloc(GFP_KERNEL | __GFP_HIGHMEM, 0);
+ if (!ap.folios[ap.num_folios])
goto out;
- ap.num_pages++;
+ ap.num_folios++;
}
-
/* okay, let's send it to the client */
ap.args.opcode = FUSE_IOCTL;
ap.args.nodeid = ff->nodeid;
@@ -265,8 +326,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
err = -EFAULT;
iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size);
- for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
- c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) {
+ c = copy_folio_from_iter(ap.folios[i], 0, PAGE_SIZE, &ii);
if (c != PAGE_SIZE && iov_iter_count(&ii))
goto out;
}
@@ -304,7 +365,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
goto out;
- vaddr = kmap_local_page(ap.pages[0]);
+ vaddr = kmap_local_folio(ap.folios[0], 0);
err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
transferred, in_iovs + out_iovs,
(flags & FUSE_IOCTL_COMPAT) != 0);
@@ -332,17 +393,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
err = -EFAULT;
iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred);
- for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
- c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) {
+ c = copy_folio_to_iter(ap.folios[i], 0, PAGE_SIZE, &ii);
if (c != PAGE_SIZE && iov_iter_count(&ii))
goto out;
}
err = 0;
out:
free_page((unsigned long) iov_page);
- while (ap.num_pages)
- __free_page(ap.pages[--ap.num_pages]);
- kfree(ap.pages);
+ while (ap.num_folios)
+ folio_put(ap.folios[--ap.num_folios]);
+ kfree(ap.folios);
return err ? err : outarg.result;
}
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index c653ddcf0578..c99e285f3183 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -21,12 +21,13 @@ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi)
}
/*
- * Start cached io mode.
+ * Called on cached file open() and on first mmap() of direct_io file.
+ * Takes cached_io inode mode reference to be dropped on file release.
*
* Blocks new parallel dio writes and waits for the in-progress parallel dio
* writes to complete.
*/
-int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff)
+int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff)
{
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -67,10 +68,9 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff)
return 0;
}
-static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff)
+static void fuse_file_cached_io_release(struct fuse_file *ff,
+ struct fuse_inode *fi)
{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
spin_lock(&fi->lock);
WARN_ON(fi->iocachectr <= 0);
WARN_ON(ff->iomode != IOM_CACHED);
@@ -82,16 +82,15 @@ static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff)
}
/* Start strictly uncached io mode where cache access is not allowed */
-int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb)
+int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb)
{
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_backing *oldfb;
int err = 0;
spin_lock(&fi->lock);
/* deny conflicting backing files on same fuse inode */
oldfb = fuse_inode_backing(fi);
- if (oldfb && oldfb != fb) {
+ if (fb && oldfb && oldfb != fb) {
err = -EBUSY;
goto unlock;
}
@@ -99,12 +98,10 @@ int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struc
err = -ETXTBSY;
goto unlock;
}
- WARN_ON(ff->iomode != IOM_NONE);
fi->iocachectr--;
- ff->iomode = IOM_UNCACHED;
/* fuse inode holds a single refcount of backing file */
- if (!oldfb) {
+ if (fb && !oldfb) {
oldfb = fuse_inode_backing_set(fi, fb);
WARN_ON_ONCE(oldfb != NULL);
} else {
@@ -115,15 +112,29 @@ unlock:
return err;
}
-void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff)
+/* Takes uncached_io inode mode reference to be dropped on file release */
+static int fuse_file_uncached_io_open(struct inode *inode,
+ struct fuse_file *ff,
+ struct fuse_backing *fb)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ int err;
+
+ err = fuse_inode_uncached_io_start(fi, fb);
+ if (err)
+ return err;
+
+ WARN_ON(ff->iomode != IOM_NONE);
+ ff->iomode = IOM_UNCACHED;
+ return 0;
+}
+
+void fuse_inode_uncached_io_end(struct fuse_inode *fi)
+{
struct fuse_backing *oldfb = NULL;
spin_lock(&fi->lock);
WARN_ON(fi->iocachectr >= 0);
- WARN_ON(ff->iomode != IOM_UNCACHED);
- ff->iomode = IOM_NONE;
fi->iocachectr++;
if (!fi->iocachectr) {
wake_up(&fi->direct_io_waitq);
@@ -134,6 +145,15 @@ void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff)
fuse_backing_put(oldfb);
}
+/* Drop uncached_io reference from passthrough open */
+static void fuse_file_uncached_io_release(struct fuse_file *ff,
+ struct fuse_inode *fi)
+{
+ WARN_ON(ff->iomode != IOM_UNCACHED);
+ ff->iomode = IOM_NONE;
+ fuse_inode_uncached_io_end(fi);
+}
+
/*
* Open flags that are allowed in combination with FOPEN_PASSTHROUGH.
* A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write
@@ -163,7 +183,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
return PTR_ERR(fb);
/* First passthrough file open denies caching inode io mode */
- err = fuse_file_uncached_io_start(inode, ff, fb);
+ err = fuse_file_uncached_io_open(inode, ff, fb);
if (!err)
return 0;
@@ -216,7 +236,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
if (ff->open_flags & FOPEN_PASSTHROUGH)
err = fuse_file_passthrough_open(inode, file);
else
- err = fuse_file_cached_io_start(inode, ff);
+ err = fuse_file_cached_io_open(inode, ff);
if (err)
goto fail;
@@ -236,8 +256,10 @@ fail:
/* No more pending io and no new io possible to inode via open/mmapped file */
void fuse_file_io_release(struct fuse_file *ff, struct inode *inode)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
/*
- * Last parallel dio close allows caching inode io mode.
+ * Last passthrough file close allows caching inode io mode.
* Last caching file close exits caching inode io mode.
*/
switch (ff->iomode) {
@@ -245,10 +267,10 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode)
/* Nothing to do */
break;
case IOM_UNCACHED:
- fuse_file_uncached_io_end(inode, ff);
+ fuse_file_uncached_io_release(ff, fi);
break;
case IOM_CACHED:
- fuse_file_cached_io_end(inode, ff);
+ fuse_file_cached_io_release(ff, fi);
break;
}
}
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 1567f0323858..607ef735ad4a 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -18,11 +18,11 @@ static void fuse_file_accessed(struct file *file)
fuse_invalidate_atime(inode);
}
-static void fuse_file_modified(struct file *file)
+static void fuse_passthrough_end_write(struct kiocb *iocb, ssize_t ret)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = file_inode(iocb->ki_filp);
- fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+ fuse_write_update_attr(inode, iocb->ki_pos, ret);
}
ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -34,7 +34,6 @@ ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t ret;
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = file,
.accessed = fuse_file_accessed,
};
@@ -62,8 +61,7 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb,
ssize_t ret;
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = file,
- .end_write = fuse_file_modified,
+ .end_write = fuse_passthrough_end_write,
};
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
@@ -88,15 +86,20 @@ ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
struct file *backing_file = fuse_file_passthrough(ff);
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = in,
.accessed = fuse_file_accessed,
};
+ struct kiocb iocb;
+ ssize_t ret;
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
- backing_file, ppos ? *ppos : 0, len, flags);
+ backing_file, *ppos, len, flags);
+
+ init_sync_kiocb(&iocb, in);
+ iocb.ki_pos = *ppos;
+ ret = backing_file_splice_read(backing_file, &iocb, pipe, len, flags, &ctx);
+ *ppos = iocb.ki_pos;
- return backing_file_splice_read(backing_file, ppos, pipe, len, flags,
- &ctx);
+ return ret;
}
ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
@@ -109,16 +112,18 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
ssize_t ret;
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = out,
- .end_write = fuse_file_modified,
+ .end_write = fuse_passthrough_end_write,
};
+ struct kiocb iocb;
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
- backing_file, ppos ? *ppos : 0, len, flags);
+ backing_file, *ppos, len, flags);
inode_lock(inode);
- ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags,
- &ctx);
+ init_sync_kiocb(&iocb, out);
+ iocb.ki_pos = *ppos;
+ ret = backing_file_splice_write(pipe, backing_file, &iocb, len, flags, &ctx);
+ *ppos = iocb.ki_pos;
inode_unlock(inode);
return ret;
@@ -130,7 +135,6 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
struct file *backing_file = fuse_file_passthrough(ff);
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = file,
.accessed = fuse_file_accessed,
};
@@ -225,18 +229,14 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
goto out;
res = -EINVAL;
- if (map->flags)
+ if (map->flags || map->padding)
goto out;
- file = fget(map->fd);
+ file = fget_raw(map->fd);
res = -EBADF;
if (!file)
goto out;
- res = -EOPNOTSUPP;
- if (!file->f_op->read_iter || !file->f_op->write_iter)
- goto out_fput;
-
backing_sb = file_inode(file)->i_sb;
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 0377b6dc24c8..edcd6f18a8a8 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -120,7 +120,7 @@ static bool fuse_emit(struct file *file, struct dir_context *ctx,
fuse_add_dirent_to_cache(file, dirent, ctx->pos);
return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino,
- dirent->type);
+ dirent->type | FILLDIR_FLAG_NOINTR);
}
static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
@@ -149,7 +149,7 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
static int fuse_direntplus_link(struct file *file,
struct fuse_direntplus *direntplus,
- u64 attr_version)
+ u64 attr_version, u64 evict_ctr)
{
struct fuse_entry_out *o = &direntplus->entry_out;
struct fuse_dirent *dirent = &direntplus->dirent;
@@ -233,7 +233,7 @@ retry:
} else {
inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
&o->attr, ATTR_TIMEOUT(o),
- attr_version);
+ attr_version, evict_ctr);
if (!inode)
inode = ERR_PTR(-ENOMEM);
@@ -284,7 +284,8 @@ static void fuse_force_forget(struct file *file, u64 nodeid)
}
static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
- struct dir_context *ctx, u64 attr_version)
+ struct dir_context *ctx, u64 attr_version,
+ u64 evict_ctr)
{
struct fuse_direntplus *direntplus;
struct fuse_dirent *dirent;
@@ -319,7 +320,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
buf += reclen;
nbytes -= reclen;
- ret = fuse_direntplus_link(file, direntplus, attr_version);
+ ret = fuse_direntplus_link(file, direntplus, attr_version, evict_ctr);
if (ret)
fuse_force_forget(file, direntplus->entry_out.nodeid);
}
@@ -331,26 +332,27 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
{
int plus;
ssize_t res;
- struct page *page;
+ struct folio *folio;
struct inode *inode = file_inode(file);
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
- struct fuse_page_desc desc = { .length = PAGE_SIZE };
- u64 attr_version = 0;
+ struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+ u64 attr_version = 0, evict_ctr = 0;
bool locked;
- page = alloc_page(GFP_KERNEL);
- if (!page)
+ folio = folio_alloc(GFP_KERNEL, 0);
+ if (!folio)
return -ENOMEM;
plus = fuse_use_readdirplus(inode, ctx);
ap->args.out_pages = true;
- ap->num_pages = 1;
- ap->pages = &page;
+ ap->num_folios = 1;
+ ap->folios = &folio;
ap->descs = &desc;
if (plus) {
attr_version = fuse_get_attr_version(fm->fc);
+ evict_ctr = fuse_get_evict_ctr(fm->fc);
fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
FUSE_READDIRPLUS);
} else {
@@ -367,15 +369,16 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
if (ff->open_flags & FOPEN_CACHE_DIR)
fuse_readdir_cache_end(file, ctx->pos);
} else if (plus) {
- res = parse_dirplusfile(page_address(page), res,
- file, ctx, attr_version);
+ res = parse_dirplusfile(folio_address(folio), res,
+ file, ctx, attr_version,
+ evict_ctr);
} else {
- res = parse_dirfile(page_address(page), res, file,
+ res = parse_dirfile(folio_address(folio), res, file,
ctx);
}
}
- __free_page(page);
+ folio_put(folio);
fuse_invalidate_atime(inode);
return res;
}
@@ -416,7 +419,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
if (ff->readdir.pos == ctx->pos) {
res = FOUND_SOME;
if (!dir_emit(ctx, dirent->name, dirent->namelen,
- dirent->ino, dirent->type))
+ dirent->ino, dirent->type | FILLDIR_FLAG_NOINTR))
return FOUND_ALL;
ctx->pos = dirent->off;
}
diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c
new file mode 100644
index 000000000000..e2d921abcb88
--- /dev/null
+++ b/fs/fuse/sysctl.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/fuse/fuse_sysctl.c
+ *
+ * Sysctl interface to fuse parameters
+ */
+#include <linux/sysctl.h>
+
+#include "fuse_i.h"
+
+static struct ctl_table_header *fuse_table_header;
+
+/* Bound by fuse_init_out max_pages, which is a u16 */
+static unsigned int sysctl_fuse_max_pages_limit = 65535;
+
+/*
+ * fuse_init_out request timeouts are u16.
+ * This goes up to ~18 hours, which is plenty for a timeout.
+ */
+static unsigned int sysctl_fuse_req_timeout_limit = 65535;
+
+static const struct ctl_table fuse_sysctl_table[] = {
+ {
+ .procname = "max_pages_limit",
+ .data = &fuse_max_pages_limit,
+ .maxlen = sizeof(fuse_max_pages_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &sysctl_fuse_max_pages_limit,
+ },
+ {
+ .procname = "default_request_timeout",
+ .data = &fuse_default_req_timeout,
+ .maxlen = sizeof(fuse_default_req_timeout),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &sysctl_fuse_req_timeout_limit,
+ },
+ {
+ .procname = "max_request_timeout",
+ .data = &fuse_max_req_timeout,
+ .maxlen = sizeof(fuse_max_req_timeout),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &sysctl_fuse_req_timeout_limit,
+ },
+};
+
+int fuse_sysctl_register(void)
+{
+ fuse_table_header = register_sysctl("fs/fuse", fuse_sysctl_table);
+ if (!fuse_table_header)
+ return -ENOMEM;
+ return 0;
+}
+
+void fuse_sysctl_unregister(void)
+{
+ unregister_sysctl_table(fuse_table_header);
+ fuse_table_header = NULL;
+}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 322af827a232..53c2626e90e7 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -7,6 +7,8 @@
#include <linux/fs.h>
#include <linux/dax.h>
#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/group_cpus.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/module.h>
@@ -49,17 +51,19 @@ struct virtio_fs_vq {
struct work_struct done_work;
struct list_head queued_reqs;
struct list_head end_reqs; /* End these requests */
- struct delayed_work dispatch_work;
+ struct work_struct dispatch_work;
struct fuse_dev *fud;
bool connected;
long in_flight;
struct completion in_flight_zero; /* No inflight requests */
+ struct kobject *kobj;
char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;
/* A virtio-fs device instance */
struct virtio_fs {
struct kobject kobj;
+ struct kobject *mqs_kobj;
struct list_head list; /* on virtio_fs_instances */
char *tag;
struct virtio_fs_vq *vqs;
@@ -67,6 +71,8 @@ struct virtio_fs {
unsigned int num_request_queues; /* number of request queues */
struct dax_device *dax_dev;
+ unsigned int *mq_map; /* index = cpu id, value = request vq id */
+
/* DAX memory window where file contents are mapped */
void *window_kaddr;
phys_addr_t window_phys_addr;
@@ -91,7 +97,8 @@ struct virtio_fs_req_work {
};
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
- struct fuse_req *req, bool in_flight);
+ struct fuse_req *req, bool in_flight,
+ gfp_t gfp);
static const struct constant_table dax_param_enums[] = {
{"always", FUSE_DAX_ALWAYS },
@@ -170,7 +177,7 @@ static ssize_t tag_show(struct kobject *kobj,
{
struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
- return sysfs_emit(buf, fs->tag);
+ return sysfs_emit(buf, "%s\n", fs->tag);
}
static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag);
@@ -185,6 +192,7 @@ static void virtio_fs_ktype_release(struct kobject *kobj)
{
struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);
+ kfree(vfs->mq_map);
kfree(vfs->vqs);
kfree(vfs);
}
@@ -195,19 +203,94 @@ static const struct kobj_type virtio_fs_ktype = {
.default_groups = virtio_fs_groups,
};
+static struct virtio_fs_vq *virtio_fs_kobj_to_vq(struct virtio_fs *fs,
+ struct kobject *kobj)
+{
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ if (kobj == fs->vqs[i].kobj)
+ return &fs->vqs[i];
+ }
+ return NULL;
+}
+
+static ssize_t name_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj);
+ struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj);
+
+ if (!fsvq)
+ return -EINVAL;
+ return sysfs_emit(buf, "%s\n", fsvq->name);
+}
+
+static struct kobj_attribute virtio_fs_vq_name_attr = __ATTR_RO(name);
+
+static ssize_t cpu_list_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj);
+ struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj);
+ unsigned int cpu, qid;
+ const size_t size = PAGE_SIZE - 1;
+ bool first = true;
+ int ret = 0, pos = 0;
+
+ if (!fsvq)
+ return -EINVAL;
+
+ qid = fsvq->vq->index;
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (qid < VQ_REQUEST || (fs->mq_map[cpu] == qid)) {
+ if (first)
+ ret = snprintf(buf + pos, size - pos, "%u", cpu);
+ else
+ ret = snprintf(buf + pos, size - pos, ", %u", cpu);
+
+ if (ret >= size - pos)
+ break;
+ first = false;
+ pos += ret;
+ }
+ }
+ ret = snprintf(buf + pos, size + 1 - pos, "\n");
+ return pos + ret;
+}
+
+static struct kobj_attribute virtio_fs_vq_cpu_list_attr = __ATTR_RO(cpu_list);
+
+static struct attribute *virtio_fs_vq_attrs[] = {
+ &virtio_fs_vq_name_attr.attr,
+ &virtio_fs_vq_cpu_list_attr.attr,
+ NULL
+};
+
+static struct attribute_group virtio_fs_vq_attr_group = {
+ .attrs = virtio_fs_vq_attrs,
+};
+
/* Make sure virtiofs_mutex is held */
-static void virtio_fs_put(struct virtio_fs *fs)
+static void virtio_fs_put_locked(struct virtio_fs *fs)
{
+ lockdep_assert_held(&virtio_fs_mutex);
+
kobject_put(&fs->kobj);
}
+static void virtio_fs_put(struct virtio_fs *fs)
+{
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put_locked(fs);
+ mutex_unlock(&virtio_fs_mutex);
+}
+
static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
{
struct virtio_fs *vfs = fiq->priv;
- mutex_lock(&virtio_fs_mutex);
virtio_fs_put(vfs);
- mutex_unlock(&virtio_fs_mutex);
}
static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
@@ -228,7 +311,7 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
}
flush_work(&fsvq->done_work);
- flush_delayed_work(&fsvq->dispatch_work);
+ flush_work(&fsvq->dispatch_work);
}
static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs)
@@ -268,6 +351,50 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs)
}
}
+static void virtio_fs_delete_queues_sysfs(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+ kobject_put(fsvq->kobj);
+ }
+}
+
+static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ char buff[12];
+ int i, j, ret;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+
+ sprintf(buff, "%d", i);
+ fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj);
+ if (!fs->mqs_kobj) {
+ ret = -ENOMEM;
+ goto out_del;
+ }
+
+ ret = sysfs_create_group(fsvq->kobj, &virtio_fs_vq_attr_group);
+ if (ret) {
+ kobject_put(fsvq->kobj);
+ goto out_del;
+ }
+ }
+
+ return 0;
+
+out_del:
+ for (j = 0; j < i; j++) {
+ fsvq = &fs->vqs[j];
+ kobject_put(fsvq->kobj);
+ }
+ return ret;
+}
+
/* Add a new instance to the list or return -EEXIST if tag name exists*/
static int virtio_fs_add_instance(struct virtio_device *vdev,
struct virtio_fs *fs)
@@ -291,17 +418,22 @@ static int virtio_fs_add_instance(struct virtio_device *vdev,
*/
fs->kobj.kset = virtio_fs_kset;
ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index);
- if (ret < 0) {
- mutex_unlock(&virtio_fs_mutex);
- return ret;
+ if (ret < 0)
+ goto out_unlock;
+
+ fs->mqs_kobj = kobject_create_and_add("mqs", &fs->kobj);
+ if (!fs->mqs_kobj) {
+ ret = -ENOMEM;
+ goto out_del;
}
ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device");
- if (ret < 0) {
- kobject_del(&fs->kobj);
- mutex_unlock(&virtio_fs_mutex);
- return ret;
- }
+ if (ret < 0)
+ goto out_put;
+
+ ret = virtio_fs_add_queues_sysfs(fs);
+ if (ret)
+ goto out_remove;
list_add_tail(&fs->list, &virtio_fs_instances);
@@ -310,6 +442,16 @@ static int virtio_fs_add_instance(struct virtio_device *vdev,
kobject_uevent(&fs->kobj, KOBJ_ADD);
return 0;
+
+out_remove:
+ sysfs_remove_link(&fs->kobj, "device");
+out_put:
+ kobject_put(fs->mqs_kobj);
+out_del:
+ kobject_del(&fs->kobj);
+out_unlock:
+ mutex_unlock(&virtio_fs_mutex);
+ return ret;
}
/* Return the virtio_fs with a given tag, or NULL */
@@ -380,6 +522,7 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
return -EINVAL;
}
+ dev_info(&vdev->dev, "discovered new tag: %s\n", fs->tag);
return 0;
}
@@ -403,6 +546,10 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work)
dec_in_flight_req(fsvq);
}
} while (!virtqueue_enable_cb(vq));
+
+ if (!list_empty(&fsvq->queued_reqs))
+ schedule_work(&fsvq->dispatch_work);
+
spin_unlock(&fsvq->lock);
}
@@ -410,7 +557,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
{
struct fuse_req *req;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
- dispatch_work.work);
+ dispatch_work);
int ret;
pr_debug("virtio-fs: worker %s called.\n", __func__);
@@ -430,6 +577,8 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
/* Dispatch pending requests */
while (1) {
+ unsigned int flags;
+
spin_lock(&fsvq->lock);
req = list_first_entry_or_null(&fsvq->queued_reqs,
struct fuse_req, list);
@@ -440,13 +589,13 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
list_del_init(&req->list);
spin_unlock(&fsvq->lock);
- ret = virtio_fs_enqueue_req(fsvq, req, true);
+ flags = memalloc_nofs_save();
+ ret = virtio_fs_enqueue_req(fsvq, req, true, GFP_KERNEL);
+ memalloc_nofs_restore(flags);
if (ret < 0) {
- if (ret == -ENOMEM || ret == -ENOSPC) {
+ if (ret == -ENOSPC) {
spin_lock(&fsvq->lock);
list_add_tail(&req->list, &fsvq->queued_reqs);
- schedule_delayed_work(&fsvq->dispatch_work,
- msecs_to_jiffies(1));
spin_unlock(&fsvq->lock);
return;
}
@@ -489,12 +638,10 @@ static int send_forget_request(struct virtio_fs_vq *fsvq,
ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC);
if (ret < 0) {
- if (ret == -ENOMEM || ret == -ENOSPC) {
+ if (ret == -ENOSPC) {
pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
ret);
list_add_tail(&forget->list, &fsvq->queued_reqs);
- schedule_delayed_work(&fsvq->dispatch_work,
- msecs_to_jiffies(1));
if (!in_flight)
inc_in_flight_req(fsvq);
/* Queue is full */
@@ -526,7 +673,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
{
struct virtio_fs_forget *forget;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
- dispatch_work.work);
+ dispatch_work);
pr_debug("virtio-fs: worker %s called.\n", __func__);
while (1) {
spin_lock(&fsvq->lock);
@@ -545,7 +692,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
}
/* Allocate and copy args into req->argbuf */
-static int copy_args_to_argbuf(struct fuse_req *req)
+static int copy_args_to_argbuf(struct fuse_req *req, gfp_t gfp)
{
struct fuse_args *args = req->args;
unsigned int offset = 0;
@@ -559,7 +706,7 @@ static int copy_args_to_argbuf(struct fuse_req *req)
len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) +
fuse_len_args(num_out, args->out_args);
- req->argbuf = kmalloc(len, GFP_ATOMIC);
+ req->argbuf = kmalloc(len, gfp);
if (!req->argbuf)
return -ENOMEM;
@@ -619,7 +766,7 @@ static void virtio_fs_request_complete(struct fuse_req *req,
struct fuse_args *args;
struct fuse_args_pages *ap;
unsigned int len, i, thislen;
- struct page *page;
+ struct folio *folio;
/*
* TODO verify that server properly follows FUSE protocol
@@ -631,12 +778,12 @@ static void virtio_fs_request_complete(struct fuse_req *req,
if (args->out_pages && args->page_zeroing) {
len = args->out_args[args->out_numargs - 1].size;
ap = container_of(args, typeof(*ap), args);
- for (i = 0; i < ap->num_pages; i++) {
+ for (i = 0; i < ap->num_folios; i++) {
thislen = ap->descs[i].length;
if (len < thislen) {
WARN_ON(ap->descs[i].offset);
- page = ap->pages[i];
- zero_user_segment(page, len, thislen);
+ folio = ap->folios[i];
+ folio_zero_segment(folio, len, thislen);
len = 0;
} else {
len -= thislen;
@@ -704,6 +851,50 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
virtio_fs_request_complete(req, fsvq);
}
}
+
+ /* Try to push previously queued requests, as the queue might no longer be full */
+ spin_lock(&fsvq->lock);
+ if (!list_empty(&fsvq->queued_reqs))
+ schedule_work(&fsvq->dispatch_work);
+ spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ const struct cpumask *mask, *masks;
+ unsigned int q, cpu;
+
+ /* First attempt to map using existing transport layer affinities
+ * e.g. PCIe MSI-X
+ */
+ if (!vdev->config->get_vq_affinity)
+ goto fallback;
+
+ for (q = 0; q < fs->num_request_queues; q++) {
+ mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q);
+ if (!mask)
+ goto fallback;
+
+ for_each_cpu(cpu, mask)
+ fs->mq_map[cpu] = q + VQ_REQUEST;
+ }
+
+ return;
+fallback:
+ /* Attempt to map evenly in groups over the CPUs */
+ masks = group_cpus_evenly(fs->num_request_queues);
+ /* If even this fails we default to all CPUs use first request queue */
+ if (!masks) {
+ for_each_possible_cpu(cpu)
+ fs->mq_map[cpu] = VQ_REQUEST;
+ return;
+ }
+
+ for (q = 0; q < fs->num_request_queues; q++) {
+ for_each_cpu(cpu, &masks[q])
+ fs->mq_map[cpu] = q + VQ_REQUEST;
+ }
+ kfree(masks);
}
/* Virtqueue interrupt handler */
@@ -727,12 +918,12 @@ static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
if (vq_type == VQ_REQUEST) {
INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
- INIT_DELAYED_WORK(&fsvq->dispatch_work,
- virtio_fs_request_dispatch_work);
+ INIT_WORK(&fsvq->dispatch_work,
+ virtio_fs_request_dispatch_work);
} else {
INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
- INIT_DELAYED_WORK(&fsvq->dispatch_work,
- virtio_fs_hiprio_dispatch_work);
+ INIT_WORK(&fsvq->dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
}
}
@@ -740,9 +931,13 @@ static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
static int virtio_fs_setup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
{
+ struct virtqueue_info *vqs_info;
struct virtqueue **vqs;
- vq_callback_t **callbacks;
- const char **names;
+ /* Specify pre_vectors to ensure that the queues before the
+ * request queues (e.g. hiprio) don't claim any of the CPUs in
+ * the multi-queue mapping and interrupt affinities
+ */
+ struct irq_affinity desc = { .pre_vectors = VQ_REQUEST };
unsigned int i;
int ret = 0;
@@ -751,24 +946,27 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
if (fs->num_request_queues == 0)
return -EINVAL;
+ /* Truncate nr of request queues to nr_cpu_id */
+ fs->num_request_queues = min_t(unsigned int, fs->num_request_queues,
+ nr_cpu_ids);
fs->nvqs = VQ_REQUEST + fs->num_request_queues;
fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
if (!fs->vqs)
return -ENOMEM;
vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
- callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
- GFP_KERNEL);
- names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
- if (!vqs || !callbacks || !names) {
+ fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL,
+ dev_to_node(&vdev->dev));
+ vqs_info = kcalloc(fs->nvqs, sizeof(*vqs_info), GFP_KERNEL);
+ if (!vqs || !vqs_info || !fs->mq_map) {
ret = -ENOMEM;
goto out;
}
/* Initialize the hiprio/forget request virtqueue */
- callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
+ vqs_info[VQ_HIPRIO].callback = virtio_fs_vq_done;
virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
- names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
+ vqs_info[VQ_HIPRIO].name = fs->vqs[VQ_HIPRIO].name;
/* Initialize the requests virtqueues */
for (i = VQ_REQUEST; i < fs->nvqs; i++) {
@@ -776,11 +974,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
- callbacks[i] = virtio_fs_vq_done;
- names[i] = fs->vqs[i].name;
+ vqs_info[i].callback = virtio_fs_vq_done;
+ vqs_info[i].name = fs->vqs[i].name;
}
- ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, vqs_info, &desc);
if (ret < 0)
goto out;
@@ -789,11 +987,12 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
virtio_fs_start_all_queues(fs);
out:
- kfree(names);
- kfree(callbacks);
+ kfree(vqs_info);
kfree(vqs);
- if (ret)
+ if (ret) {
kfree(fs->vqs);
+ kfree(fs->mq_map);
+ }
return ret;
}
@@ -818,8 +1017,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
- PFN_DEV | PFN_MAP);
+ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0);
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
@@ -939,7 +1137,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
if (ret < 0)
goto out;
- /* TODO vq affinity */
+ virtio_fs_map_queues(vdev, fs);
ret = virtio_fs_setup_dax(vdev, fs);
if (ret < 0)
@@ -986,7 +1184,9 @@ static void virtio_fs_remove(struct virtio_device *vdev)
mutex_lock(&virtio_fs_mutex);
/* This device is going away. No one should get new reference */
list_del_init(&fs->list);
+ virtio_fs_delete_queues_sysfs(fs);
sysfs_remove_link(&fs->kobj, "device");
+ kobject_put(fs->mqs_kobj);
kobject_del(&fs->kobj);
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
@@ -995,7 +1195,7 @@ static void virtio_fs_remove(struct virtio_device *vdev)
vdev->priv = NULL;
/* Put device reference on virtio_fs object */
- virtio_fs_put(fs);
+ virtio_fs_put_locked(fs);
mutex_unlock(&virtio_fs_mutex);
}
@@ -1023,7 +1223,6 @@ static const unsigned int feature_table[] = {};
static struct virtio_driver virtio_fs_driver = {
.driver.name = KBUILD_MODNAME,
- .driver.owner = THIS_MODULE,
.id_table = id_table,
.feature_table = feature_table,
.feature_table_size = ARRAY_SIZE(feature_table),
@@ -1035,22 +1234,13 @@ static struct virtio_driver virtio_fs_driver = {
#endif
};
-static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *link)
{
- struct fuse_forget_link *link;
struct virtio_fs_forget *forget;
struct virtio_fs_forget_req *req;
- struct virtio_fs *fs;
- struct virtio_fs_vq *fsvq;
- u64 unique;
-
- link = fuse_dequeue_forget(fiq, 1, NULL);
- unique = fuse_get_unique(fiq);
-
- fs = fiq->priv;
- fsvq = &fs->vqs[VQ_HIPRIO];
- spin_unlock(&fiq->lock);
+ struct virtio_fs *fs = fiq->priv;
+ struct virtio_fs_vq *fsvq = &fs->vqs[VQ_HIPRIO];
+ u64 unique = fuse_get_unique(fiq);
/* Allocate a buffer for the request */
forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
@@ -1070,8 +1260,7 @@ __releases(fiq->lock)
kfree(link);
}
-static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
/*
* TODO interrupts.
@@ -1080,19 +1269,18 @@ __releases(fiq->lock)
* Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
* with shared lock between host and guest.
*/
- spin_unlock(&fiq->lock);
}
/* Count number of scatter-gather elements required */
-static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs,
- unsigned int num_pages,
- unsigned int total_len)
+static unsigned int sg_count_fuse_folios(struct fuse_folio_desc *folio_descs,
+ unsigned int num_folios,
+ unsigned int total_len)
{
unsigned int i;
unsigned int this_len;
- for (i = 0; i < num_pages && total_len; i++) {
- this_len = min(page_descs[i].length, total_len);
+ for (i = 0; i < num_folios && total_len; i++) {
+ this_len = min(folio_descs[i].length, total_len);
total_len -= this_len;
}
@@ -1111,8 +1299,8 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req)
if (args->in_pages) {
size = args->in_args[args->in_numargs - 1].size;
- total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
- size);
+ total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios,
+ size);
}
if (!test_bit(FR_ISREPLY, &req->flags))
@@ -1125,27 +1313,27 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req)
if (args->out_pages) {
size = args->out_args[args->out_numargs - 1].size;
- total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
- size);
+ total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios,
+ size);
}
return total_sgs;
}
-/* Add pages to scatter-gather list and return number of elements used */
-static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
- struct page **pages,
- struct fuse_page_desc *page_descs,
- unsigned int num_pages,
- unsigned int total_len)
+/* Add folios to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_folios(struct scatterlist *sg,
+ struct folio **folios,
+ struct fuse_folio_desc *folio_descs,
+ unsigned int num_folios,
+ unsigned int total_len)
{
unsigned int i;
unsigned int this_len;
- for (i = 0; i < num_pages && total_len; i++) {
+ for (i = 0; i < num_folios && total_len; i++) {
sg_init_table(&sg[i], 1);
- this_len = min(page_descs[i].length, total_len);
- sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
+ this_len = min(folio_descs[i].length, total_len);
+ sg_set_folio(&sg[i], folios[i], this_len, folio_descs[i].offset);
total_len -= this_len;
}
@@ -1170,10 +1358,10 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg,
sg_init_one(&sg[total_sgs++], argbuf, len);
if (argpages)
- total_sgs += sg_init_fuse_pages(&sg[total_sgs],
- ap->pages, ap->descs,
- ap->num_pages,
- args[numargs - 1].size);
+ total_sgs += sg_init_fuse_folios(&sg[total_sgs],
+ ap->folios, ap->descs,
+ ap->num_folios,
+ args[numargs - 1].size);
if (len_used)
*len_used = len;
@@ -1183,7 +1371,8 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg,
/* Add a request to a virtqueue and kick the device */
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
- struct fuse_req *req, bool in_flight)
+ struct fuse_req *req, bool in_flight,
+ gfp_t gfp)
{
/* requests need at least 4 elements */
struct scatterlist *stack_sgs[6];
@@ -1204,8 +1393,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
/* Does the sglist fit on the stack? */
total_sgs = sg_count_fuse_req(req);
if (total_sgs > ARRAY_SIZE(stack_sgs)) {
- sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
- sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
+ sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp);
+ sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp);
if (!sgs || !sg) {
ret = -ENOMEM;
goto out;
@@ -1213,7 +1402,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
}
/* Use a bounce buffer since stack args cannot be mapped */
- ret = copy_args_to_argbuf(req);
+ ret = copy_args_to_argbuf(req, gfp);
if (ret < 0)
goto out;
@@ -1285,33 +1474,31 @@ out:
return ret;
}
-static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req)
{
- unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+ unsigned int queue_id;
struct virtio_fs *fs;
- struct fuse_req *req;
struct virtio_fs_vq *fsvq;
int ret;
- WARN_ON(list_empty(&fiq->pending));
- req = list_last_entry(&fiq->pending, struct fuse_req, list);
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique(fiq);
+
clear_bit(FR_PENDING, &req->flags);
- list_del_init(&req->list);
- WARN_ON(!list_empty(&fiq->pending));
- spin_unlock(&fiq->lock);
fs = fiq->priv;
+ queue_id = fs->mq_map[raw_smp_processor_id()];
- pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
- __func__, req->in.h.opcode, req->in.h.unique,
+ pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n",
+ __func__, req->in.h.opcode, req->in.h.unique,
req->in.h.nodeid, req->in.h.len,
- fuse_len_args(req->args->out_numargs, req->args->out_args));
+ fuse_len_args(req->args->out_numargs, req->args->out_args),
+ queue_id);
fsvq = &fs->vqs[queue_id];
- ret = virtio_fs_enqueue_req(fsvq, req, false);
+ ret = virtio_fs_enqueue_req(fsvq, req, false, GFP_ATOMIC);
if (ret < 0) {
- if (ret == -ENOMEM || ret == -ENOSPC) {
+ if (ret == -ENOSPC) {
/*
* Virtqueue full. Retry submission from worker
* context as we might be holding fc->bg_lock.
@@ -1319,8 +1506,6 @@ __releases(fiq->lock)
spin_lock(&fsvq->lock);
list_add_tail(&req->list, &fsvq->queued_reqs);
inc_in_flight_req(fsvq);
- schedule_delayed_work(&fsvq->dispatch_work,
- msecs_to_jiffies(1));
spin_unlock(&fsvq->lock);
return;
}
@@ -1330,17 +1515,17 @@ __releases(fiq->lock)
/* Can't end request in submission context. Use a worker */
spin_lock(&fsvq->lock);
list_add_tail(&req->list, &fsvq->end_reqs);
- schedule_delayed_work(&fsvq->dispatch_work, 0);
+ schedule_work(&fsvq->dispatch_work);
spin_unlock(&fsvq->lock);
return;
}
}
static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
- .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock,
- .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock,
- .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock,
- .release = virtio_fs_fiq_release,
+ .send_forget = virtio_fs_send_forget,
+ .send_interrupt = virtio_fs_send_interrupt,
+ .send_req = virtio_fs_send_req,
+ .release = virtio_fs_fiq_release,
};
static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
@@ -1484,6 +1669,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
unsigned int virtqueue_size;
int err = -EIO;
+ if (!fsc->source)
+ return invalf(fsc, "No source specified");
+
/* This gets a reference on virtio_fs object. This ptr gets installed
* in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
* to drop the reference to this object.
@@ -1512,6 +1700,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
fc->delete_stale = true;
fc->auto_submounts = true;
fc->sync_fs = true;
+ fc->use_pages_for_kvec_io = true;
/* Tell FUSE to split requests that exceed the virtqueue's size */
fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,
@@ -1540,9 +1729,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
out_err:
kfree(fc);
- mutex_lock(&virtio_fs_mutex);
virtio_fs_put(fs);
- mutex_unlock(&virtio_fs_mutex);
return err;
}
@@ -1572,6 +1759,7 @@ static struct file_system_type virtio_fs_type = {
.name = "virtiofs",
.init_fs_context = virtio_fs_init_fs_context,
.kill_sb = virtio_kill_sb,
+ .fs_flags = FS_ALLOW_IDMAP,
};
static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 5b423fdbb13f..93dfb06b6cea 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -81,7 +81,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
}
ret = fuse_simple_request(fm, &args);
if (!ret && !size)
- ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
+ ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX);
if (ret == -ENOSYS) {
fm->fc->no_getxattr = 1;
ret = -EOPNOTSUPP;
@@ -143,7 +143,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
}
ret = fuse_simple_request(fm, &args);
if (!ret && !size)
- ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
+ ret = min_t(size_t, outarg.size, XATTR_LIST_MAX);
if (ret > 0 && size)
ret = fuse_verify_xattr_list(list, ret);
if (ret == -ENOSYS) {
@@ -164,9 +164,10 @@ int fuse_removexattr(struct inode *inode, const char *name)
args.opcode = FUSE_REMOVEXATTR;
args.nodeid = get_node_id(inode);
- args.in_numargs = 1;
- args.in_args[0].size = strlen(name) + 1;
- args.in_args[0].value = name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = strlen(name) + 1;
+ args.in_args[1].value = name;
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fm->fc->no_removexattr = 1;