Diffstat (limited to 'fs/fuse/file.c')
 -rw-r--r--   fs/fuse/file.c   3424
 1 file changed, 1736 insertions(+), 1688 deletions(-)
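Editor's note: the bulk of the churn below is mechanical. Every request that used to be built in a separately allocated struct fuse_req (req->in.h.opcode, fuse_request_send(), fuse_put_request()) is now expressed as an on-stack struct fuse_args handed to fuse_simple_request() on a struct fuse_mount. The following condensed sketch of the new calling convention is modelled on the fuse_fsync_common() hunk in this diff; the wrapper name example_fsync is invented for this note, the fields and the fuse_simple_request() call are taken directly from the hunks below, and the snippet is a kernel-context excerpt rather than a standalone module.

	/*
	 * Condensed illustration of the post-conversion request pattern
	 * used throughout this diff (compare with fuse_fsync_common()
	 * below): fill an on-stack struct fuse_args and send it
	 * synchronously with fuse_simple_request().
	 */
	static int example_fsync(struct fuse_mount *fm, struct fuse_file *ff,
				 u64 nodeid, int datasync)
	{
		struct fuse_fsync_in inarg;
		FUSE_ARGS(args);	/* on-stack request descriptor */

		memset(&inarg, 0, sizeof(inarg));
		inarg.fh = ff->fh;
		inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;

		/* Opcode, nodeid and argument layout now live in fuse_args
		 * instead of a separately allocated fuse_req. */
		args.opcode = FUSE_FSYNC;
		args.nodeid = nodeid;
		args.in_numargs = 1;
		args.in_args[0].size = sizeof(inarg);
		args.in_args[0].value = &inarg;

		/*
		 * Synchronous round trip; returns the request error, e.g.
		 * -ENOSYS if the server does not implement the opcode.
		 */
		return fuse_simple_request(fm, &args);
	}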
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ffaffe18352a..01bc894e9c2b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -14,67 +14,74 @@ #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/module.h> -#include <linux/compat.h> #include <linux/swap.h> #include <linux/falloc.h> #include <linux/uio.h> +#include <linux/fs.h> +#include <linux/filelock.h> +#include <linux/splice.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/iomap.h> -static const struct file_operations fuse_direct_io_file_operations; - -static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, - int opcode, struct fuse_open_out *outargp) +static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, int opcode, + struct fuse_open_out *outargp) { struct fuse_open_in inarg; FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); - inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); - if (!fc->atomic_o_trunc) + inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fm->fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; - args.in.h.opcode = opcode; - args.in.h.nodeid = nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(*outargp); - args.out.args[0].value = outargp; - return fuse_simple_request(fc, &args); + if (fm->fc->handle_killpriv_v2 && + (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) { + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + + args.opcode = opcode; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(*outargp); + args.out_args[0].value = outargp; + + return fuse_simple_request(fm, &args); } -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; - ff->fc = fc; - ff->reserved_req = fuse_request_alloc(0); - if (unlikely(!ff->reserved_req)) { - kfree(ff); - return NULL; + ff->fm = fm; + if (release) { + ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT); + if (!ff->args) { + kfree(ff); + return NULL; + } } INIT_LIST_HEAD(&ff->write_entry); - mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); - spin_lock(&fc->lock); - ff->kh = ++fc->khctr; - spin_unlock(&fc->lock); + ff->kh = atomic64_inc_return(&fm->fc->khctr); return ff; } void fuse_file_free(struct fuse_file *ff) { - fuse_request_free(ff->reserved_req); - mutex_destroy(&ff->readdir.lock); + kfree(ff->args); kfree(ff); } @@ -84,65 +91,86 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, + int error) { - iput(req->misc.release.inode); + struct fuse_release_args *ra = container_of(args, typeof(*ra), args); + + iput(ra->inode); + kfree(ra); } -static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_req *req = ff->reserved_req; + struct fuse_release_args *ra = &ff->args->release_args; + struct fuse_args 
*args = (ra ? &ra->args : NULL); - if (ff->fc->no_open && !isdir) { - /* - * Drop the release request when client does not - * implement 'open' - */ - __clear_bit(FR_BACKGROUND, &req->flags); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + if (ra && ra->inode) + fuse_file_io_release(ff, ra->inode); + + if (!args) { + /* Do nothing when server does not implement 'opendir' */ + } else if (args->opcode == FUSE_RELEASE && ff->fm->fc->no_open) { + fuse_release_end(ff->fm, args, 0); } else if (sync) { - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(ff->fc, req); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + fuse_simple_request(ff->fm, args); + fuse_release_end(ff->fm, args, 0); } else { - req->end = fuse_release_end; - __set_bit(FR_BACKGROUND, &req->flags); - fuse_request_send_background(ff->fc, req); + args->end = fuse_release_end; + if (fuse_simple_background(ff->fm, args, + GFP_KERNEL | __GFP_NOFAIL)) + fuse_release_end(ff->fm, args, -ENOTCONN); } kfree(ff); } } -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, - bool isdir) +struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, bool isdir) { + struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + bool open = isdir ? !fc->no_opendir : !fc->no_open; + bool release = !isdir || open; - ff = fuse_file_alloc(fc); + /* + * ff->args->release_args still needs to be allocated (so we can hold an + * inode reference while there are pending inflight file operations when + * ->release() is called, see fuse_prepare_release()) even if + * fc->no_open is set else it becomes possible for reclaim to deadlock + * if while servicing the readahead request the server triggers reclaim + * and reclaim evicts the inode of the file being read ahead. + */ + ff = fuse_file_alloc(fm, release); if (!ff) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ff->fh = 0; - ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ - if (!fc->no_open || isdir) { - struct fuse_open_out outarg; + /* Default for no-open */ + ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? 
FOPEN_CACHE_DIR : 0); + if (open) { + /* Store outarg for fuse_finish_open() */ + struct fuse_open_out *outargp = &ff->args->open_outarg; int err; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { - ff->fh = outarg.fh; - ff->open_flags = outarg.open_flags; - - } else if (err != -ENOSYS || isdir) { + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; + } else if (err != -ENOSYS) { fuse_file_free(ff); - return err; + return ERR_PTR(err); } else { - fc->no_open = 1; + if (isdir) { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; + fc->no_opendir = 1; + } else { + fc->no_open = 1; + } } } @@ -150,119 +178,187 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->open_flags &= ~FOPEN_DIRECT_IO; ff->nodeid = nodeid; - file->private_data = ff; - return 0; + return ff; +} + +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, + bool isdir) +{ + struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); + + if (!IS_ERR(ff)) + file->private_data = ff; + + return PTR_ERR_OR_ZERO(ff); } EXPORT_SYMBOL_GPL(fuse_do_open); static void fuse_link_write_file(struct file *file) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; /* * file may be written through mmap, so chain it onto the * inodes's write_file list */ - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (list_empty(&ff->write_entry)) list_add(&ff->write_entry, &fi->write_files); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } -void fuse_finish_open(struct inode *inode, struct file *file) +int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + int err; - if (ff->open_flags & FOPEN_DIRECT_IO) - file->f_op = &fuse_direct_io_file_operations; - if (!(ff->open_flags & FOPEN_KEEP_CACHE)) - invalidate_inode_pages2(inode->i_mapping); - if (ff->open_flags & FOPEN_NONSEEKABLE) + err = fuse_file_io_open(file, inode); + if (err) + return err; + + if (ff->open_flags & FOPEN_STREAM) + stream_open(inode, file); + else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - i_size_write(inode, 0); - spin_unlock(&fc->lock); - fuse_invalidate_attr(inode); - if (fc->writeback_cache) - file_update_time(file); - } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); + + return 0; } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +static void fuse_truncate_update_attr(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + +static int fuse_open(struct inode *inode, struct file *file) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; int err; - bool lock_inode = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && - 
fc->writeback_cache; + bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; + bool is_wb_truncate = is_truncate && fc->writeback_cache; + bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); + + if (fuse_is_bad(inode)) + return -EIO; err = generic_file_open(inode, file); if (err) return err; - if (lock_inode) + if (is_wb_truncate || dax_truncate) inode_lock(inode); - err = fuse_do_open(fc, get_node_id(inode), file, isdir); + if (dax_truncate) { + filemap_invalidate_lock(inode->i_mapping); + err = fuse_dax_break_layouts(inode, 0, -1); + if (err) + goto out_inode_unlock; + } + + if (is_wb_truncate || dax_truncate) + fuse_set_nowrite(inode); - if (!err) - fuse_finish_open(inode, file); + err = fuse_do_open(fm, get_node_id(inode), file, false); + if (!err) { + ff = file->private_data; + err = fuse_finish_open(inode, file); + if (err) + fuse_sync_release(fi, ff, file->f_flags); + else if (is_truncate) + fuse_truncate_update_attr(inode, file); + } - if (lock_inode) + if (is_wb_truncate || dax_truncate) + fuse_release_nowrite(inode); + if (!err) { + if (is_truncate) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } + if (dax_truncate) + filemap_invalidate_unlock(inode->i_mapping); +out_inode_unlock: + if (is_wb_truncate || dax_truncate) inode_unlock(inode); return err; } -static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags, int opcode, bool sync) { - struct fuse_conn *fc = ff->fc; - struct fuse_req *req = ff->reserved_req; - struct fuse_release_in *inarg = &req->misc.release.in; + struct fuse_conn *fc = ff->fm->fc; + struct fuse_release_args *ra = &ff->args->release_args; + if (fuse_file_passthrough(ff)) + fuse_passthrough_release(ff, fuse_inode_backing(fi)); + + /* Inode is NULL on error path of fuse_create_open() */ + if (likely(fi)) { + spin_lock(&fi->lock); + list_del(&ff->write_entry); + spin_unlock(&fi->lock); + } spin_lock(&fc->lock); - list_del(&ff->write_entry); if (!RB_EMPTY_NODE(&ff->polled_node)) rb_erase(&ff->polled_node, &fc->polled_files); spin_unlock(&fc->lock); wake_up_interruptible_all(&ff->poll_wait); - inarg->fh = ff->fh; - inarg->flags = flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_release_in); - req->in.args[0].value = inarg; + if (!ra) + return; + + /* ff->args was used for open outarg */ + memset(ff->args, 0, sizeof(*ff->args)); + ra->inarg.fh = ff->fh; + ra->inarg.flags = flags; + ra->args.in_numargs = 1; + ra->args.in_args[0].size = sizeof(struct fuse_release_in); + ra->args.in_args[0].value = &ra->inarg; + ra->args.opcode = opcode; + ra->args.nodeid = ff->nodeid; + ra->args.force = true; + ra->args.nocreds = true; + + /* + * Hold inode until release is finished. + * From fuse_sync_release() the refcount is 1 and everything's + * synchronous, so we are fine with not doing igrab() here. + */ + ra->inode = sync ? NULL : igrab(&fi->inode); } -void fuse_release_common(struct file *file, bool isdir) +void fuse_file_release(struct inode *inode, struct fuse_file *ff, + unsigned int open_flags, fl_owner_t id, bool isdir) { - struct fuse_file *ff = file->private_data; - struct fuse_req *req = ff->reserved_req; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_release_args *ra = &ff->args->release_args; int opcode = isdir ? 
FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(ff, file->f_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode, false); - if (ff->flock) { - struct fuse_release_in *inarg = &req->misc.release.in; - inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - inarg->lock_owner = fuse_lock_owner_id(ff->fc, - (fl_owner_t) file); + if (ra && ff->flock) { + ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } - /* Hold inode until release is finished */ - req->misc.release.inode = igrab(file_inode(file)); /* * Normally this will send the RELEASE request, however if @@ -272,20 +368,30 @@ void fuse_release_common(struct file *file, bool isdir) * Make the release synchronous if this is a fuseblk mount, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. + * + * Always use the asynchronous file put because the current thread + * might be the fuse server. This can happen if a process starts some + * aio and closes the fd before the aio completes. Since aio takes its + * own ref to the file, the IO completion has to drop the ref, which is + * how the fuse server can end up closing its clients' files. */ - fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); + fuse_file_put(ff, false); } -static int fuse_open(struct inode *inode, struct file *file) +void fuse_release_common(struct file *file, bool isdir) { - return fuse_open_common(inode, file, false); + fuse_file_release(file_inode(file), file->private_data, file->f_flags, + (fl_owner_t) file, isdir); } static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); - /* see fuse_vma_close() for !writeback_cache case */ + /* + * Dirty pages might remain despite write_inode_now() call from + * fuse_flush() due to writes racing with the close. + */ if (fc->writeback_cache) write_inode_now(inode, 1); @@ -295,15 +401,12 @@ static int fuse_release(struct inode *inode, struct file *file) return 0; } -void fuse_sync_release(struct fuse_file *ff, int flags) +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(ff, flags, FUSE_RELEASE); - /* - * iput(NULL) is a no-op and since the refcount is 1 and everything's - * synchronous, we are fine with not doing igrab() here" - */ - fuse_file_put(ff, true, false); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -329,55 +432,12 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } -/* - * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. 
- */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; - bool found = false; - - spin_lock(&fc->lock); - list_for_each_entry(req, &fi->writepages, writepages_entry) { - pgoff_t curr_index; - - BUG_ON(req->inode != inode); - curr_index = req->misc.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + req->num_pages && - curr_index <= idx_to) { - found = true; - break; - } - } - spin_unlock(&fc->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ -static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); - return 0; -} +struct fuse_writepage_args { + struct fuse_io_args ia; + struct list_head queue_entry; + struct inode *inode; + struct fuse_sync_bucket *bucket; +}; /* * Wait for all pending writepages on the inode to finish. @@ -397,47 +457,53 @@ static void fuse_sync_writes(struct inode *inode) static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; - struct fuse_req *req; struct fuse_flush_in inarg; + FUSE_ARGS(args); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; - if (fc->no_flush) + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) return 0; err = write_inode_now(inode, 1); if (err) return err; - inode_lock(inode); - fuse_sync_writes(inode); - inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); if (err) return err; - req = fuse_get_req_nofail_nopages(fc, file); + err = 0; + if (fm->fc->no_flush) + goto inval_attr_out; + memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.lock_owner = fuse_lock_owner_id(fc, id); - req->in.h.opcode = FUSE_FLUSH; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __set_bit(FR_FORCE, &req->flags); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_flush = 1; + fm->fc->no_flush = 1; err = 0; } + +inval_attr_out: + /* + * In memory i_blocks is not maintained by fuse, if writeback cache is + * enabled, i_blocks from cached attr may not be accurate. 
+ */ + if (!err && fm->fc->writeback_cache) + fuse_invalidate_attr_mask(inode, STATX_BLOCKS); return err; } @@ -445,20 +511,20 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_fsync_in inarg; memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.fsync_flags = datasync ? 1 : 0; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - return fuse_simple_request(fc, &args); + inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; + args.opcode = opcode; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + return fuse_simple_request(fm, &args); } static int fuse_fsync(struct file *file, loff_t start, loff_t end, @@ -468,7 +534,7 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; inode_lock(inode); @@ -511,36 +577,40 @@ out: return err; } -void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, - size_t count, int opcode) +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode) { - struct fuse_read_in *inarg = &req->misc.read.in; struct fuse_file *ff = file->private_data; + struct fuse_args *args = &ia->ap.args; - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - inarg->flags = file->f_flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_read_in); - req->in.args[0].value = inarg; - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = count; + ia->read.in.fh = ff->fh; + ia->read.in.offset = pos; + ia->read.in.size = count; + ia->read.in.flags = file->f_flags; + args->opcode = opcode; + args->nodeid = ff->nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(ia->read.in); + args->in_args[0].value = &ia->read.in; + args->out_argvar = true; + args->out_numargs = 1; + args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) +static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, + bool should_dirty) { - unsigned i; + unsigned int i; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + for (i = 0; i < ap->num_folios; i++) { if (should_dirty) - set_page_dirty_lock(page); - put_page(page); + folio_mark_dirty_lock(ap->folios[i]); + if (ap->args.is_pinned) + unpin_folio(ap->folios[i]); } + + if (nres > 0 && ap->args.invalidate_vmap) + invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -559,13 +629,13 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) return io->bytes < 0 ? io->size : io->bytes; } -/** +/* * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. * * An example: - * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * User requested DIO read of 64K. 
It was split into two 32K fuse requests, * both submitted asynchronously. The first of them was ACKed by userspace as * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The * second request was ACKed as short, e.g. only 1K was read, resulting in @@ -598,75 +668,112 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - spin_unlock(&fc->lock); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); } - io->iocb->ki_complete(io->iocb, res, 0); + io->iocb->ki_complete(io->iocb, res); } kref_put(&io->refcnt, fuse_io_release); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, + unsigned int nfolios) { - struct fuse_io_priv *io = req->io; - ssize_t pos = -1; + struct fuse_io_args *ia; + + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (ia) { + ia->io = io; + ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.folios) { + kfree(ia); + ia = NULL; + } + } + return ia; +} + +static void fuse_io_free(struct fuse_io_args *ia) +{ + kfree(ia->ap.folios); + kfree(ia); +} - fuse_release_user_pages(req, io->should_dirty); +static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, + int err) +{ + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_io_priv *io = ia->io; + ssize_t pos = -1; + size_t nres; - if (io->write) { - if (req->misc.write.in.size != req->misc.write.out.size) - pos = req->misc.write.in.offset - io->offset + - req->misc.write.out.size; + if (err) { + /* Nothing */ + } else if (io->write) { + if (ia->write.out.size > ia->write.in.size) { + err = -EIO; + } else { + nres = ia->write.out.size; + if (ia->write.in.size != ia->write.out.size) + pos = ia->write.in.offset - io->offset + + ia->write.out.size; + } } else { - if (req->misc.read.in.size != req->out.args[0].size) - pos = req->misc.read.in.offset - io->offset + - req->out.args[0].size; + u32 outsize = args->out_args[0].size; + + nres = outsize; + if (ia->read.in.size != outsize) + pos = ia->read.in.offset - io->offset + outsize; } - fuse_aio_complete(io, req->out.h.error, pos); + fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); + + fuse_aio_complete(io, err, pos); + fuse_io_free(ia); } -static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, - size_t num_bytes, struct fuse_io_priv *io) +static ssize_t fuse_async_req_send(struct fuse_mount *fm, + struct fuse_io_args *ia, size_t num_bytes) { + ssize_t err; + struct fuse_io_priv *io = ia->io; + spin_lock(&io->lock); kref_get(&io->refcnt); io->size += num_bytes; io->reqs++; spin_unlock(&io->lock); - req->io = io; - req->end = fuse_aio_complete_req; - - __fuse_get_request(req); - fuse_request_send_background(fc, req); + ia->ap.args.end = fuse_aio_complete_req; + ia->ap.args.may_block = io->should_dirty; + err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); + if (err) + fuse_aio_complete_req(fm, &ia->ap.args, err); return num_bytes; } -static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, + fl_owner_t owner) { - struct file *file = io->iocb->ki_filp; + struct 
file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; - fuse_read_fill(req, file, pos, count, FUSE_READ); + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { - struct fuse_read_in *inarg = &req->misc.read.in; - - inarg->read_flags |= FUSE_READ_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; + ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fm, ia, count); - fuse_request_send(fc, req); - return req->out.args[0].size; + return fuse_simple_request(fm, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -675,251 +782,309 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - if (attr_ver == fi->attr_version && size < inode->i_size && + spin_lock(&fi->lock); + if (attr_ver >= fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { - fi->attr_version = ++fc->attr_version; + fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } -static void fuse_short_read(struct fuse_req *req, struct inode *inode, - u64 attr_ver) +static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, + struct fuse_args_pages *ap) { - size_t num_read = req->out.args[0].size; struct fuse_conn *fc = get_fuse_conn(inode); - if (fc->writeback_cache) { - /* - * A hole in a file. Some data after the hole are in page cache, - * but have not reached the client fs yet. So, the hole is not - * present there. - */ - int i; - int start_idx = num_read >> PAGE_SHIFT; - size_t off = num_read & (PAGE_SIZE - 1); - - for (i = start_idx; i < req->num_pages; i++) { - zero_user_segment(req->pages[i], off, PAGE_SIZE); - off = 0; - } - } else { - loff_t pos = page_offset(req->pages[0]) + num_read; + /* + * If writeback_cache is enabled, a short read means there's a hole in + * the file. Some data after the hole is in page cache, but has not + * reached the client fs yet. So the hole is not present there. 
+ */ + if (!fc->writeback_cache) { + loff_t pos = folio_pos(ap->folios[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } -static int fuse_do_readpage(struct file *file, struct page *page) +static int fuse_do_readfolio(struct file *file, struct folio *folio, + size_t off, size_t len) { - struct kiocb iocb; - struct fuse_io_priv io; - struct inode *inode = page->mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - size_t num_read; - loff_t pos = page_offset(page); - size_t count = PAGE_SIZE; + struct inode *inode = folio->mapping->host; + struct fuse_mount *fm = get_fuse_mount(inode); + loff_t pos = folio_pos(folio) + off; + struct fuse_folio_desc desc = { + .offset = off, + .length = len, + }; + struct fuse_io_args ia = { + .ap.args.page_zeroing = true, + .ap.args.out_pages = true, + .ap.num_folios = 1, + .ap.folios = &folio, + .ap.descs = &desc, + }; + ssize_t res; u64 attr_ver; - int err; + attr_ver = fuse_get_attr_version(fm->fc); + + /* Don't overflow end offset */ + if (pos + (desc.length - 1) == LLONG_MAX) + desc.length--; + + fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); + res = fuse_simple_request(fm, &ia.ap.args); + if (res < 0) + return res; /* - * Page writeback can extend beyond the lifetime of the - * page-cache page, so make sure we read a properly synced - * page. + * Short read means EOF. If file size is larger, truncate it */ - fuse_wait_on_page_writeback(inode, page->index); + if (res < desc.length) + fuse_short_read(inode, attr_ver, res, &ia.ap); - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + return 0; +} - attr_ver = fuse_get_attr_version(fc); +static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + iomap->type = IOMAP_MAPPED; + iomap->length = length; + iomap->offset = offset; + return 0; +} - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = count; - init_sync_kiocb(&iocb, file); - io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); - num_read = fuse_send_read(req, &io, pos, count, NULL); - err = req->out.h.error; +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, +}; - if (!err) { - /* - * Short read means EOF. 
If file size is larger, truncate it - */ - if (num_read < count) - fuse_short_read(req, inode, attr_ver); +struct fuse_fill_read_data { + struct file *file; - SetPageUptodate(page); - } + /* Fields below are used if sending the read request asynchronously */ + struct fuse_conn *fc; + struct fuse_io_args *ia; + unsigned int nr_bytes; +}; - fuse_put_request(fc, req); +/* forward declarations */ +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write); +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count, bool async); + +static int fuse_handle_readahead(struct folio *folio, + struct readahead_control *rac, + struct fuse_fill_read_data *data, loff_t pos, + size_t len) +{ + struct fuse_io_args *ia = data->ia; + size_t off = offset_in_folio(folio, pos); + struct fuse_conn *fc = data->fc; + struct fuse_args_pages *ap; + unsigned int nr_pages; + + if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, + false)) { + fuse_send_readpages(ia, data->file, data->nr_bytes, + fc->async_read); + data->nr_bytes = 0; + data->ia = NULL; + ia = NULL; + } + if (!ia) { + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. + */ + return -EAGAIN; - return err; + nr_pages = min(fc->max_pages, readahead_count(rac)); + data->ia = fuse_io_alloc(NULL, nr_pages); + if (!data->ia) + return -ENOMEM; + ia = data->ia; + } + folio_get(folio); + ap = &ia->ap; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = off; + ap->descs[ap->num_folios].length = len; + data->nr_bytes += len; + ap->num_folios++; + + return 0; } -static int fuse_readpage(struct file *file, struct page *page) +static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, + size_t len) { - struct inode *inode = page->mapping->host; - int err; - - err = -EIO; - if (is_bad_inode(inode)) - goto out; + struct fuse_fill_read_data *data = ctx->read_ctx; + struct folio *folio = ctx->cur_folio; + loff_t pos = iter->pos; + size_t off = offset_in_folio(folio, pos); + struct file *file = data->file; + int ret; - err = fuse_do_readpage(file, page); - fuse_invalidate_atime(inode); - out: - unlock_page(page); - return err; + if (ctx->rac) { + ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); + } else { + /* + * for non-readahead read requests, do reads synchronously + * since it's not guaranteed that the server can handle + * out-of-order reads + */ + ret = fuse_do_readfolio(file, folio, off, len); + if (!ret) + iomap_finish_folio_read(folio, off, len, ret); + } + return ret; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) { - int i; - size_t count = req->misc.read.in.size; - size_t num_read = req->out.args[0].size; - struct address_space *mapping = NULL; + struct fuse_fill_read_data *data = ctx->read_ctx; - for (i = 0; mapping == NULL && i < req->num_pages; i++) - mapping = req->pages[i]->mapping; + if (data->ia) + fuse_send_readpages(data->ia, data->file, data->nr_bytes, + data->fc->async_read); +} - if (mapping) { - struct inode *inode = mapping->host; +static const struct iomap_read_ops fuse_iomap_read_ops = { + .read_folio_range = fuse_iomap_read_folio_range_async, + .submit_read = fuse_iomap_read_submit, +}; - /* - * 
Short read means EOF. If file size is larger, truncate it - */ - if (!req->out.h.error && num_read < count) - fuse_short_read(req, inode, req->misc.read.attr_ver); +static int fuse_read_folio(struct file *file, struct folio *folio) +{ + struct inode *inode = folio->mapping->host; + struct fuse_fill_read_data data = { + .file = file, + }; + struct iomap_read_folio_ctx ctx = { + .cur_folio = folio, + .ops = &fuse_iomap_read_ops, + .read_ctx = &data, - fuse_invalidate_atime(inode); - } + }; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - if (!req->out.h.error) - SetPageUptodate(page); - else - SetPageError(page); - unlock_page(page); - put_page(page); + if (fuse_is_bad(inode)) { + folio_unlock(folio); + return -EIO; } - if (req->ff) - fuse_file_put(req->ff, false, false); + + iomap_read_folio(&fuse_iomap_ops, &ctx); + fuse_invalidate_atime(inode); + return 0; } -static void fuse_send_readpages(struct fuse_req *req, struct file *file) +static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, + size_t len) { - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; - loff_t pos = page_offset(req->pages[0]); - size_t count = req->num_pages << PAGE_SHIFT; - - req->out.argpages = 1; - req->out.page_zeroing = 1; - req->out.page_replace = 1; - fuse_read_fill(req, file, pos, count, FUSE_READ); - req->misc.read.attr_ver = fuse_get_attr_version(fc); - if (fc->async_read) { - req->ff = fuse_file_get(ff); - req->end = fuse_readpages_end; - fuse_request_send_background(fc, req); - } else { - fuse_request_send(fc, req); - fuse_readpages_end(fc, req); - fuse_put_request(fc, req); - } + struct file *file = iter->private; + size_t off = offset_in_folio(folio, pos); + + return fuse_do_readfolio(file, folio, off, len); } -struct fuse_fill_data { - struct fuse_req *req; - struct file *file; +static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, + int err) +{ + int i; + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; + size_t count = ia->read.in.size; + size_t num_read = args->out_args[0].size; + struct address_space *mapping; struct inode *inode; - unsigned nr_pages; -}; -static int fuse_readpages_fill(void *_data, struct page *page) -{ - struct fuse_fill_data *data = _data; - struct fuse_req *req = data->req; - struct inode *inode = data->inode; - struct fuse_conn *fc = get_fuse_conn(inode); + WARN_ON_ONCE(!ap->num_folios); + mapping = ap->folios[0]->mapping; + inode = mapping->host; - fuse_wait_on_page_writeback(inode, page->index); - - if (req->num_pages && - (req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_read || - req->pages[req->num_pages - 1]->index + 1 != page->index)) { - unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(req, data->file); - if (fc->async_read) - req = fuse_get_req_for_background(fc, nr_alloc); - else - req = fuse_get_req(fc, nr_alloc); + /* + * Short read means EOF. 
If file size is larger, truncate it + */ + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); - data->req = req; - if (IS_ERR(req)) { - unlock_page(page); - return PTR_ERR(req); - } - } + fuse_invalidate_atime(inode); - if (WARN_ON(req->num_pages >= req->max_pages)) { - unlock_page(page); - fuse_put_request(fc, req); - return -EIO; + for (i = 0; i < ap->num_folios; i++) { + iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset, + ap->descs[i].length, err); + folio_put(ap->folios[i]); } + if (ia->ff) + fuse_file_put(ia->ff, false); - get_page(page); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = PAGE_SIZE; - req->num_pages++; - data->nr_pages--; - return 0; + fuse_io_free(ia); } -static int fuse_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count, bool async) { - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_data data; + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + struct fuse_args_pages *ap = &ia->ap; + loff_t pos = folio_pos(ap->folios[0]); + ssize_t res; int err; - unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); - err = -EIO; - if (is_bad_inode(inode)) - goto out; + ap->args.out_pages = true; + ap->args.page_zeroing = true; + ap->args.page_replace = true; - data.file = file; - data.inode = inode; - if (fc->async_read) - data.req = fuse_get_req_for_background(fc, nr_alloc); - else - data.req = fuse_get_req(fc, nr_alloc); - data.nr_pages = nr_pages; - err = PTR_ERR(data.req); - if (IS_ERR(data.req)) - goto out; + /* Don't overflow end offset */ + if (pos + (count - 1) == LLONG_MAX) { + count--; + ap->descs[ap->num_folios - 1].length--; + } + WARN_ON((loff_t) (pos + count) < 0); - err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) { - if (data.req->num_pages) - fuse_send_readpages(data.req, file); - else - fuse_put_request(fc, data.req); + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); + ia->read.attr_ver = fuse_get_attr_version(fm->fc); + if (async) { + ia->ff = fuse_file_get(ff); + ap->args.end = fuse_readpages_end; + err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); + if (!err) + return; + } else { + res = fuse_simple_request(fm, &ap->args); + err = res < 0 ? 
res : 0; } -out: - return err; + fuse_readpages_end(fm, &ap->args, err); } -static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +static void fuse_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_fill_read_data data = { + .file = rac->file, + .fc = fc, + }; + struct iomap_read_folio_ctx ctx = { + .ops = &fuse_iomap_read_ops, + .rac = rac, + .read_ctx = &data + }; + + if (fuse_is_bad(inode)) + return; + + iomap_readahead(&fuse_iomap_ops, &ctx); +} + +static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); @@ -932,7 +1097,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; - err = fuse_update_attributes(inode, iocb->ki_filp); + err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE); if (err) return err; } @@ -940,168 +1105,217 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } -static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, - loff_t pos, size_t count) +static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, + loff_t pos, size_t count) { - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_write_out *outarg = &req->misc.write.out; + struct fuse_args *args = &ia->ap.args; - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - req->in.h.opcode = FUSE_WRITE; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 2; - if (ff->fc->minor < 9) - req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + ia->write.in.fh = ff->fh; + ia->write.in.offset = pos; + ia->write.in.size = count; + args->opcode = FUSE_WRITE; + args->nodeid = ff->nodeid; + args->in_numargs = 2; + if (ff->fm->fc->minor < 9) + args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else - req->in.args[0].size = sizeof(struct fuse_write_in); - req->in.args[0].value = inarg; - req->in.args[1].size = count; - req->out.numargs = 1; - req->out.args[0].size = sizeof(struct fuse_write_out); - req->out.args[0].value = outarg; + args->in_args[0].size = sizeof(ia->write.in); + args->in_args[0].value = &ia->write.in; + args->in_args[1].size = count; + args->out_numargs = 1; + args->out_args[0].size = sizeof(ia->write.out); + args->out_args[0].value = &ia->write.out; } -static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static unsigned int fuse_write_flags(struct kiocb *iocb) { - struct kiocb *iocb = io->iocb; + unsigned int flags = iocb->ki_filp->f_flags; + + if (iocb_is_dsync(iocb)) + flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + flags |= O_SYNC; + + return flags; +} + +static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, + size_t count, fl_owner_t owner) +{ + struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; - struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_mount *fm = ff->fm; + struct fuse_write_in *inarg = &ia->write.in; + ssize_t err; - fuse_write_fill(req, ff, pos, count); - inarg->flags = file->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) - inarg->flags |= O_DSYNC; - if (iocb->ki_flags & IOCB_SYNC) - inarg->flags |= O_SYNC; + 
fuse_write_args_fill(ia, ff, pos, count); + inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fm, ia, count); - fuse_request_send(fc, req); - return req->misc.write.out.size; + err = fuse_simple_request(fm, &ia->ap.args); + if (!err && ia->write.out.size > count) + err = -EIO; + + return err ?: ia->write.out.size; } -bool fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); bool ret = false; - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) { + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + if (written > 0 && pos > inode->i_size) { i_size_write(inode, pos); ret = true; } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); + + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); return ret; } -static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, - struct inode *inode, loff_t pos, - size_t count) +static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, + struct kiocb *iocb, struct inode *inode, + loff_t pos, size_t count) { - size_t res; - unsigned offset; - unsigned i; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); - - for (i = 0; i < req->num_pages; i++) - fuse_wait_on_page_writeback(inode, req->pages[i]->index); + struct fuse_args_pages *ap = &ia->ap; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + unsigned int offset, i; + bool short_write; + int err; - res = fuse_send_write(req, &io, pos, count, NULL); + for (i = 0; i < ap->num_folios; i++) + folio_wait_writeback(ap->folios[i]); - offset = req->page_descs[0].offset; - count = res; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + fuse_write_args_fill(ia, ff, pos, count); + ia->write.in.flags = fuse_write_flags(iocb); + if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; - if (!req->out.h.error && !offset && count >= PAGE_SIZE) - SetPageUptodate(page); + err = fuse_simple_request(fm, &ap->args); + if (!err && ia->write.out.size > count) + err = -EIO; - if (count > PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; - else - count = 0; - offset = 0; + short_write = ia->write.out.size < count; + offset = ap->descs[0].offset; + count = ia->write.out.size; + for (i = 0; i < ap->num_folios; i++) { + struct folio *folio = ap->folios[i]; - unlock_page(page); - put_page(page); + if (err) { + folio_clear_uptodate(folio); + } else { + if (count >= folio_size(folio) - offset) + count -= folio_size(folio) - offset; + else { + if (short_write) + folio_clear_uptodate(folio); + count = 0; + } + offset = 0; + } + if (ia->write.folio_locked && (i == ap->num_folios - 1)) + folio_unlock(folio); + folio_put(folio); } - return res; + return err; } -static ssize_t fuse_fill_write_pages(struct fuse_req *req, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos, + unsigned int max_folios) { + struct 
fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; - int err; + unsigned int num; + int err = 0; - req->in.argpages = 1; - req->page_descs[0].offset = offset; + num = min(iov_iter_count(ii), fc->max_write); - do { + ap->args.in_pages = true; + + while (num && ap->num_folios < max_folios) { size_t tmp; - struct page *page; + struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; - size_t bytes = min_t(size_t, PAGE_SIZE - offset, - iov_iter_count(ii)); - - bytes = min_t(size_t, bytes, fc->max_write - count); + unsigned int bytes; + unsigned int folio_offset; again: - err = -EFAULT; - if (iov_iter_fault_in_readable(ii, bytes)) - break; - - err = -ENOMEM; - page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); break; + } if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + flush_dcache_folio(folio); + + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + bytes = min(folio_size(folio) - folio_offset, num); - tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); - flush_dcache_page(page); + tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii); + flush_dcache_folio(folio); - iov_iter_advance(ii, tmp); if (!tmp) { - unlock_page(page); - put_page(page); - bytes = min(bytes, iov_iter_single_seg_count(ii)); + folio_unlock(folio); + folio_put(folio); + + /* + * Ensure forward progress by faulting in + * while not holding the folio lock: + */ + if (fault_in_iov_iter_readable(ii, bytes)) { + err = -EFAULT; + break; + } + goto again; } - err = 0; - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = tmp; - req->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = folio_offset; + ap->descs[ap->num_folios].length = tmp; + ap->num_folios++; count += tmp; pos += tmp; + num -= tmp; offset += tmp; - if (offset == PAGE_SIZE) + if (offset == folio_size(folio)) offset = 0; - if (!fc->big_writes) + /* If we copied full folio, mark it uptodate */ + if (tmp == folio_size(folio)) + folio_mark_uptodate(folio); + + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + } else { + ia->write.folio_locked = true; break; - } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < req->max_pages && offset == 0); + } + if (!fc->big_writes || offset != 0) + break; + } return count > 0 ? 
count : err; } @@ -1115,44 +1329,41 @@ static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, max_pages); } -static ssize_t fuse_perform_write(struct kiocb *iocb, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + loff_t pos = iocb->ki_pos; int err = 0; ssize_t res = 0; - if (is_bad_inode(inode)) - return -EIO; - if (inode->i_size < pos + iov_iter_count(ii)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); do { - struct fuse_req *req; ssize_t count; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - req = fuse_get_req(fc, nr_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); + ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->folios) { + err = -ENOMEM; break; } - count = fuse_fill_write_pages(req, mapping, ii, pos); + count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { - size_t num_written; - - num_written = fuse_send_write_pages(req, iocb, inode, - pos, count); - err = req->out.h.error; + err = fuse_send_write_pages(&ia, iocb, inode, + pos, count); if (!err) { + size_t num_written = ia.write.out.size; + res += num_written; pos += num_written; @@ -1161,87 +1372,156 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, err = -EIO; } } - fuse_put_request(fc, req); + kfree(ap->folios); } while (!err && iov_iter_count(ii)); - if (res > 0) - fuse_write_update_size(inode, pos); - + fuse_write_update_attr(inode, pos, res); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - fuse_invalidate_attr(inode); - return res > 0 ? res : err; + if (!res) + return err; + iocb->ki_pos += res; + return res; } -static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + +/* + * @return true if an exclusive lock for direct IO writes is needed + */ +static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* Server side has to advise that it supports parallel dio writes. */ + if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) + return true; + + /* + * Append will need to know the eventual EOF - always needs an + * exclusive lock. + */ + if (iocb->ki_flags & IOCB_APPEND) + return true; + + /* shared locks are not allowed with parallel page cache IO */ + if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) + return true; + + /* Parallel dio beyond EOF is not supported, at least for now. 
*/ + if (fuse_io_past_eof(iocb, from)) + return true; + + return false; +} + +static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, + bool *exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); + if (*exclusive) { + inode_lock(inode); + } else { + inode_lock_shared(inode); + /* + * New parallal dio allowed only if inode is not in caching + * mode and denies new opens in caching mode. This check + * should be performed only after taking shared inode lock. + * Previous past eof check was without inode lock and might + * have raced, so check it again. + */ + if (fuse_io_past_eof(iocb, from) || + fuse_inode_uncached_io_start(fi, NULL) != 0) { + inode_unlock_shared(inode); + inode_lock(inode); + *exclusive = true; + } + } +} + +static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + if (exclusive) { + inode_unlock(inode); + } else { + /* Allow opens in caching mode after last parallel dio end */ + fuse_inode_uncached_io_end(fi); + inode_unlock_shared(inode); + } +} + +static const struct iomap_write_ops fuse_iomap_write_ops = { + .read_folio_range = fuse_iomap_read_folio_range, +}; + +static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct address_space *mapping = file->f_mapping; ssize_t written = 0; - ssize_t written_buffered = 0; struct inode *inode = mapping->host; - ssize_t err; - loff_t endbyte = 0; + ssize_t err, count; + struct fuse_conn *fc = get_fuse_conn(inode); + bool writeback = false; - if (get_fuse_conn(inode)->writeback_cache) { + if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, file); + err = fuse_update_attributes(mapping->host, file, + STATX_SIZE | STATX_MODE); if (err) return err; - return generic_file_write_iter(iocb, from); + if (!fc->handle_killpriv_v2 || + !setattr_should_drop_suidgid(idmap, file_inode(file))) + writeback = true; } inode_lock(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - - err = generic_write_checks(iocb, from); + err = count = generic_write_checks(iocb, from); if (err <= 0) goto out; - err = file_remove_privs(file); - if (err) - goto out; + task_io_account_write(count); - err = file_update_time(file); + err = kiocb_modified(iocb); if (err) goto out; if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos = iocb->ki_pos; written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) goto out; - - pos += written; - - written_buffered = fuse_perform_write(iocb, mapping, from, pos); - if (written_buffered < 0) { - err = written_buffered; - goto out; - } - endbyte = pos + written_buffered - 1; - - err = filemap_write_and_wait_range(file->f_mapping, pos, - endbyte); - if (err) - goto out; - - invalidate_mapping_pages(file->f_mapping, - pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); - - written += written_buffered; - iocb->ki_pos = pos + written_buffered; + written = direct_write_fallback(iocb, from, written, + fuse_perform_write(iocb, from)); + } else if (writeback) { + /* + * Use iomap so that we can do granular uptodate reads + * and granular dirty tracking for large folios. 
+ */ + written = iomap_file_buffered_write(iocb, from, + &fuse_iomap_ops, + &fuse_iomap_write_ops, + file); } else { - written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); - if (written >= 0) - iocb->ki_pos += written; + written = fuse_perform_write(iocb, from); } out: - current->backing_dev_info = NULL; inode_unlock(inode); if (written > 0) written = generic_write_sync(iocb, written); @@ -1249,19 +1529,9 @@ out: return written ? written : err; } -static inline void fuse_page_descs_length_init(struct fuse_req *req, - unsigned index, unsigned nr_pages) -{ - int i; - - for (i = index; i < index + nr_pages; i++) - req->page_descs[i].length = PAGE_SIZE - - req->page_descs[i].offset; -} - static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { - return (unsigned long)ii->iov->iov_base + ii->iov_offset; + return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset; } static inline size_t fuse_get_frag_size(const struct iov_iter *ii, @@ -1270,56 +1540,99 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, return min(iov_iter_single_seg_count(ii), max_size); } -static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, - size_t *nbytesp, int write) +static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, + size_t *nbytesp, int write, + unsigned int max_pages, + bool use_pages_for_kvec_io) { + bool flush_or_invalidate = false; + unsigned int nr_pages = 0; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly into the buffer */ + /* Special case for kernel I/O: can copy directly into the buffer. + * However if the implementation of fuse_conn requires pages instead of + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. + */ if (iov_iter_is_kvec(ii)) { - unsigned long user_addr = fuse_get_user_addr(ii); - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + void *user_addr = (void *)fuse_get_user_addr(ii); - if (write) - req->in.args[1].value = (void *) user_addr; - else - req->out.args[0].value = (void *) user_addr; + if (!use_pages_for_kvec_io) { + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; + if (write) + ap->args.in_args[1].value = user_addr; + else + ap->args.out_args[0].value = user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + if (is_vmalloc_addr(user_addr)) { + ap->args.vmap_base = user_addr; + flush_or_invalidate = true; + } } - while (nbytes < *nbytesp && req->num_pages < req->max_pages) { - unsigned npages; + /* + * Until there is support for iov_iter_extract_folios(), we have to + * manually extract pages using iov_iter_extract_pages() and then + * copy that to a folios array. 
+ */ + struct page **pages = kzalloc(max_pages * sizeof(struct page *), + GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + while (nbytes < *nbytesp && nr_pages < max_pages) { + unsigned nfolios, i; size_t start; - ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], - *nbytesp - nbytes, - req->max_pages - req->num_pages, - &start); + + ret = iov_iter_extract_pages(ii, &pages, + *nbytesp - nbytes, + max_pages - nr_pages, + 0, &start); if (ret < 0) break; - iov_iter_advance(ii, ret); nbytes += ret; - ret += start; - npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); + + for (i = 0; i < nfolios; i++) { + struct folio *folio = page_folio(pages[i]); + unsigned int offset = start + + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); + unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); - req->page_descs[req->num_pages].offset = start; - fuse_page_descs_length_init(req, req->num_pages, npages); + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = len; + ap->folios[ap->num_folios] = folio; + start = 0; + ret -= len; + ap->num_folios++; + } - req->num_pages += npages; - req->page_descs[req->num_pages - 1].length -= - (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + nr_pages += nfolios; } + kfree(pages); + + if (write && flush_or_invalidate) + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); + ap->args.invalidate_vmap = !write && flush_or_invalidate; + ap->args.is_pinned = iov_iter_extract_will_pin(ii); + ap->args.user_pages = true; if (write) - req->in.argpages = 1; + ap->args.in_pages = true; else - req->out.argpages = 1; + ap->args.out_pages = true; +out: *nbytesp = nbytes; return ret < 0 ? ret : 0; @@ -1331,27 +1644,34 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? 
fc->max_write : fc->max_read; loff_t pos = *ppos; size_t count = iov_iter_count(iter); pgoff_t idx_from = pos >> PAGE_SHIFT; pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; ssize_t res = 0; - struct fuse_req *req; int err = 0; + struct fuse_io_args *ia; + unsigned int max_pages; + bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; - if (io->async) - req = fuse_get_req_for_background(fc, iov_iter_npages(iter, - fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); - if (IS_ERR(req)) - return PTR_ERR(req); + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) + return -ENOMEM; - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (fopen_direct_io) { + res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); + if (res) { + fuse_io_free(ia); + return res; + } + } + if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1359,52 +1679,74 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, inode_unlock(inode); } - io->should_dirty = !write && iter_is_iovec(iter); + if (fopen_direct_io && write) { + res = invalidate_inode_pages2_range(mapping, idx_from, idx_to); + if (res) { + fuse_io_free(ia); + return res; + } + } + + io->should_dirty = !write && user_backed_iter(iter); while (count) { - size_t nres; + ssize_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - err = fuse_get_user_pages(req, iter, &nbytes, write); + + err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, + max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; - if (write) - nres = fuse_send_write(req, io, pos, nbytes, owner); - else - nres = fuse_send_read(req, io, pos, nbytes, owner); + if (write) { + if (!capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; - if (!io->async) - fuse_release_user_pages(req, io->should_dirty); - if (req->out.h.error) { - err = req->out.h.error; - break; - } else if (nres > nbytes) { - res = 0; - err = -EIO; + nres = fuse_send_write(ia, pos, nbytes, owner); + } else { + nres = fuse_send_read(ia, pos, nbytes, owner); + } + + if (!io->async || nres < 0) { + fuse_release_user_pages(&ia->ap, nres, io->should_dirty); + fuse_io_free(ia); + } + ia = NULL; + if (nres < 0) { + iov_iter_revert(iter, nbytes); + err = nres; break; } + WARN_ON(nres > nbytes); + count -= nres; res += nres; pos += nres; - if (nres != nbytes) + if (nres != nbytes) { + iov_iter_revert(iter, nbytes - nres); break; + } if (count) { - fuse_put_request(fc, req); - if (io->async) - req = fuse_get_req_for_background(fc, - iov_iter_npages(iter, fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, - fc->max_pages)); - if (IS_ERR(req)) + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) break; } } - if (!IS_ERR(req)) - fuse_put_request(fc, req); + if (ia) + fuse_io_free(ia); if (res > 0) *ppos = pos; + if (res > 0 && write && fopen_direct_io) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, idx_from, idx_to); + } + return res > 0 ? 
res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); @@ -1416,9 +1758,6 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, ssize_t res; struct inode *inode = file_inode(io->iocb->ki_filp); - if (is_bad_inode(inode)) - return -EIO; - res = fuse_direct_io(io, iter, ppos, 0); fuse_invalidate_atime(inode); @@ -1426,75 +1765,164 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, return res; } +static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); + static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); - return __fuse_direct_read(&io, to, &iocb->ki_pos); + ssize_t res; + + if (!is_sync_kiocb(iocb)) { + res = fuse_direct_IO(iocb, to); + } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + + res = __fuse_direct_read(&io, to, &iocb->ki_pos); + } + + return res; } static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; + bool exclusive; - if (is_bad_inode(inode)) - return -EIO; - - /* Don't allow parallel writes to the same file */ - inode_lock(inode); + fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); - if (res > 0) - res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); - fuse_invalidate_attr(inode); - if (res > 0) - fuse_write_update_size(inode, iocb->ki_pos); - inode_unlock(inode); + if (res > 0) { + task_io_account_write(res); + if (!is_sync_kiocb(iocb)) { + res = fuse_direct_IO(iocb, from); + } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + + res = fuse_direct_io(&io, from, &iocb->ki_pos, + FUSE_DIO_WRITE); + fuse_write_update_attr(inode, iocb->ki_pos, res); + } + } + fuse_dio_unlock(iocb, exclusive); return res; } -static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - int i; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - for (i = 0; i < req->num_pages; i++) - __free_page(req->pages[i]); + if (fuse_is_bad(inode)) + return -EIO; + + if (FUSE_IS_DAX(inode)) + return fuse_dax_read_iter(iocb, to); - if (req->ff) - fuse_file_put(req->ff, false, false); + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) + return fuse_direct_read_iter(iocb, to); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_read_iter(iocb, to); + else + return fuse_cache_read_iter(iocb, to); +} + +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + + if (fuse_is_bad(inode)) + return -EIO; + + if (FUSE_IS_DAX(inode)) + return fuse_dax_write_iter(iocb, from); + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) + return fuse_direct_write_iter(iocb, from); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_write_iter(iocb, from); + else + return fuse_cache_write_iter(iocb, from); } -static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { - struct inode *inode = req->inode; + struct fuse_file *ff = in->private_data; + + /* 
FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); + else + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); + else + return iter_file_splice_write(pipe, out, ppos, len, flags); +} + +static void fuse_writepage_free(struct fuse_writepage_args *wpa) +{ + struct fuse_args_pages *ap = &wpa->ia.ap; + + if (wpa->bucket) + fuse_sync_bucket_dec(wpa->bucket); + + fuse_file_put(wpa->ia.ff, false); + + kfree(ap->folios); + kfree(wpa); +} + +static void fuse_writepage_finish(struct fuse_writepage_args *wpa) +{ + struct fuse_args_pages *ap = &wpa->ia.ap; + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&req->writepages_entry); - for (i = 0; i < req->num_pages; i++) { - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - } + for (i = 0; i < ap->num_folios; i++) + /* + * Benchmarks showed that ending writeback within the + * scope of the fi->lock alleviates xarray lock + * contention and noticeably improves performance. + */ + iomap_finish_folio_write(inode, ap->folios[i], + ap->descs[i].length); + wake_up(&fi->page_waitq); } -/* Called under fc->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, - loff_t size) -__releases(fc->lock) -__acquires(fc->lock) +/* Called under fi->lock, may release and reacquire it */ +static void fuse_send_writepage(struct fuse_mount *fm, + struct fuse_writepage_args *wpa, loff_t size) +__releases(fi->lock) +__acquires(fi->lock) { - struct fuse_inode *fi = get_fuse_inode(req->inode); - struct fuse_write_in *inarg = &req->misc.write.in; - __u64 data_size = req->num_pages * PAGE_SIZE; - bool queued; + struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_args_pages *ap = &wpa->ia.ap; + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_args *args = &ap->args; + __u64 data_size = 0; + int err, i; - if (!fc->connected) - goto out_free; + for (i = 0; i < ap->num_folios; i++) + data_size += ap->descs[i].length; + fi->writectr++; if (inarg->offset + data_size <= size) { inarg->size = data_size; } else if (inarg->offset < size) { @@ -1504,547 +1932,399 @@ __acquires(fc->lock) goto out_free; } - req->in.args[1].size = inarg->size; - fi->writectr++; - queued = fuse_request_queue_background(fc, req); - WARN_ON(!queued); + args->in_args[1].size = inarg->size; + args->force = true; + args->nocreds = true; + + err = fuse_simple_background(fm, args, GFP_ATOMIC); + if (err == -ENOMEM) { + spin_unlock(&fi->lock); + err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL); + spin_lock(&fi->lock); + } + + /* Fails on broken connection only */ + if (unlikely(err)) + goto out_free; + return; out_free: - fuse_writepage_finish(fc, req); - spin_unlock(&fc->lock); - fuse_writepage_free(fc, req); - fuse_put_request(fc, req); - spin_lock(&fc->lock); + fi->writectr--; + fuse_writepage_finish(wpa); + spin_unlock(&fi->lock); + 
fuse_writepage_free(wpa); + spin_lock(&fi->lock); } /* * If fi->writectr is positive (no truncate or fsync going on) send * all queued writepage requests. * - * Called with fc->lock + * Called with fi->lock */ void fuse_flush_writepages(struct inode *inode) -__releases(fc->lock) -__acquires(fc->lock) +__releases(fi->lock) +__acquires(fi->lock) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_inode *fi = get_fuse_inode(inode); - size_t crop = i_size_read(inode); - struct fuse_req *req; + loff_t crop = i_size_read(inode); + struct fuse_writepage_args *wpa; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { - req = list_entry(fi->queued_writes.next, struct fuse_req, list); - list_del_init(&req->list); - fuse_send_writepage(fc, req, crop); + wpa = list_entry(fi->queued_writes.next, + struct fuse_writepage_args, queue_entry); + list_del_init(&wpa->queue_entry); + fuse_send_writepage(fm, wpa, crop); } } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, + int error) { - struct inode *inode = req->inode; + struct fuse_writepage_args *wpa = + container_of(args, typeof(*wpa), ia.ap.args); + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); - mapping_set_error(inode->i_mapping, req->out.h.error); - spin_lock(&fc->lock); - while (req->misc.write.next) { - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_req *next = req->misc.write.next; - req->misc.write.next = next->misc.write.next; - next->misc.write.next = NULL; - next->ff = fuse_file_get(req->ff); - list_add(&next->writepages_entry, &fi->writepages); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fc, next, inarg->offset + inarg->size); - } + mapping_set_error(inode->i_mapping, error); + /* + * A writeback finished and this might have updated mtime/ctime on + * server making local mtime/ctime stale. Hence invalidate attrs. + * Do this only if writeback_cache is not enabled. If writeback_cache + * is enabled, we trust local ctime/mtime. 
+ */ + if (!fc->writeback_cache) + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); + spin_lock(&fi->lock); fi->writectr--; - fuse_writepage_finish(fc, req); - spin_unlock(&fc->lock); - fuse_writepage_free(fc, req); + fuse_writepage_finish(wpa); + spin_unlock(&fi->lock); + fuse_writepage_free(wpa); } -static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = NULL; + struct fuse_file *ff; - spin_lock(&fc->lock); - if (!list_empty(&fi->write_files)) { - ff = list_entry(fi->write_files.next, struct fuse_file, - write_entry); + spin_lock(&fi->lock); + ff = list_first_entry_or_null(&fi->write_files, struct fuse_file, + write_entry); + if (ff) fuse_file_get(ff); - } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return ff; } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = __fuse_write_file_get(fc, fi); + struct fuse_file *ff = __fuse_write_file_get(fi); WARN_ON(!ff); return ff; } int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff; int err; - ff = __fuse_write_file_get(fc, fi); + ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, false, false); + fuse_file_put(ff, false); return err; } -static int fuse_writepage_locked(struct page *page) +static struct fuse_writepage_args *fuse_writepage_args_alloc(void) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; - struct page *tmp_page; - int error = -ENOMEM; - - set_page_writeback(page); - - req = fuse_request_alloc_nofs(1); - if (!req) - goto err; - - /* writeback always goes to bg_queue */ - __set_bit(FR_BACKGROUND, &req->flags); - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) - goto err_free; - - error = -EIO; - req->ff = fuse_write_file_get(fc, fi); - if (!req->ff) - goto err_nofile; - - fuse_write_fill(req, req->ff, page_offset(page), 0); - - copy_highpage(tmp_page, page); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - req->num_pages = 1; - req->pages[0] = tmp_page; - req->page_descs[0].offset = 0; - req->page_descs[0].length = PAGE_SIZE; - req->end = fuse_writepage_end; - req->inode = inode; - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; - spin_lock(&fc->lock); - list_add(&req->writepages_entry, &fi->writepages); - list_add_tail(&req->list, &fi->queued_writes); - fuse_flush_writepages(inode); - spin_unlock(&fc->lock); + wpa = kzalloc(sizeof(*wpa), GFP_NOFS); + if (wpa) { + ap = &wpa->ia.ap; + ap->num_folios = 0; + ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->folios) { + kfree(wpa); + wpa = NULL; + } + } + return wpa; - end_page_writeback(page); +} - return 0; +static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) +{ + if (!fc->sync_fs) + return; -err_nofile: - __free_page(tmp_page); -err_free: - fuse_request_free(req); -err: - mapping_set_error(page->mapping, 
error); - end_page_writeback(page); - return error; + rcu_read_lock(); + /* Prevent resurrection of dead bucket in unlikely race with syncfs */ + do { + wpa->bucket = rcu_dereference(fc->curr_bucket); + } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); + rcu_read_unlock(); } -static int fuse_writepage(struct page *page, struct writeback_control *wbc) +static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, + uint32_t folio_index, loff_t offset, unsigned len) { - int err; + struct fuse_args_pages *ap = &wpa->ia.ap; - if (fuse_page_is_writeback(page->mapping->host, page->index)) { - /* - * ->writepages() should be called for sync() and friends. We - * should only get here on direct reclaim and then we are - * allowed to skip a page which is already in flight - */ - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); + ap->folios[folio_index] = folio; + ap->descs[folio_index].offset = offset; + ap->descs[folio_index].length = len; +} - redirty_page_for_writepage(wbc, page); - return 0; - } +static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, + size_t offset, + struct fuse_file *ff) +{ + struct inode *inode = folio->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; - err = fuse_writepage_locked(page); - unlock_page(page); + wpa = fuse_writepage_args_alloc(); + if (!wpa) + return NULL; - return err; + fuse_writepage_add_to_bucket(fc, wpa); + fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->inode = inode; + wpa->ia.ff = ff; + + ap = &wpa->ia.ap; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + + return wpa; } struct fuse_fill_wb_data { - struct fuse_req *req; + struct fuse_writepage_args *wpa; struct fuse_file *ff; - struct inode *inode; - struct page **orig_pages; + unsigned int max_folios; + /* + * nr_bytes won't overflow since fuse_folios_need_send() caps + * wb requests to never exceed fc->max_pages (which has an upper bound + * of U16_MAX). 
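[Editorial aside.] For reference, the "won't overflow" claim is easy to check: with fc->max_pages capped at U16_MAX, the largest write-back request spans 65535 pages, i.e. 65535 * 4096 = 268,431,360 bytes with 4 KiB pages, and even with 64 KiB pages 65535 * 65536 = 4,294,901,760 bytes; both fit in the unsigned int nr_bytes (UINT_MAX is 4,294,967,295).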
+ */ + unsigned int nr_bytes; }; -static void fuse_writepages_send(struct fuse_fill_wb_data *data) +static bool fuse_pages_realloc(struct fuse_fill_wb_data *data, + unsigned int max_pages) { - struct fuse_req *req = data->req; - struct inode *inode = data->inode; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = req->num_pages; - int i; + struct fuse_args_pages *ap = &data->wpa->ia.ap; + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int nfolios = min_t(unsigned int, + max_t(unsigned int, data->max_folios * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + max_pages); + WARN_ON(nfolios <= data->max_folios); - req->ff = fuse_file_get(data->ff); - spin_lock(&fc->lock); - list_add_tail(&req->list, &fi->queued_writes); - fuse_flush_writepages(inode); - spin_unlock(&fc->lock); + folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); + if (!folios) + return false; + + memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); + memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios); + kfree(ap->folios); + ap->folios = folios; + ap->descs = descs; + data->max_folios = nfolios; - for (i = 0; i < num_pages; i++) - end_page_writeback(data->orig_pages[i]); + return true; } -static bool fuse_writepage_in_flight(struct fuse_req *new_req, - struct page *page) +static void fuse_writepages_send(struct inode *inode, + struct fuse_fill_wb_data *data) { - struct fuse_conn *fc = get_fuse_conn(new_req->inode); - struct fuse_inode *fi = get_fuse_inode(new_req->inode); - struct fuse_req *tmp; - struct fuse_req *old_req; - bool found = false; - pgoff_t curr_index; + struct fuse_writepage_args *wpa = data->wpa; + struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(new_req->num_pages != 0); + spin_lock(&fi->lock); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fi->lock); +} - spin_lock(&fc->lock); - list_del(&new_req->writepages_entry); - list_for_each_entry(old_req, &fi->writepages, writepages_entry) { - BUG_ON(old_req->inode != new_req->inode); - curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT; - if (curr_index <= page->index && - page->index < curr_index + old_req->num_pages) { - found = true; - break; - } - } - if (!found) { - list_add(&new_req->writepages_entry, &fi->writepages); - goto out_unlock; - } +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write) +{ + struct folio *prev_folio; + struct fuse_folio_desc prev_desc; + unsigned bytes = cur_bytes + len; + loff_t prev_pos; + size_t max_bytes = write ? 
fc->max_write : fc->max_read; - new_req->num_pages = 1; - for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { - BUG_ON(tmp->inode != new_req->inode); - curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; - if (tmp->num_pages == 1 && - curr_index == page->index) { - old_req = tmp; - } - } + WARN_ON(!ap->num_folios); - if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) { - struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); + /* Reached max pages */ + if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages) + return true; - copy_highpage(old_req->pages[0], page); - spin_unlock(&fc->lock); + if (bytes > max_bytes) + return true; - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - fuse_writepage_free(fc, new_req); - fuse_request_free(new_req); - goto out; - } else { - new_req->misc.write.next = old_req->misc.write.next; - old_req->misc.write.next = new_req; - } -out_unlock: - spin_unlock(&fc->lock); -out: - return found; + /* Discontinuity */ + prev_folio = ap->folios[ap->num_folios - 1]; + prev_desc = ap->descs[ap->num_folios - 1]; + prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length; + if (prev_pos != pos) + return true; + + return false; } -static int fuse_writepages_fill(struct page *page, - struct writeback_control *wbc, void *_data) +static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, + struct folio *folio, u64 pos, + unsigned len, u64 end_pos) { - struct fuse_fill_wb_data *data = _data; - struct fuse_req *req = data->req; - struct inode *inode = data->inode; + struct fuse_fill_wb_data *data = wpc->wb_ctx; + struct fuse_writepage_args *wpa = data->wpa; + struct fuse_args_pages *ap = &wpa->ia.ap; + struct inode *inode = wpc->inode; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct page *tmp_page; - bool is_writeback; - int err; + loff_t offset = offset_in_folio(folio, pos); + + WARN_ON_ONCE(!data); if (!data->ff) { - err = -EIO; - data->ff = fuse_write_file_get(fc, get_fuse_inode(inode)); + data->ff = fuse_write_file_get(fi); if (!data->ff) - goto out_unlock; - } - - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - is_writeback = fuse_page_is_writeback(inode, page->index); - - if (req && req->num_pages && - (is_writeback || req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_write || - data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { - fuse_writepages_send(data); - data->req = NULL; - } else if (req && req->num_pages == req->max_pages) { - if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { - fuse_writepages_send(data); - req = data->req = NULL; - } + return -EIO; } - err = -ENOMEM; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) - goto out_unlock; + if (wpa) { + bool send = fuse_folios_need_send(fc, pos, len, ap, + data->nr_bytes, true); - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). 
We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment req->num_pages. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. - */ - if (data->req == NULL) { - struct fuse_inode *fi = get_fuse_inode(inode); - - err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); - if (!req) { - __free_page(tmp_page); - goto out_unlock; + if (!send) { + /* + * Need to grow the pages array? If so, did the + * expansion fail? + */ + send = (ap->num_folios == data->max_folios) && + !fuse_pages_realloc(data, fc->max_pages); } - fuse_write_fill(req, data->ff, page_offset(page), 0); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - __set_bit(FR_BACKGROUND, &req->flags); - req->num_pages = 0; - req->end = fuse_writepage_end; - req->inode = inode; - - spin_lock(&fc->lock); - list_add(&req->writepages_entry, &fi->writepages); - spin_unlock(&fc->lock); - - data->req = req; + if (send) { + fuse_writepages_send(inode, data); + data->wpa = NULL; + data->nr_bytes = 0; + } } - set_page_writeback(page); - - copy_highpage(tmp_page, page); - req->pages[req->num_pages] = tmp_page; - req->page_descs[req->num_pages].offset = 0; - req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); - - err = 0; - if (is_writeback && fuse_writepage_in_flight(req, page)) { - end_page_writeback(page); - data->req = NULL; - goto out_unlock; + if (data->wpa == NULL) { + wpa = fuse_writepage_args_setup(folio, offset, data->ff); + if (!wpa) + return -ENOMEM; + fuse_file_get(wpa->ia.ff); + data->max_folios = 1; + ap = &wpa->ia.ap; } - data->orig_pages[req->num_pages] = page; - /* - * Protected by fc->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fc->lock); - req->num_pages++; - spin_unlock(&fc->lock); + fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, + offset, len); + data->nr_bytes += len; -out_unlock: - unlock_page(page); + ap->num_folios++; + if (!data->wpa) + data->wpa = wpa; - return err; + return len; } -static int fuse_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc, + int error) { - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_wb_data data; - int err; - - err = -EIO; - if (is_bad_inode(inode)) - goto out; + struct fuse_fill_wb_data *data = wpc->wb_ctx; - data.inode = inode; - data.req = NULL; - data.ff = NULL; + WARN_ON_ONCE(!data); - err = -ENOMEM; - data.orig_pages = kcalloc(fc->max_pages, - sizeof(struct page *), - GFP_NOFS); - if (!data.orig_pages) - goto out; - - err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); - if (data.req) { - /* Ignore errors if we can write at least one page */ - BUG_ON(!data.req->num_pages); - fuse_writepages_send(&data); - err = 0; + if (data->wpa) { + WARN_ON(!data->wpa->ia.ap.num_folios); + fuse_writepages_send(wpc->inode, data); } - if (data.ff) - fuse_file_put(data.ff, false, false); - kfree(data.orig_pages); -out: - return err; -} - -/* - * It's worthy to make sure that space is reserved on disk for the write, - * but how to implement it without killing performance need more thinking. 
- */ -static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct page *page; - loff_t fsize; - int err = -ENOMEM; - - WARN_ON(!fc->writeback_cache); + if (data->ff) + fuse_file_put(data->ff, false); - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - goto error; - - fuse_wait_on_page_writeback(mapping->host, page->index); - - if (PageUptodate(page) || len == PAGE_SIZE) - goto success; - /* - * Check if the start this page comes after the end of file, in which - * case the readpage can be optimized away. - */ - fsize = i_size_read(mapping->host); - if (fsize <= (pos & PAGE_MASK)) { - size_t off = pos & ~PAGE_MASK; - if (off) - zero_user_segment(page, 0, off); - goto success; - } - err = fuse_do_readpage(file, page); - if (err) - goto cleanup; -success: - *pagep = page; - return 0; - -cleanup: - unlock_page(page); - put_page(page); -error: - return err; + return error; } -static int fuse_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - - /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ - if (!copied) - goto unlock; +static const struct iomap_writeback_ops fuse_writeback_ops = { + .writeback_range = fuse_iomap_writeback_range, + .writeback_submit = fuse_iomap_writeback_submit, +}; - if (!PageUptodate(page)) { - /* Zero any unwritten bytes at the end of the page */ - size_t endoff = (pos + copied) & ~PAGE_MASK; - if (endoff) - zero_user_segment(page, endoff, PAGE_SIZE); - SetPageUptodate(page); - } +static int fuse_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_fill_wb_data data = {}; + struct iomap_writepage_ctx wpc = { + .inode = inode, + .iomap.type = IOMAP_MAPPED, + .wbc = wbc, + .ops = &fuse_writeback_ops, + .wb_ctx = &data, + }; - fuse_write_update_size(inode, pos + copied); - set_page_dirty(page); + if (fuse_is_bad(inode)) + return -EIO; -unlock: - unlock_page(page); - put_page(page); + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return 0; - return copied; + return iomap_writepages(&wpc); } -static int fuse_launder_page(struct page *page) +static int fuse_launder_folio(struct folio *folio) { int err = 0; - if (clear_page_dirty_for_io(page)) { - struct inode *inode = page->mapping->host; - err = fuse_writepage_locked(page); + struct fuse_fill_wb_data data = {}; + struct iomap_writepage_ctx wpc = { + .inode = folio->mapping->host, + .iomap.type = IOMAP_MAPPED, + .ops = &fuse_writeback_ops, + .wb_ctx = &data, + }; + + if (folio_clear_dirty_for_io(folio)) { + err = iomap_writeback_folio(&wpc, folio); + err = fuse_iomap_writeback_submit(&wpc, err); if (!err) - fuse_wait_on_page_writeback(inode, page->index); + folio_wait_writeback(folio); } return err; } /* - * Write back dirty pages now, because there may not be any suitable - * open files later + * Write back dirty data/metadata now (there may not be any suitable + * open files later for data) */ static void fuse_vma_close(struct vm_area_struct *vma) { - filemap_write_and_wait(vma->vm_file->f_mapping); + int err; + + err = write_inode_now(vma->vm_file->f_mapping->host, 1); + 
mapping_set_error(vma->vm_file->f_mapping, err); } /* @@ -2064,17 +2344,17 @@ static void fuse_vma_close(struct vm_area_struct *vma) */ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != inode->i_mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); return VM_FAULT_NOPAGE; } - fuse_wait_on_page_writeback(inode, page->index); + folio_wait_writeback(folio); return VM_FAULT_LOCKED; } @@ -2087,6 +2367,56 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; + struct inode *inode = file_inode(file); + int rc; + + /* DAX mmap is superior to direct_io mmap */ + if (FUSE_IS_DAX(inode)) + return fuse_dax_mmap(file, vma); + + /* + * If inode is in passthrough io mode, because it has some file open + * in passthrough mode, either mmap to backing file or fail mmap, + * because mixing cached mmap and passthrough io mode is not allowed. + */ + if (fuse_file_passthrough(ff)) + return fuse_passthrough_mmap(file, vma); + else if (fuse_inode_backing(get_fuse_inode(inode))) + return -ENODEV; + + /* + * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, + * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. + */ + if (ff->open_flags & FOPEN_DIRECT_IO) { + /* + * Can't provide the coherency needed for MAP_SHARED + * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. + */ + if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + if (!(vma->vm_flags & VM_MAYSHARE)) { + /* MAP_PRIVATE */ + return generic_file_mmap(file, vma); + } + + /* + * First mmap of direct_io file enters caching inode io mode. + * Also waits for parallel dio writers to go into serial mode + * (exclusive instead of shared lock). + * After first mmap, the inode stays in caching io mode until + * the direct_io file release. + */ + rc = fuse_file_cached_io_open(inode, ff); + if (rc) + return rc; + } + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) fuse_link_write_file(file); @@ -2095,17 +2425,6 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) -{ - /* Can't provide the coherency needed for MAP_SHARED */ - if (vma->vm_flags & VM_MAYSHARE) - return -ENODEV; - - invalidate_inode_pages2(file->f_mapping); - - return generic_file_mmap(file, vma); -} - static int convert_fuse_file_lock(struct fuse_conn *fc, const struct fuse_file_lock *ffl, struct file_lock *fl) @@ -2128,14 +2447,14 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, * translate it into the caller's pid namespace. 
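[Editorial aside: illustrative sketch, not part of the patch.] The pid-namespace translation above matters because the lock owner reported by the FUSE server is a pid in the server's own namespace. A minimal sketch of the userspace side, assuming the raw wire structs from the FUSE uapi header (struct fuse_lk_out containing struct fuse_file_lock) plus <fcntl.h> and <unistd.h>; the range values are made up:

	/* Sketch: answering FUSE_GETLK with a conflicting write lock. */
	struct fuse_lk_out out = {
		.lk = {
			.start = 0,
			.end   = 4095,
			.type  = F_WRLCK,
			.pid   = getpid(),	/* pid as the *server* sees it */
		},
	};
	/* ...sent back as the reply payload; convert_fuse_file_lock() in the hunk
	 * above then remaps .pid via fc->pid_ns into the caller's namespace. */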
*/ rcu_read_lock(); - fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); + fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; default: return -EIO; } - fl->fl_type = ffl->type; + fl->c.flc_type = ffl->type; return 0; } @@ -2149,36 +2468,36 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, memset(inarg, 0, sizeof(*inarg)); inarg->fh = ff->fh; - inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner); inarg->lk.start = fl->fl_start; inarg->lk.end = fl->fl_end; - inarg->lk.type = fl->fl_type; + inarg->lk.type = fl->c.flc_type; inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; - args->in.h.opcode = opcode; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg); - args->in.args[0].value = inarg; + args->opcode = opcode; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; } static int fuse_getlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; struct fuse_lk_out outarg; int err; fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); if (!err) - err = convert_fuse_file_lock(fc, &outarg.lk, fl); + err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); return err; } @@ -2186,12 +2505,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; - int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; - struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; - pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); + int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + struct pid *pid = fl->c.flc_type != F_UNLCK ? 
task_tgid(current) : NULL; + pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { @@ -2199,12 +2518,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return -ENOLCK; } - /* Unlock on close is handled by the flush method */ - if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) - return 0; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); /* locking is restartable */ if (err == -EINTR) @@ -2258,29 +2573,29 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) static sector_t fuse_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_bmap_in inarg; struct fuse_bmap_out outarg; int err; - if (!inode->i_sb->s_bdev || fc->no_bmap) + if (!inode->i_sb->s_bdev || fm->fc->no_bmap) return 0; memset(&inarg, 0, sizeof(inarg)); inarg.block = block; inarg.blocksize = inode->i_sb->s_blocksize; - args.in.h.opcode = FUSE_BMAP; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + args.opcode = FUSE_BMAP; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) - fc->no_bmap = 1; + fm->fc->no_bmap = 1; return err ? 
0 : outarg.block; } @@ -2288,7 +2603,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_lseek_in inarg = { @@ -2299,21 +2614,21 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) struct fuse_lseek_out outarg; int err; - if (fc->no_lseek) + if (fm->fc->no_lseek) goto fallback; - args.in.h.opcode = FUSE_LSEEK; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + args.opcode = FUSE_LSEEK; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); if (err) { if (err == -ENOSYS) { - fc->no_lseek = 1; + fm->fc->no_lseek = 1; goto fallback; } return err; @@ -2322,7 +2637,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: - err = fuse_update_attributes(inode, file); + err = fuse_update_attributes(inode, file, STATX_SIZE); if (!err) return generic_file_llseek(file, offset, whence); else @@ -2342,7 +2657,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) break; case SEEK_END: inode_lock(inode); - retval = fuse_update_attributes(inode, file); + retval = fuse_update_attributes(inode, file, STATX_SIZE); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); @@ -2361,361 +2676,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) } /* - * CUSE servers compiled on 32bit broke on 64bit kernels because the - * ABI was defined to be 'struct iovec' which is different on 32bit - * and 64bit. Fortunately we can determine which structure the server - * used from the size of the reply. - */ -static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, - size_t transferred, unsigned count, - bool is_compat) -{ -#ifdef CONFIG_COMPAT - if (count * sizeof(struct compat_iovec) == transferred) { - struct compat_iovec *ciov = src; - unsigned i; - - /* - * With this interface a 32bit server cannot support - * non-compat (i.e. 
ones coming from 64bit apps) ioctl - * requests - */ - if (!is_compat) - return -EINVAL; - - for (i = 0; i < count; i++) { - dst[i].iov_base = compat_ptr(ciov[i].iov_base); - dst[i].iov_len = ciov[i].iov_len; - } - return 0; - } -#endif - - if (count * sizeof(struct iovec) != transferred) - return -EIO; - - memcpy(dst, src, transferred); - return 0; -} - -/* Make sure iov_length() won't overflow */ -static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, - size_t count) -{ - size_t n; - u32 max = fc->max_pages << PAGE_SHIFT; - - for (n = 0; n < count; n++, iov++) { - if (iov->iov_len > (size_t) max) - return -ENOMEM; - max -= iov->iov_len; - } - return 0; -} - -static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, - void *src, size_t transferred, unsigned count, - bool is_compat) -{ - unsigned i; - struct fuse_ioctl_iovec *fiov = src; - - if (fc->minor < 16) { - return fuse_copy_ioctl_iovec_old(dst, src, transferred, - count, is_compat); - } - - if (count * sizeof(struct fuse_ioctl_iovec) != transferred) - return -EIO; - - for (i = 0; i < count; i++) { - /* Did the server supply an inappropriate value? */ - if (fiov[i].base != (unsigned long) fiov[i].base || - fiov[i].len != (unsigned long) fiov[i].len) - return -EIO; - - dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; - dst[i].iov_len = (size_t) fiov[i].len; - -#ifdef CONFIG_COMPAT - if (is_compat && - (ptr_to_compat(dst[i].iov_base) != fiov[i].base || - (compat_size_t) dst[i].iov_len != fiov[i].len)) - return -EIO; -#endif - } - - return 0; -} - - -/* - * For ioctls, there is no generic way to determine how much memory - * needs to be read and/or written. Furthermore, ioctls are allowed - * to dereference the passed pointer, so the parameter requires deep - * copying but FUSE has no idea whatsoever about what to copy in or - * out. - * - * This is solved by allowing FUSE server to retry ioctl with - * necessary in/out iovecs. Let's assume the ioctl implementation - * needs to read in the following structure. - * - * struct a { - * char *buf; - * size_t buflen; - * } - * - * On the first callout to FUSE server, inarg->in_size and - * inarg->out_size will be NULL; then, the server completes the ioctl - * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and - * the actual iov array to - * - * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } - * - * which tells FUSE to copy in the requested area and retry the ioctl. - * On the second round, the server has access to the structure and - * from that it can tell what to look for next, so on the invocation, - * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to - * - * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, - * { .iov_base = a.buf, .iov_len = a.buflen } } - * - * FUSE will copy both struct a and the pointed buffer from the - * process doing the ioctl and retry ioctl with both struct a and the - * buffer. - * - * This time, FUSE server has everything it needs and completes ioctl - * without FUSE_IOCTL_RETRY which finishes the ioctl call. - * - * Copying data out works the same way. - * - * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel - * automatically initializes in and out iovs by decoding @cmd with - * _IOC_* macros and the server is not allowed to request RETRY. This - * limits ioctl data transfers to well-formed ioctls and is the forced - * behavior for all FUSE servers. 
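[Editorial aside: illustrative sketch, not part of the patch.] The comment being removed above is the protocol description of the FUSE_IOCTL_RETRY handshake; the server side of the same multi-round flow looks roughly like this, assuming the libfuse low-level API (fuse_reply_ioctl_retry()/fuse_reply_ioctl()) and a hypothetical MYIOC_SET command that passes in the struct a from the comment. As the comment notes, retries are only honoured for unrestricted ioctls (CUSE):

	#define FUSE_USE_VERSION 35
	#include <fuse_lowlevel.h>
	#include <sys/ioctl.h>
	#include <sys/uio.h>
	#include <errno.h>

	struct a {			/* the example structure from the removed comment */
		char *buf;
		size_t buflen;
	};
	#define MYIOC_SET _IOW('E', 0, struct a)	/* hypothetical command */

	static void my_ioctl(fuse_req_t req, fuse_ino_t ino, unsigned int cmd, void *arg,
			     struct fuse_file_info *fi, unsigned flags,
			     const void *in_buf, size_t in_bufsz, size_t out_bufsz)
	{
		const struct a *a = in_buf;

		if (cmd != MYIOC_SET) {
			fuse_reply_err(req, ENOTTY);
		} else if (in_bufsz < sizeof(*a)) {
			/* Round 1: ask the kernel to copy in struct a and retry. */
			struct iovec iov = { .iov_base = arg, .iov_len = sizeof(*a) };

			fuse_reply_ioctl_retry(req, &iov, 1, NULL, 0);
		} else if (in_bufsz < sizeof(*a) + a->buflen) {
			/* Round 2: buf/buflen are now known, also copy in the payload. */
			struct iovec iov[2] = {
				{ .iov_base = arg,    .iov_len = sizeof(*a) },
				{ .iov_base = a->buf, .iov_len = a->buflen },
			};

			fuse_reply_ioctl_retry(req, iov, 2, NULL, 0);
		} else {
			/* Round 3: in_buf holds struct a followed by buflen payload bytes. */
			fuse_reply_ioctl(req, 0, NULL, 0);
		}
	}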
- */ -long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, - unsigned int flags) -{ - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; - struct fuse_ioctl_in inarg = { - .fh = ff->fh, - .cmd = cmd, - .arg = arg, - .flags = flags - }; - struct fuse_ioctl_out outarg; - struct fuse_req *req = NULL; - struct page **pages = NULL; - struct iovec *iov_page = NULL; - struct iovec *in_iov = NULL, *out_iov = NULL; - unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, transferred, c; - int err, i; - struct iov_iter ii; - -#if BITS_PER_LONG == 32 - inarg.flags |= FUSE_IOCTL_32BIT; -#else - if (flags & FUSE_IOCTL_COMPAT) - inarg.flags |= FUSE_IOCTL_32BIT; -#endif - - /* assume all the iovs returned by client always fits in a page */ - BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); - - err = -ENOMEM; - pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); - iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!pages || !iov_page) - goto out; - - /* - * If restricted, initialize IO parameters as encoded in @cmd. - * RETRY from server is not allowed. - */ - if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { - struct iovec *iov = iov_page; - - iov->iov_base = (void __user *)arg; - iov->iov_len = _IOC_SIZE(cmd); - - if (_IOC_DIR(cmd) & _IOC_WRITE) { - in_iov = iov; - in_iovs = 1; - } - - if (_IOC_DIR(cmd) & _IOC_READ) { - out_iov = iov; - out_iovs = 1; - } - } - - retry: - inarg.in_size = in_size = iov_length(in_iov, in_iovs); - inarg.out_size = out_size = iov_length(out_iov, out_iovs); - - /* - * Out data can be used either for actual out data or iovs, - * make sure there always is at least one page. - */ - out_size = max_t(size_t, out_size, PAGE_SIZE); - max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); - - /* make sure there are enough buffer pages and init request with them */ - err = -ENOMEM; - if (max_pages > fc->max_pages) - goto out; - while (num_pages < max_pages) { - pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!pages[num_pages]) - goto out; - num_pages++; - } - - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); - req = NULL; - goto out; - } - memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); - req->num_pages = num_pages; - fuse_page_descs_length_init(req, 0, req->num_pages); - - /* okay, let's send it to the client */ - req->in.h.opcode = FUSE_IOCTL; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - if (in_size) { - req->in.numargs++; - req->in.args[1].size = in_size; - req->in.argpages = 1; - - err = -EFAULT; - iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); - if (c != PAGE_SIZE && iov_iter_count(&ii)) - goto out; - } - } - - req->out.numargs = 2; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - req->out.args[1].size = out_size; - req->out.argpages = 1; - req->out.argvar = 1; - - fuse_request_send(fc, req); - err = req->out.h.error; - transferred = req->out.args[1].size; - fuse_put_request(fc, req); - req = NULL; - if (err) - goto out; - - /* did it ask for retry? 
*/ - if (outarg.flags & FUSE_IOCTL_RETRY) { - void *vaddr; - - /* no retry if in restricted mode */ - err = -EIO; - if (!(flags & FUSE_IOCTL_UNRESTRICTED)) - goto out; - - in_iovs = outarg.in_iovs; - out_iovs = outarg.out_iovs; - - /* - * Make sure things are in boundary, separate checks - * are to protect against overflow. - */ - err = -ENOMEM; - if (in_iovs > FUSE_IOCTL_MAX_IOV || - out_iovs > FUSE_IOCTL_MAX_IOV || - in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) - goto out; - - vaddr = kmap_atomic(pages[0]); - err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, - transferred, in_iovs + out_iovs, - (flags & FUSE_IOCTL_COMPAT) != 0); - kunmap_atomic(vaddr); - if (err) - goto out; - - in_iov = iov_page; - out_iov = in_iov + in_iovs; - - err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); - if (err) - goto out; - - err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); - if (err) - goto out; - - goto retry; - } - - err = -EIO; - if (transferred > inarg.out_size) - goto out; - - err = -EFAULT; - iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); - if (c != PAGE_SIZE && iov_iter_count(&ii)) - goto out; - } - err = 0; - out: - if (req) - fuse_put_request(fc, req); - free_page((unsigned long) iov_page); - while (num_pages) - __free_page(pages[--num_pages]); - kfree(pages); - - return err ? err : outarg.result; -} -EXPORT_SYMBOL_GPL(fuse_do_ioctl); - -long fuse_ioctl_common(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int flags) -{ - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - - if (!fuse_allow_current_process(fc)) - return -EACCES; - - if (is_bad_inode(inode)) - return -EIO; - - return fuse_do_ioctl(file, cmd, arg, flags); -} - -static long fuse_file_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return fuse_ioctl_common(file, cmd, arg, 0); -} - -static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); -} - -/* * All files which have been polled are linked to RB tree * fuse_conn->polled_files which is indexed by kh. Walk the tree and * find the matching one. 
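[Editorial aside: illustrative sketch, not part of the patch.] For context on the polled-files tree used below: the kernel registers a file here only after asking the server for a wakeup (FUSE_POLL_SCHEDULE_NOTIFY), and that wakeup arrives as a FUSE_NOTIFY_POLL notification carrying ff->kh, which is looked up in this tree to wake ff->poll_wait. With the libfuse low-level API the server side looks roughly like this; saved_ph and data_ready are made-up single-file bookkeeping:

	#define FUSE_USE_VERSION 35
	#include <fuse_lowlevel.h>
	#include <poll.h>
	#include <stdbool.h>

	static struct fuse_pollhandle *saved_ph;	/* hypothetical: one watched file */
	static bool data_ready;				/* hypothetical readiness flag */

	static void my_poll(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
			    struct fuse_pollhandle *ph)
	{
		if (ph) {				/* FUSE_POLL_SCHEDULE_NOTIFY was set */
			if (saved_ph)
				fuse_pollhandle_destroy(saved_ph);
			saved_ph = ph;			/* keep until readiness changes */
		}
		fuse_reply_poll(req, data_ready ? POLLIN : 0);
	}

	static void my_data_arrived(void)
	{
		data_ready = true;
		if (saved_ph) {
			/* Emits FUSE_NOTIFY_POLL for this kh; the kernel then wakes
			 * the matching ff->poll_wait found via the RB tree above. */
			fuse_lowlevel_notify_poll(saved_ph);
			fuse_pollhandle_destroy(saved_ph);
			saved_ph = NULL;
		}
	}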
@@ -2756,7 +2716,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc, { spin_lock(&fc->lock); if (RB_EMPTY_NODE(&ff->polled_node)) { - struct rb_node **link, *uninitialized_var(parent); + struct rb_node **link, *parent; link = fuse_find_polled_node(fc, ff->kh, &parent); BUG_ON(*link); @@ -2769,13 +2729,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc, __poll_t fuse_file_poll(struct file *file, poll_table *wait) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; FUSE_ARGS(args); int err; - if (fc->no_poll) + if (fm->fc->no_poll) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); @@ -2787,23 +2747,23 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) */ if (waitqueue_active(&ff->poll_wait)) { inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; - fuse_register_polled_file(fc, ff); + fuse_register_polled_file(fm->fc, ff); } - args.in.h.opcode = FUSE_POLL; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + args.opcode = FUSE_POLL; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); if (!err) return demangle_poll(outarg.revents); if (err == -ENOSYS) { - fc->no_poll = 1; + fm->fc->no_poll = 1; return DEFAULT_POLLMASK; } return EPOLLERR; @@ -2845,7 +2805,7 @@ static void fuse_do_truncate(struct file *file) attr.ia_file = file; attr.ia_valid |= ATTR_FILE; - fuse_do_setattr(file_dentry(file), &attr, file); + fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) @@ -2860,11 +2820,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; - size_t count = iov_iter_count(iter); + size_t count = iov_iter_count(iter), shortened = 0; loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; @@ -2872,17 +2831,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode = file->f_mapping->host; i_size = i_size_read(inode); - if ((iov_iter_rw(iter) == READ) && (offset > i_size)) + if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) return 0; - /* optimization for short read */ - if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { - if (offset >= i_size) - return 0; - iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); - count = iov_iter_count(iter); - } - io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); if (!io) return -ENOMEM; @@ -2898,15 +2849,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. 
 	 */
-	io->async = async_dio;
+	io->async = ff->fm->fc->async_dio;
 	io->iocb = iocb;
 	io->blocking = is_sync_kiocb(iocb);
 
+	/* optimization for short read */
+	if (io->async && !io->write && offset + count > i_size) {
+		iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
+		shortened = count - iov_iter_count(iter);
+		count -= shortened;
+	}
+
 	/*
 	 * We cannot asynchronously extend the size of a file.
 	 * In such case the aio will behave exactly like sync io.
 	 */
-	if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
+	if ((offset + count > i_size) && io->write)
 		io->blocking = true;
 
 	if (io->async && io->blocking) {
@@ -2920,10 +2878,11 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (iov_iter_rw(iter) == WRITE) {
 		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
-		fuse_invalidate_attr(inode);
+		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
 	} else {
 		ret = __fuse_direct_read(io, iter, &pos);
 	}
+	iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
 
 	if (io->async) {
 		bool blocking = io->blocking;
@@ -2941,22 +2900,32 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	kref_put(&io->refcnt, fuse_io_release);
 
 	if (iov_iter_rw(iter) == WRITE) {
-		if (ret > 0)
-			fuse_write_update_size(inode, pos);
-		else if (ret < 0 && offset + count > i_size)
+		fuse_write_update_attr(inode, pos, ret);
+		/* For extending writes we already hold exclusive lock */
+		if (ret < 0 && offset + count > i_size)
 			fuse_do_truncate(file);
 	}
 
 	return ret;
 }
 
+static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
+{
+	int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
+
+	if (!err)
+		fuse_sync_writes(inode);
+
+	return err;
+}
+
 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 				loff_t length)
 {
 	struct fuse_file *ff = file->private_data;
 	struct inode *inode = file_inode(file);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct fuse_conn *fc = ff->fc;
+	struct fuse_mount *fm = ff->fm;
 	FUSE_ARGS(args);
 	struct fuse_fallocate_in inarg = {
 		.fh = ff->fh,
@@ -2965,39 +2934,55 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		.mode = mode
 	};
 	int err;
-	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
-			   (mode & FALLOC_FL_PUNCH_HOLE);
+	bool block_faults = FUSE_IS_DAX(inode) &&
+		(!(mode & FALLOC_FL_KEEP_SIZE) ||
+		 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
 
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
-	if (fc->no_fallocate)
+	if (fm->fc->no_fallocate)
 		return -EOPNOTSUPP;
 
-	if (lock_inode) {
-		inode_lock(inode);
-		if (mode & FALLOC_FL_PUNCH_HOLE) {
-			loff_t endbyte = offset + length - 1;
-			err = filemap_write_and_wait_range(inode->i_mapping,
-							   offset, endbyte);
-			if (err)
-				goto out;
-
-			fuse_sync_writes(inode);
-		}
+	inode_lock(inode);
+	if (block_faults) {
+		filemap_invalidate_lock(inode->i_mapping);
+		err = fuse_dax_break_layouts(inode, 0, -1);
+		if (err)
+			goto out;
 	}
 
+	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
+		loff_t endbyte = offset + length - 1;
+
+		err = fuse_writeback_range(inode, offset, endbyte);
+		if (err)
+			goto out;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    offset + length > i_size_read(inode)) {
+		err = inode_newsize_ok(inode, offset + length);
+		if (err)
+			goto out;
+	}
+
+	err = file_modified(file);
+	if (err)
+		goto out;
+
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
-	args.in.h.opcode = FUSE_FALLOCATE;
-	args.in.h.nodeid = ff->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	err = fuse_simple_request(fc, &args);
+	args.opcode = FUSE_FALLOCATE;
+	args.nodeid = ff->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	err = fuse_simple_request(fm, &args);
 	if (err == -ENOSYS) {
-		fc->no_fallocate = 1;
+		fm->fc->no_fallocate = 1;
 		err = -EOPNOTSUPP;
 	}
 	if (err)
@@ -3005,36 +2990,40 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 
 	/* we could have extended the file */
 	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
-		bool changed = fuse_write_update_size(inode, offset + length);
-
-		if (changed && fc->writeback_cache)
+		if (fuse_write_update_attr(inode, offset + length, length))
 			file_update_time(file);
 	}
 
-	if (mode & FALLOC_FL_PUNCH_HOLE)
+	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
 		truncate_pagecache_range(inode, offset, offset + length - 1);
 
-	fuse_invalidate_attr(inode);
+	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
 
 out:
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
 		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
-	if (lock_inode)
-		inode_unlock(inode);
+	if (block_faults)
+		filemap_invalidate_unlock(inode->i_mapping);
+
+	inode_unlock(inode);
+
+	fuse_flush_time_update(inode);
 
 	return err;
 }
 
-static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
-				    struct file *file_out, loff_t pos_out,
-				    size_t len, unsigned int flags)
+static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+				      struct file *file_out, loff_t pos_out,
+				      size_t len, unsigned int flags)
 {
 	struct fuse_file *ff_in = file_in->private_data;
 	struct fuse_file *ff_out = file_out->private_data;
+	struct inode *inode_in = file_inode(file_in);
 	struct inode *inode_out = file_inode(file_out);
 	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
-	struct fuse_conn *fc = ff_in->fc;
+	struct fuse_mount *fm = ff_in->fm;
+	struct fuse_conn *fc = fm->fc;
 	FUSE_ARGS(args);
 	struct fuse_copy_file_range_in inarg = {
 		.fh_in = ff_in->fh,
@@ -3046,6 +3035,8 @@ static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 		.flags = flags
 	};
 	struct fuse_write_out outarg;
+	struct fuse_copy_file_range_out outarg_64;
+	u64 bytes_copied;
 	ssize_t err;
 	/* mark unstable when write-back is not used, and file_out gets
 	 * extended */
@@ -3055,53 +3046,118 @@ static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 	if (fc->no_copy_file_range)
 		return -EOPNOTSUPP;
 
+	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+		return -EXDEV;
+
+	inode_lock(inode_in);
+	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
+	inode_unlock(inode_in);
+	if (err)
+		return err;
+
 	inode_lock(inode_out);
-	if (fc->writeback_cache) {
-		err = filemap_write_and_wait_range(inode_out->i_mapping,
-						   pos_out, pos_out + len);
-		if (err)
-			goto out;
+	err = file_modified(file_out);
+	if (err)
+		goto out;
 
-		fuse_sync_writes(inode_out);
-	}
+	/*
+	 * Write out dirty pages in the destination file before sending the COPY
+	 * request to userspace. After the request is completed, truncate off
+	 * pages (including partial ones) from the cache that have been copied,
+	 * since these contain stale data at that point.
+	 *
+	 * This should be mostly correct, but if the COPY writes to partial
+	 * pages (at the start or end) and the parts not covered by the COPY are
+	 * written through a memory map after calling fuse_writeback_range(),
+	 * then these partial page modifications will be lost on truncation.
+	 *
+	 * It is unlikely that someone would rely on such mixed style
+	 * modifications. Yet this does give less guarantees than if the
+	 * copying was performed with write(2).
+	 *
+	 * To fix this a mapping->invalidate_lock could be used to prevent new
+	 * faults while the copy is ongoing.
+	 */
+	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+	if (err)
+		goto out;
 
 	if (is_unstable)
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
 
-	args.in.h.opcode = FUSE_COPY_FILE_RANGE;
-	args.in.h.nodeid = ff_in->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
-	err = fuse_simple_request(fc, &args);
+	args.opcode = FUSE_COPY_FILE_RANGE_64;
+	args.nodeid = ff_in->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg_64);
+	args.out_args[0].value = &outarg_64;
+	if (fc->no_copy_file_range_64) {
+fallback:
+		/* Fall back to old op that can't handle large copy length */
+		args.opcode = FUSE_COPY_FILE_RANGE;
+		args.out_args[0].size = sizeof(outarg);
+		args.out_args[0].value = &outarg;
+		inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK);
+	}
+	err = fuse_simple_request(fm, &args);
 	if (err == -ENOSYS) {
-		fc->no_copy_file_range = 1;
-		err = -EOPNOTSUPP;
+		if (fc->no_copy_file_range_64) {
+			fc->no_copy_file_range = 1;
+			err = -EOPNOTSUPP;
+		} else {
+			fc->no_copy_file_range_64 = 1;
+			goto fallback;
+		}
 	}
 	if (err)
 		goto out;
 
-	if (fc->writeback_cache) {
-		fuse_write_update_size(inode_out, pos_out + outarg.size);
-		file_update_time(file_out);
+	bytes_copied = fc->no_copy_file_range_64 ?
+			outarg.size : outarg_64.bytes_copied;
+
+	if (bytes_copied > len) {
+		err = -EIO;
+		goto out;
 	}
 
-	fuse_invalidate_attr(inode_out);
+	truncate_inode_pages_range(inode_out->i_mapping,
+				   ALIGN_DOWN(pos_out, PAGE_SIZE),
+				   ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1);
 
-	err = outarg.size;
+	file_update_time(file_out);
+	fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied);
+
+	err = bytes_copied;
 out:
 	if (is_unstable)
 		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
 
 	inode_unlock(inode_out);
 
+	file_accessed(file_in);
+
+	fuse_flush_time_update(inode_out);
 
 	return err;
 }
 
+static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
+				    struct file *dst_file, loff_t dst_off,
+				    size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
+				     len, flags);
+
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = splice_copy_file_range(src_file, src_off, dst_file,
+					     dst_off, len);
+	return ret;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read_iter	= fuse_file_read_iter,
@@ -3112,8 +3168,10 @@ static const struct file_operations fuse_file_operations = {
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
 	.lock		= fuse_file_lock,
+	.get_unmapped_area = thp_get_unmapped_area,
 	.flock		= fuse_file_flock,
-	.splice_read	= generic_file_splice_read,
+	.splice_read	= fuse_splice_read,
+	.splice_write	= fuse_splice_write,
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
@@ -3121,47 +3179,37 @@ static const struct file_operations fuse_file_operations = {
 	.copy_file_range = fuse_copy_file_range,
 };
 
-static const struct file_operations fuse_direct_io_file_operations = {
-	.llseek		= fuse_file_llseek,
-	.read_iter	= fuse_direct_read_iter,
-	.write_iter	= fuse_direct_write_iter,
-	.mmap		= fuse_direct_mmap,
-	.open		= fuse_open,
-	.flush		= fuse_flush,
-	.release	= fuse_release,
-	.fsync		= fuse_fsync,
-	.lock		= fuse_file_lock,
-	.flock		= fuse_file_flock,
-	.unlocked_ioctl	= fuse_file_ioctl,
-	.compat_ioctl	= fuse_file_compat_ioctl,
-	.poll		= fuse_file_poll,
-	.fallocate	= fuse_file_fallocate,
-	/* no splice_read */
-};
-
 static const struct address_space_operations fuse_file_aops = {
-	.readpage	= fuse_readpage,
-	.writepage	= fuse_writepage,
+	.read_folio	= fuse_read_folio,
+	.readahead	= fuse_readahead,
 	.writepages	= fuse_writepages,
-	.launder_page	= fuse_launder_page,
-	.readpages	= fuse_readpages,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.launder_folio	= fuse_launder_folio,
+	.dirty_folio	= iomap_dirty_folio,
+	.release_folio	= iomap_release_folio,
+	.invalidate_folio = iomap_invalidate_folio,
+	.is_partially_uptodate = iomap_is_partially_uptodate,
+	.migrate_folio	= filemap_migrate_folio,
 	.bmap		= fuse_bmap,
 	.direct_IO	= fuse_direct_IO,
-	.write_begin	= fuse_write_begin,
-	.write_end	= fuse_write_end,
 };
 
-void fuse_init_file_inode(struct inode *inode)
+void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	inode->i_fop = &fuse_file_operations;
 	inode->i_data.a_ops = &fuse_file_aops;
+	if (fc->writeback_cache)
+		mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data);
 
 	INIT_LIST_HEAD(&fi->write_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
 	fi->writectr = 0;
+	fi->iocachectr = 0;
 	init_waitqueue_head(&fi->page_waitq);
-	INIT_LIST_HEAD(&fi->writepages);
+	init_waitqueue_head(&fi->direct_io_waitq);
+
+	if (IS_ENABLED(CONFIG_FUSE_DAX))
+		fuse_dax_inode_init(inode, flags);
 }
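
Note on the fuse_copy_file_range() change above: when the server returns -EOPNOTSUPP, or when the two files live on different superblocks (-EXDEV), the kernel now falls back to splice_copy_file_range() instead of failing the syscall. The user-space sketch below is illustrative only (file names and the error handling strategy are assumptions, not part of this diff); it shows how a caller on a FUSE mount might drive this path with copy_file_range(2), keeping a read/write fallback for older kernels or filesystems that still surface EXDEV/EOPNOTSUPP to user space.

/*
 * Minimal sketch: copy src to dst with copy_file_range(2).
 * Illustrative only; not part of the kernel change above.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}

	int fd_in = open(argv[1], O_RDONLY);
	int fd_out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd_in < 0 || fd_out < 0) {
		perror("open");
		return 1;
	}

	struct stat st;
	if (fstat(fd_in, &st) < 0) {
		perror("fstat");
		return 1;
	}

	off_t remaining = st.st_size;
	while (remaining > 0) {
		/* NULL offsets: the kernel advances both file positions. */
		ssize_t n = copy_file_range(fd_in, NULL, fd_out, NULL,
					    remaining, 0);
		if (n <= 0) {
			/*
			 * Older kernels/filesystems may still report EXDEV or
			 * EOPNOTSUPP here; a portable tool would fall back to
			 * a plain read/write loop at this point.
			 */
			perror("copy_file_range");
			return 1;
		}
		remaining -= n;
	}

	close(fd_in);
	close(fd_out);
	return 0;
}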
