Diffstat (limited to 'fs/fuse/file.c')
-rw-r--r-- | fs/fuse/file.c | 1093
1 file changed, 643 insertions(+), 450 deletions(-)
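The hunks below fold the old separately-allocated ff->release_args into a single ff->args buffer that is allocated only when the server actually implements open/opendir: it first carries the FUSE_OPEN reply for fuse_finish_open(), then is zeroed and reused as the FUSE_RELEASE request by fuse_prepare_release(). The authoritative definitions live in fs/fuse/fuse_i.h, which this diff does not include; the following is a hedged sketch of the shape implied by the usage in the hunks (ff->args->open_outarg, ff->args->release_args), not the verbatim header.

/*
 * Sketch only: shape reconstructed from the usage below; the real
 * definitions are in fs/fuse/fuse_i.h (not part of this diff).
 * struct fuse_open_out and struct fuse_release_in come from
 * include/uapi/linux/fuse.h.
 */
struct fuse_release_args {
	struct fuse_args args;		/* request skeleton for FUSE_RELEASE */
	struct fuse_release_in inarg;	/* fh, flags, optional lock_owner */
	struct inode *inode;		/* pinned until an async release finishes */
};

struct fuse_file_args {
	union {
		struct fuse_open_out open_outarg;	/* FUSE_OPEN reply, read by fuse_finish_open() */
		struct fuse_release_args release_args;	/* filled by fuse_prepare_release() */
	};
};

When the server returns -ENOSYS for open, the buffer is freed and ff->args becomes NULL, which is what fuse_file_put() keys on to skip sending FUSE_RELEASE.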
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 148a71b8b4d0..d63e56fd3dd2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -20,6 +20,7 @@ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/splice.h> +#include <linux/task_io_accounting_ops.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -50,13 +51,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, return fuse_simple_request(fm, &args); } -struct fuse_release_args { - struct fuse_args args; - struct fuse_release_in inarg; - struct inode *inode; -}; - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; @@ -65,15 +60,15 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) return NULL; ff->fm = fm; - ff->release_args = kzalloc(sizeof(*ff->release_args), - GFP_KERNEL_ACCOUNT); - if (!ff->release_args) { - kfree(ff); - return NULL; + if (release) { + ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT); + if (!ff->args) { + kfree(ff); + return NULL; + } } INIT_LIST_HEAD(&ff->write_entry); - mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -85,8 +80,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) void fuse_file_free(struct fuse_file *ff) { - kfree(ff->release_args); - mutex_destroy(&ff->readdir.lock); + kfree(ff->args); kfree(ff); } @@ -105,14 +99,17 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, kfree(ra); } -static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_args *args = &ff->release_args->args; + struct fuse_release_args *ra = &ff->args->release_args; + struct fuse_args *args = (ra ? &ra->args : NULL); - if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { - /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fm, args, 0); + if (ra && ra->inode) + fuse_file_io_release(ff, ra->inode); + + if (!args) { + /* Do nothing when server does not implement 'open' */ } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -132,27 +129,31 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + bool open = isdir ? !fc->no_opendir : !fc->no_open; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, open); if (!ff) return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); - if (isdir ? 
!fc->no_opendir : !fc->no_open) { - struct fuse_open_out outarg; + if (open) { + /* Store outarg for fuse_finish_open() */ + struct fuse_open_out *outargp = &ff->args->open_outarg; int err; - err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { - ff->fh = outarg.fh; - ff->open_flags = outarg.open_flags; - + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; if (isdir) fc->no_opendir = 1; else @@ -195,40 +196,50 @@ static void fuse_link_write_file(struct file *file) spin_unlock(&fi->lock); } -void fuse_finish_open(struct inode *inode, struct file *file) +int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + err = fuse_file_io_open(file, inode); + if (err) + return err; if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - i_size_write(inode, 0); - spin_unlock(&fi->lock); - file_update_time(file); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); + + return 0; +} + +static void fuse_truncate_update_attr(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; int err; - bool is_wb_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && - fc->writeback_cache; - bool dax_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && FUSE_IS_DAX(inode); + bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; + bool is_wb_truncate = is_truncate && fc->writeback_cache; + bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); if (fuse_is_bad(inode)) return -EIO; @@ -250,16 +261,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (is_wb_truncate || dax_truncate) fuse_set_nowrite(inode); - err = fuse_do_open(fm, get_node_id(inode), file, isdir); - if (!err) - fuse_finish_open(inode, file); + err = fuse_do_open(fm, get_node_id(inode), file, false); + if (!err) { + ff = file->private_data; + err = fuse_finish_open(inode, file); + if (err) + fuse_sync_release(fi, ff, file->f_flags); + else if (is_truncate) + fuse_truncate_update_attr(inode, file); + } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { - struct fuse_file *ff = file->private_data; - - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) 
invalidate_inode_pages2(inode->i_mapping); @@ -274,10 +289,13 @@ out_inode_unlock: } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - unsigned int flags, int opcode) + unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; + + if (fuse_file_passthrough(ff)) + fuse_passthrough_release(ff, fuse_inode_backing(fi)); /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -292,6 +310,11 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); + if (!ra) + return; + + /* ff->args was used for open outarg */ + memset(ff->args, 0, sizeof(*ff->args)); ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; @@ -301,23 +324,28 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nodeid = ff->nodeid; ra->args.force = true; ra->args.nocreds = true; + + /* + * Hold inode until release is finished. + * From fuse_sync_release() the refcount is 1 and everything's + * synchronous, so we are fine with not doing igrab() here. + */ + ra->inode = sync ? NULL : igrab(&fi->inode); } void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir) { struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, open_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode, false); - if (ff->flock) { + if (ra && ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } - /* Hold inode until release is finished */ - ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -328,7 +356,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. 
*/ - fuse_file_put(ff, ff->fm->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy); } void fuse_release_common(struct file *file, bool isdir) @@ -337,11 +365,6 @@ void fuse_release_common(struct file *file, bool isdir) (fl_owner_t) file, isdir); } -static int fuse_open(struct inode *inode, struct file *file) -{ - return fuse_open_common(inode, file, false); -} - static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -363,12 +386,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); - /* - * iput(NULL) is a no-op and since the refcount is 1 and everything's - * synchronous, we are fine with not doing igrab() here" - */ - fuse_file_put(ff, true, false); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -417,7 +436,7 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_pages) + if (idx_from >= curr_index + wpa->ia.ap.num_folios) n = n->rb_right; else if (idx_to < curr_index) n = n->rb_left; @@ -429,9 +448,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, /* * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. */ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, pgoff_t idx_to) @@ -439,6 +455,9 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, struct fuse_inode *fi = get_fuse_inode(inode); bool found; + if (RB_EMPTY_ROOT(&fi->writepages)) + return false; + spin_lock(&fi->lock); found = fuse_find_writeback(fi, idx_from, idx_to); spin_unlock(&fi->lock); @@ -464,6 +483,21 @@ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); } +static inline bool fuse_folio_is_writeback(struct inode *inode, + struct folio *folio) +{ + pgoff_t last = folio_next_index(folio) - 1; + return fuse_range_is_writeback(inode, folio_index(folio), last); +} + +static void fuse_wait_on_folio_writeback(struct inode *inode, + struct folio *folio) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); +} + /* * Wait for all pending writepages on the inode to finish. 
* @@ -626,16 +660,20 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_args_pages *ap, +static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, bool should_dirty) { unsigned int i; - for (i = 0; i < ap->num_pages; i++) { + for (i = 0; i < ap->num_folios; i++) { if (should_dirty) - set_page_dirty_lock(ap->pages[i]); - put_page(ap->pages[i]); + folio_mark_dirty_lock(ap->folios[i]); + if (ap->args.is_pinned) + unpin_folio(ap->folios[i]); } + + if (nres > 0 && ap->args.invalidate_vmap) + invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -705,16 +743,16 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) } static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, - unsigned int npages) + unsigned int nfolios) { struct fuse_io_args *ia; ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (ia) { ia->io = io; - ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, - &ia->ap.descs); - if (!ia->ap.pages) { + ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.folios) { kfree(ia); ia = NULL; } @@ -724,7 +762,7 @@ static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, static void fuse_io_free(struct fuse_io_args *ia) { - kfree(ia->ap.pages); + kfree(ia->ap.folios); kfree(ia); } @@ -734,25 +772,29 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - - fuse_release_user_pages(&ia->ap, io->should_dirty); + size_t nres; if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; - } else if (ia->write.in.size != ia->write.out.size) { - pos = ia->write.in.offset - io->offset + - ia->write.out.size; + } else { + nres = ia->write.out.size; + if (ia->write.in.size != ia->write.out.size) + pos = ia->write.in.offset - io->offset + + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; + nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } + fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); + fuse_aio_complete(io, err, pos); fuse_io_free(ia); } @@ -823,33 +865,33 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, * reached the client fs yet. So the hole is not present there. 
*/ if (!fc->writeback_cache) { - loff_t pos = page_offset(ap->pages[0]) + num_read; + loff_t pos = folio_pos(ap->folios[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } -static int fuse_do_readpage(struct file *file, struct page *page) +static int fuse_do_readfolio(struct file *file, struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); - loff_t pos = page_offset(page); - struct fuse_page_desc desc = { .length = PAGE_SIZE }; + loff_t pos = folio_pos(folio); + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, - .ap.num_pages = 1, - .ap.pages = &page, + .ap.num_folios = 1, + .ap.folios = &folio, .ap.descs = &desc, }; ssize_t res; u64 attr_ver; /* - * Page writeback can extend beyond the lifetime of the - * page-cache page, so make sure we read a properly synced - * page. + * With the temporary pages that are used to complete writeback, we can + * have writeback that extends beyond the lifetime of the folio. So + * make sure we read a properly synced folio. */ - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); attr_ver = fuse_get_attr_version(fm->fc); @@ -867,25 +909,24 @@ static int fuse_do_readpage(struct file *file, struct page *page) if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } static int fuse_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int err; err = -EIO; if (fuse_is_bad(inode)) goto out; - err = fuse_do_readpage(file, page); + err = fuse_do_readfolio(file, folio); fuse_invalidate_atime(inode); out: - unlock_page(page); + folio_unlock(folio); return err; } @@ -899,8 +940,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < ap->num_pages; i++) - mapping = ap->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_folios; i++) + mapping = ap->folios[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -914,18 +955,12 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; - - if (!err) - SetPageUptodate(page); - else - SetPageError(page); - unlock_page(page); - put_page(page); + for (i = 0; i < ap->num_folios; i++) { + folio_end_read(ap->folios[i], !err); + folio_put(ap->folios[i]); } if (ia->ff) - fuse_file_put(ia->ff, false, false); + fuse_file_put(ia->ff, false); fuse_io_free(ia); } @@ -935,8 +970,9 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; - loff_t pos = page_offset(ap->pages[0]); - size_t count = ap->num_pages << PAGE_SHIFT; + loff_t pos = folio_pos(ap->folios[0]); + /* Currently, all folios in FUSE are one page */ + size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -947,7 +983,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { count--; - 
ap->descs[ap->num_pages - 1].length--; + ap->descs[ap->num_folios - 1].length--; } WARN_ON((loff_t) (pos + count) < 0); @@ -969,18 +1005,36 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int i, max_pages, nr_pages = 0; + unsigned int max_pages, nr_pages; + pgoff_t first = readahead_index(rac); + pgoff_t last = first + readahead_count(rac) - 1; if (fuse_is_bad(inode)) return; + wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); + max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); - for (;;) { + /* + * This is only accurate the first time through, since readahead_folio() + * doesn't update readahead_count() from the previous folio until the + * next call. Grab nr_pages here so we know how many pages we're going + * to have to process. This means that we will exit here with + * readahead_count() == folio_nr_pages(last_folio), but we will have + * consumed all of the folios, and read_pages() will call + * readahead_folio() again which will clean up the rac. + */ + nr_pages = readahead_count(rac); + + while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; + struct folio *folio; + unsigned cur_pages = min(max_pages, nr_pages); if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -990,23 +1044,26 @@ static void fuse_readahead(struct readahead_control *rac) */ break; - nr_pages = readahead_count(rac) - nr_pages; - if (nr_pages > max_pages) - nr_pages = max_pages; - if (nr_pages == 0) - break; - ia = fuse_io_alloc(NULL, nr_pages); + ia = fuse_io_alloc(NULL, cur_pages); if (!ia) return; ap = &ia->ap; - nr_pages = __readahead_batch(rac, ap->pages, nr_pages); - for (i = 0; i < nr_pages; i++) { - fuse_wait_on_page_writeback(inode, - readahead_index(rac) + i); - ap->descs[i].length = PAGE_SIZE; + + while (ap->num_folios < cur_pages) { + /* + * This returns a folio with a ref held on it. + * The ref needs to be held until the request is + * completed, since the splice case (see + * fuse_try_move_page()) drops the ref after it's + * replaced in the page cache. 
+ */ + folio = __readahead_folio(rac); + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].length = folio_size(folio); + ap->num_folios++; } - ap->num_pages = nr_pages; fuse_send_readpages(ia, rac->file); + nr_pages -= cur_pages; } } @@ -1123,8 +1180,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, bool short_write; int err; - for (i = 0; i < ap->num_pages; i++) - fuse_wait_on_page_writeback(inode, ap->pages[i]->index); + for (i = 0; i < ap->num_folios; i++) + fuse_wait_on_folio_writeback(inode, ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1138,24 +1195,24 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; - for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; + for (i = 0; i < ap->num_folios; i++) { + struct folio *folio = ap->folios[i]; if (err) { - ClearPageUptodate(page); + folio_clear_uptodate(folio); } else { - if (count >= PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; + if (count >= folio_size(folio) - offset) + count -= folio_size(folio) - offset; else { if (short_write) - ClearPageUptodate(page); + folio_clear_uptodate(folio); count = 0; } offset = 0; } - if (ia->write.page_locked && (i == ap->num_pages - 1)) - unlock_page(page); - put_page(page); + if (ia->write.folio_locked && (i == ap->num_folios - 1)) + folio_unlock(folio); + folio_put(folio); } return err; @@ -1169,6 +1226,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); + unsigned int nr_pages = 0; size_t count = 0; int err; @@ -1177,7 +1235,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, do { size_t tmp; - struct page *page; + struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; size_t bytes = min_t(size_t, PAGE_SIZE - offset, iov_iter_count(ii)); @@ -1189,27 +1247,30 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (fault_in_iov_iter_readable(ii, bytes)) break; - err = -ENOMEM; - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); break; + } if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + flush_dcache_folio(folio); - tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); - flush_dcache_page(page); + tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + flush_dcache_folio(folio); if (!tmp) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto again; } err = 0; - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = tmp; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].length = tmp; + ap->num_folios++; + nr_pages++; count += tmp; pos += tmp; @@ -1219,18 +1280,18 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, /* If we copied full page, mark it uptodate */ if (tmp == PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); } else { - ia->write.page_locked = true; + ia->write.folio_locked = true; break; } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - ap->num_pages < 
max_pages && offset == 0); + nr_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1264,8 +1325,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); - if (!ap->pages) { + ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->folios) { err = -ENOMEM; break; } @@ -1287,7 +1348,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) err = -EIO; } } - kfree(ap->pages); + kfree(ap->folios); } while (!err && iov_iter_count(ii)); fuse_write_update_attr(inode, pos, res); @@ -1299,13 +1360,94 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) return res; } +static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + +/* + * @return true if an exclusive lock for direct IO writes is needed + */ +static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* Server side has to advise that it supports parallel dio writes. */ + if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) + return true; + + /* + * Append will need to know the eventual EOF - always needs an + * exclusive lock. + */ + if (iocb->ki_flags & IOCB_APPEND) + return true; + + /* shared locks are not allowed with parallel page cache IO */ + if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) + return true; + + /* Parallel dio beyond EOF is not supported, at least for now. */ + if (fuse_io_past_eof(iocb, from)) + return true; + + return false; +} + +static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, + bool *exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); + if (*exclusive) { + inode_lock(inode); + } else { + inode_lock_shared(inode); + /* + * New parallal dio allowed only if inode is not in caching + * mode and denies new opens in caching mode. This check + * should be performed only after taking shared inode lock. + * Previous past eof check was without inode lock and might + * have raced, so check it again. 
+ */ + if (fuse_io_past_eof(iocb, from) || + fuse_inode_uncached_io_start(fi, NULL) != 0) { + inode_unlock_shared(inode); + inode_lock(inode); + *exclusive = true; + } + } +} + +static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + if (exclusive) { + inode_unlock(inode); + } else { + /* Allow opens in caching mode after last parallel dio end */ + fuse_inode_uncached_io_end(fi); + inode_unlock_shared(inode); + } +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct address_space *mapping = file->f_mapping; ssize_t written = 0; struct inode *inode = mapping->host; - ssize_t err; + ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -1316,7 +1458,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(&nop_mnt_idmap, + setattr_should_drop_suidgid(idmap, file_inode(file))) { goto writethrough; } @@ -1327,15 +1469,13 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - err = generic_write_checks(iocb, from); + err = count = generic_write_checks(iocb, from); if (err <= 0) goto out; - err = file_remove_privs(file); - if (err) - goto out; + task_io_account_write(count); - err = file_update_time(file); + err = kiocb_modified(iocb); if (err) goto out; @@ -1369,55 +1509,97 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, - unsigned int max_pages) + unsigned int max_pages, + bool use_pages_for_kvec_io) { + bool flush_or_invalidate = false; + unsigned int nr_pages = 0; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly into the buffer */ + /* Special case for kernel I/O: can copy directly into the buffer. + * However if the implementation of fuse_conn requires pages instead of + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. + */ if (iov_iter_is_kvec(ii)) { - unsigned long user_addr = fuse_get_user_addr(ii); - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + void *user_addr = (void *)fuse_get_user_addr(ii); - if (write) - ap->args.in_args[1].value = (void *) user_addr; - else - ap->args.out_args[0].value = (void *) user_addr; + if (!use_pages_for_kvec_io) { + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; + if (write) + ap->args.in_args[1].value = user_addr; + else + ap->args.out_args[0].value = user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + if (is_vmalloc_addr(user_addr)) { + ap->args.vmap_base = user_addr; + flush_or_invalidate = true; + } } - while (nbytes < *nbytesp && ap->num_pages < max_pages) { - unsigned npages; + /* + * Until there is support for iov_iter_extract_folios(), we have to + * manually extract pages using iov_iter_extract_pages() and then + * copy that to a folios array. 
+ */ + struct page **pages = kzalloc(max_pages * sizeof(struct page *), + GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + while (nbytes < *nbytesp && nr_pages < max_pages) { + unsigned nfolios, i; size_t start; - ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages], - *nbytesp - nbytes, - max_pages - ap->num_pages, - &start); + + ret = iov_iter_extract_pages(ii, &pages, + *nbytesp - nbytes, + max_pages - nr_pages, + 0, &start); if (ret < 0) break; nbytes += ret; - ret += start; - npages = DIV_ROUND_UP(ret, PAGE_SIZE); + nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); - ap->descs[ap->num_pages].offset = start; - fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); + for (i = 0; i < nfolios; i++) { + struct folio *folio = page_folio(pages[i]); + unsigned int offset = start + + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); + unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); - ap->num_pages += npages; - ap->descs[ap->num_pages - 1].length -= - (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = len; + ap->folios[ap->num_folios] = folio; + start = 0; + ret -= len; + ap->num_folios++; + } + + nr_pages += nfolios; } + kfree(pages); + + if (write && flush_or_invalidate) + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); + ap->args.invalidate_vmap = !write && flush_or_invalidate; + ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) ap->args.in_pages = true; else ap->args.out_pages = true; +out: *nbytesp = nbytes; return ret < 0 ? ret : 0; @@ -1479,7 +1661,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, - max_pages); + max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; @@ -1493,7 +1675,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, nres, io->should_dirty); fuse_io_free(ia); } ia = NULL; @@ -1547,7 +1729,7 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t res; - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, to); } else { struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); @@ -1558,63 +1740,27 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } -static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, - struct iov_iter *iter) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); -} - static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct file *file = iocb->ki_filp; - struct fuse_file *ff = file->private_data; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = - !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || - get_fuse_conn(inode)->direct_io_allow_mmap || - iocb->ki_flags & IOCB_APPEND || - fuse_direct_write_extending_i_size(iocb, from); - - /* - * Take exclusive lock if - * - Parallel direct writes are disabled - a user space decision - * - Parallel direct writes are enabled and i_size is being extended. - * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). 
- * This might not be needed at all, but needs further investigation. - */ - if (exclusive_lock) - inode_lock(inode); - else { - inode_lock_shared(inode); - - /* A race with truncate might have come up as the decision for - * the lock type was done without holding the lock, check again. - */ - if (fuse_direct_write_extending_i_size(iocb, from)) { - inode_unlock_shared(inode); - inode_lock(inode); - exclusive_lock = true; - } - } + bool exclusive; + fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + task_io_account_write(res); + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, from); } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, res); } } - if (exclusive_lock) - inode_unlock(inode); - else - inode_unlock_shared(inode); + fuse_dio_unlock(iocb, exclusive); return res; } @@ -1631,10 +1777,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) - return fuse_cache_read_iter(iocb, to); - else + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_read_iter(iocb, to); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_read_iter(iocb, to); + else + return fuse_cache_read_iter(iocb, to); } static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1649,10 +1798,38 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) + return fuse_direct_write_iter(iocb, from); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_write_iter(iocb, from); + else return fuse_cache_write_iter(iocb, from); +} + +static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); else - return fuse_direct_write_iter(iocb, from); + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); + else + return iter_file_splice_write(pipe, out, ppos, len, flags); } static void fuse_writepage_free(struct fuse_writepage_args *wpa) @@ -1663,30 +1840,34 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_pages; i++) - __free_page(ap->pages[i]); + for (i = 0; i < ap->num_folios; i++) + folio_put(ap->folios[i]); - if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false, false); + fuse_file_put(wpa->ia.ff, false); - kfree(ap->pages); + kfree(ap->folios); kfree(wpa); } -static void 
fuse_writepage_finish(struct fuse_mount *fm, - struct fuse_writepage_args *wpa) +static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); +} + +static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_pages; i++) { - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - } + for (i = 0; i < ap->num_folios; i++) + fuse_writepage_finish_stat(inode, ap->folios[i]); + wake_up(&fi->page_waitq); } @@ -1700,7 +1881,8 @@ __acquires(fi->lock) struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; - __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + /* Currently, all folios in FUSE are one page */ + __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; int err; fi->writectr++; @@ -1733,13 +1915,15 @@ __acquires(fi->lock) out_free: fi->writectr--; rb_erase(&wpa->writepages_entry, &fi->writepages); - fuse_writepage_finish(fm, wpa); + fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - /* After fuse_writepage_finish() aux request list is private */ + /* After rb_erase() aux request list is private */ for (aux = wpa->next; aux; aux = next) { next = aux->next; aux->next = NULL; + fuse_writepage_finish_stat(aux->inode, + aux->ia.ap.folios[0]); fuse_writepage_free(aux); } @@ -1774,11 +1958,11 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, struct fuse_writepage_args *wpa) { pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - WARN_ON(!wpa->ia.ap.num_pages); + WARN_ON(!wpa->ia.ap.num_folios); while (*p) { struct fuse_writepage_args *curr; pgoff_t curr_index; @@ -1789,7 +1973,7 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, WARN_ON(curr->inode != wpa->inode); curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + curr->ia.ap.num_pages) + if (idx_from >= curr_index + curr->ia.ap.num_folios) p = &(*p)->rb_right; else if (idx_to < curr_index) p = &(*p)->rb_left; @@ -1834,7 +2018,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, wpa->next = next->next; next->next = NULL; - next->ia.ff = fuse_file_get(wpa->ia.ff); tree_insert(&fi->writepages, next); /* @@ -1863,7 +2046,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, fuse_send_writepage(fm, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fm, wpa); + fuse_writepage_finish(wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } @@ -1909,7 +2092,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, false, false); + fuse_file_put(ff, false); return err; } @@ -1922,9 +2105,9 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) wpa = kzalloc(sizeof(*wpa), GFP_NOFS); if (wpa) { ap = 
&wpa->ia.ap; - ap->num_pages = 0; - ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); - if (!ap->pages) { + ap->num_folios = 0; + ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->folios) { kfree(wpa); wpa = NULL; } @@ -1947,49 +2130,77 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, rcu_read_unlock(); } -static int fuse_writepage_locked(struct page *page) +static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, + struct folio *tmp_folio, uint32_t folio_index) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + struct inode *inode = folio->mapping->host; + struct fuse_args_pages *ap = &wpa->ia.ap; + + folio_copy(tmp_folio, folio); + + ap->folios[folio_index] = tmp_folio; + ap->descs[folio_index].offset = 0; + ap->descs[folio_index].length = PAGE_SIZE; + + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); +} + +static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, + struct fuse_file *ff) +{ + struct inode *inode = folio->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct page *tmp_page; - int error = -ENOMEM; - - set_page_writeback(page); wpa = fuse_writepage_args_alloc(); if (!wpa) - goto err; + return NULL; + + fuse_writepage_add_to_bucket(fc, wpa); + fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->inode = inode; + wpa->ia.ff = ff; + ap = &wpa->ia.ap; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + + return wpa; +} + +static int fuse_writepage_locked(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + struct folio *tmp_folio; + struct fuse_file *ff; + int error = -ENOMEM; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) - goto err_free; + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) + goto err; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fi); - if (!wpa->ia.ff) + ff = fuse_write_file_get(fi); + if (!ff) goto err_nofile; - fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + wpa = fuse_writepage_args_setup(folio, ff); + error = -ENOMEM; + if (!wpa) + goto err_writepage_args; - copy_highpage(tmp_page, page); - wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; - ap->args.in_pages = true; - ap->num_pages = 1; - ap->pages[0] = tmp_page; - ap->descs[0].offset = 0; - ap->descs[0].length = PAGE_SIZE; - ap->args.end = fuse_writepage_end; - wpa->inode = inode; + ap = &wpa->ia.ap; + ap->num_folios = 1; - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + folio_start_writeback(folio); + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); spin_lock(&fi->lock); tree_insert(&fi->writepages, wpa); @@ -1997,78 +2208,49 @@ static int fuse_writepage_locked(struct page *page) fuse_flush_writepages(inode); spin_unlock(&fi->lock); - end_page_writeback(page); + folio_end_writeback(folio); return 0; +err_writepage_args: + fuse_file_put(ff, false); err_nofile: - __free_page(tmp_page); -err_free: - kfree(wpa); + folio_put(tmp_folio); err: - 
mapping_set_error(page->mapping, error); - end_page_writeback(page); + mapping_set_error(folio->mapping, error); return error; } -static int fuse_writepage(struct page *page, struct writeback_control *wbc) -{ - struct fuse_conn *fc = get_fuse_conn(page->mapping->host); - int err; - - if (fuse_page_is_writeback(page->mapping->host, page->index)) { - /* - * ->writepages() should be called for sync() and friends. We - * should only get here on direct reclaim and then we are - * allowed to skip a page which is already in flight - */ - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - if (wbc->sync_mode == WB_SYNC_NONE && - fc->num_background >= fc->congestion_threshold) - return AOP_WRITEPAGE_ACTIVATE; - - err = fuse_writepage_locked(page); - unlock_page(page); - - return err; -} - struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct page **orig_pages; - unsigned int max_pages; + struct folio **orig_folios; + unsigned int max_folios; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) { struct fuse_args_pages *ap = &data->wpa->ia.ap; struct fuse_conn *fc = get_fuse_conn(data->inode); - struct page **pages; - struct fuse_page_desc *descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, data->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int nfolios = min_t(unsigned int, + max_t(unsigned int, data->max_folios * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), fc->max_pages); - WARN_ON(npages <= data->max_pages); + WARN_ON(nfolios <= data->max_folios); - pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); - if (!pages) + folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); + if (!folios) return false; - memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); - memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); - kfree(ap->pages); - ap->pages = pages; + memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); + memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios); + kfree(ap->folios); + ap->folios = folios; ap->descs = descs; - data->max_pages = npages; + data->max_folios = nfolios; return true; } @@ -2078,17 +2260,16 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = wpa->ia.ap.num_pages; + int num_folios = wpa->ia.ap.num_folios; int i; - wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - for (i = 0; i < num_pages; i++) - end_page_writeback(data->orig_pages[i]); + for (i = 0; i < num_folios; i++) + folio_end_writeback(data->orig_folios[i]); } /* @@ -2099,15 +2280,15 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * swapping the new temp page with the old one. 
*/ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct page *page) + struct folio *folio) { struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); struct fuse_writepage_args *tmp; struct fuse_writepage_args *old_wpa; struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_ap->num_pages != 0); - new_ap->num_pages = 1; + WARN_ON(new_ap->num_folios != 0); + new_ap->num_folios = 1; spin_lock(&fi->lock); old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); @@ -2121,9 +2302,9 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, WARN_ON(tmp->inode != new_wpa->inode); curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == page->index) { - WARN_ON(tmp->ia.ap.num_pages != 1); - swap(tmp->ia.ap.pages[0], new_ap->pages[0]); + if (curr_index == folio->index) { + WARN_ON(tmp->ia.ap.num_folios != 1); + swap(tmp->ia.ap.folios[0], new_ap->folios[0]); break; } } @@ -2136,22 +2317,19 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, spin_unlock(&fi->lock); if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); + fuse_writepage_finish_stat(new_wpa->inode, + folio); fuse_writepage_free(new_wpa); } return false; } -static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, +static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, struct fuse_args_pages *ap, struct fuse_fill_wb_data *data) { - WARN_ON(!ap->num_pages); + WARN_ON(!ap->num_folios); /* * Being under writeback is unlikely but possible. For example direct @@ -2159,23 +2337,23 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, * the pages are faulted with get_user_pages(), and then after the read * completed. */ - if (fuse_page_is_writeback(data->inode, page->index)) + if (fuse_folio_is_writeback(data->inode, folio)) return true; /* Reached max pages */ - if (ap->num_pages == fc->max_pages) + if (ap->num_folios == fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) + if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) return true; /* Discontinuity */ - if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? 
*/ - if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) + if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data)) return true; return false; @@ -2190,7 +2368,7 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct page *tmp_page; + struct folio *tmp_folio; int err; if (!data->ff) { @@ -2200,14 +2378,14 @@ static int fuse_writepages_fill(struct folio *folio, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } err = -ENOMEM; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) goto out_unlock; /* @@ -2219,40 +2397,25 @@ static int fuse_writepages_fill(struct folio *folio, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_pages. + * request to the fi->writepages list and increment ap->num_folios. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. */ if (data->wpa == NULL) { err = -ENOMEM; - wpa = fuse_writepage_args_alloc(); + wpa = fuse_writepage_args_setup(folio, data->ff); if (!wpa) { - __free_page(tmp_page); + folio_put(tmp_folio); goto out_unlock; } - fuse_writepage_add_to_bucket(fc, wpa); - - data->max_pages = 1; - + fuse_file_get(wpa->ia.ff); + data->max_folios = 1; ap = &wpa->ia.ap; - fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); - wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; - ap->args.in_pages = true; - ap->args.end = fuse_writepage_end; - ap->num_pages = 0; - wpa->inode = inode; } folio_start_writeback(folio); - copy_highpage(tmp_page, &folio->page); - ap->pages[ap->num_pages] = tmp_page; - ap->descs[ap->num_pages].offset = 0; - ap->descs[ap->num_pages].length = PAGE_SIZE; - data->orig_pages[ap->num_pages] = &folio->page; - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); + data->orig_folios[ap->num_folios] = folio; err = 0; if (data->wpa) { @@ -2261,9 +2424,9 @@ static int fuse_writepages_fill(struct folio *folio, * fuse_page_is_writeback(). 
*/ spin_lock(&fi->lock); - ap->num_pages++; + ap->num_folios++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, &folio->page)) { + } else if (fuse_writepage_add(wpa, folio)) { data->wpa = wpa; } else { folio_end_writeback(folio); @@ -2295,21 +2458,21 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(fc->max_pages, - sizeof(struct page *), - GFP_NOFS); - if (!data.orig_pages) + data.orig_folios = kcalloc(fc->max_pages, + sizeof(struct folio *), + GFP_NOFS); + if (!data.orig_folios) goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { - WARN_ON(!data.wpa->ia.ap.num_pages); + WARN_ON(!data.wpa->ia.ap.num_folios); fuse_writepages_send(&data); } if (data.ff) - fuse_file_put(data.ff, false, false); + fuse_file_put(data.ff, false); - kfree(data.orig_pages); + kfree(data.orig_folios); out: return err; } @@ -2319,76 +2482,77 @@ out: * but how to implement it without killing performance need more thinking. */ static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct page *page; + struct folio *folio; loff_t fsize; int err = -ENOMEM; WARN_ON(!fc->writeback_cache); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, page->index); + fuse_wait_on_page_writeback(mapping->host, folio->index); - if (PageUptodate(page) || len == PAGE_SIZE) + if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* - * Check if the start this page comes after the end of file, in which - * case the readpage can be optimized away. + * Check if the start of this folio comes after the end of file, + * in which case the readpage can be optimized away. */ fsize = i_size_read(mapping->host); - if (fsize <= (pos & PAGE_MASK)) { - size_t off = pos & ~PAGE_MASK; + if (fsize <= folio_pos(folio)) { + size_t off = offset_in_folio(folio, pos); if (off) - zero_user_segment(page, 0, off); + folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, page); + err = fuse_do_readfolio(file, folio); if (err) goto cleanup; success: - *pagep = page; + *foliop = folio; return 0; cleanup: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); error: return err; } static int fuse_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; /* Haven't copied anything? Skip zeroing, size extending, dirtying. 
*/ if (!copied) goto unlock; pos += copied; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* Zero any unwritten bytes at the end of the page */ size_t endoff = pos & ~PAGE_MASK; if (endoff) - zero_user_segment(page, endoff, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, endoff, PAGE_SIZE); + folio_mark_uptodate(folio); } if (pos > inode->i_size) i_size_write(inode, pos); - set_page_dirty(page); + folio_mark_dirty(folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -2401,7 +2565,7 @@ static int fuse_launder_folio(struct folio *folio) /* Serialize with pending writeback for the same page */ fuse_wait_on_page_writeback(inode, folio->index); - err = fuse_writepage_locked(&folio->page); + err = fuse_writepage_locked(folio); if (!err) fuse_wait_on_page_writeback(inode, folio->index); } @@ -2437,17 +2601,17 @@ static void fuse_vma_close(struct vm_area_struct *vma) */ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != inode->i_mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); return VM_FAULT_NOPAGE; } - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); return VM_FAULT_LOCKED; } @@ -2462,13 +2626,30 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; + struct inode *inode = file_inode(file); + int rc; /* DAX mmap is superior to direct_io mmap */ - if (FUSE_IS_DAX(file_inode(file))) + if (FUSE_IS_DAX(inode)) return fuse_dax_mmap(file, vma); + /* + * If inode is in passthrough io mode, because it has some file open + * in passthrough mode, either mmap to backing file or fail mmap, + * because mixing cached mmap and passthrough io mode is not allowed. + */ + if (fuse_file_passthrough(ff)) + return fuse_passthrough_mmap(file, vma); + else if (fuse_inode_backing(get_fuse_inode(inode))) + return -ENODEV; + + /* + * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, + * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. + */ if (ff->open_flags & FOPEN_DIRECT_IO) { - /* Can't provide the coherency needed for MAP_SHARED + /* + * Can't provide the coherency needed for MAP_SHARED * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. */ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) @@ -2476,7 +2657,21 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) invalidate_inode_pages2(file->f_mapping); - return generic_file_mmap(file, vma); + if (!(vma->vm_flags & VM_MAYSHARE)) { + /* MAP_PRIVATE */ + return generic_file_mmap(file, vma); + } + + /* + * First mmap of direct_io file enters caching inode io mode. + * Also waits for parallel dio writers to go into serial mode + * (exclusive instead of shared lock). + * After first mmap, the inode stays in caching io mode until + * the direct_io file release. + */ + rc = fuse_file_cached_io_open(inode, ff); + if (rc) + return rc; } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) @@ -2509,14 +2704,14 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, * translate it into the caller's pid namespace. 
*/ rcu_read_lock(); - fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); + fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; default: return -EIO; } - fl->fl_type = ffl->type; + fl->c.flc_type = ffl->type; return 0; } @@ -2530,10 +2725,10 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, memset(inarg, 0, sizeof(*inarg)); inarg->fh = ff->fh; - inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner); inarg->lk.start = fl->fl_start; inarg->lk.end = fl->fl_end; - inarg->lk.type = fl->fl_type; + inarg->lk.type = fl->c.flc_type; inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; @@ -2570,8 +2765,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; - int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; - struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; + int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL; pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; @@ -2580,10 +2775,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return -ENOLCK; } - /* Unlock on close is handled by the flush method */ - if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) - return 0; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fm, &args); @@ -2871,7 +3062,7 @@ static void fuse_do_truncate(struct file *file) attr.ia_file = file; attr.ia_valid |= ATTR_FILE; - fuse_do_setattr(file_dentry(file), &attr, file); + fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) @@ -3213,8 +3404,8 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, + .splice_read = fuse_splice_read, + .splice_write = fuse_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, @@ -3225,10 +3416,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .read_folio = fuse_read_folio, .readahead = fuse_readahead, - .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, @@ -3245,7 +3436,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; + fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); + init_waitqueue_head(&fi->direct_io_waitq); fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) |
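Several hunks above apply the same precedence rule for choosing the per-file I/O path: DAX wins outright, FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH, and cached I/O is the fallback — see fuse_file_read_iter() and fuse_file_write_iter() (the new splice hooks check only the FOPEN_DIRECT_IO/passthrough part). Below is a minimal illustrative helper capturing that ordering; the enum and function are not part of the patch.

/* Illustrative only: the ordering used by the dispatch hunks above. */
enum fuse_io_path {
	FUSE_IO_DAX,
	FUSE_IO_DIRECT,
	FUSE_IO_PASSTHROUGH,
	FUSE_IO_CACHED,
};

static enum fuse_io_path fuse_pick_io_path(struct inode *inode, struct fuse_file *ff)
{
	if (FUSE_IS_DAX(inode))			/* DAX is superior to direct_io */
		return FUSE_IO_DAX;
	if (ff->open_flags & FOPEN_DIRECT_IO)	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
		return FUSE_IO_DIRECT;
	if (fuse_file_passthrough(ff))
		return FUSE_IO_PASSTHROUGH;
	return FUSE_IO_CACHED;
}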
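The new fuse_dio_lock()/fuse_dio_unlock() helpers centralize the parallel direct-write policy: a shared inode lock is used only if the server set FOPEN_PARALLEL_DIRECT_WRITES, the write neither appends nor extends past EOF, and the inode is not in caching io mode; the EOF and caching checks are redone under the shared lock, falling back to the exclusive lock on a race. A hedged usage sketch, modeled on fuse_direct_write_iter() above (the function name here is hypothetical):

/* Hypothetical caller, mirroring fuse_direct_write_iter() above. */
static ssize_t example_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	bool exclusive;
	ssize_t res;

	fuse_dio_lock(iocb, from, &exclusive);	/* shared iff parallel dio is safe */
	res = generic_write_checks(iocb, from);
	if (res > 0) {
		task_io_account_write(res);
		res = fuse_direct_IO(iocb, from);	/* sync kiocbs use fuse_direct_io() instead */
	}
	fuse_dio_unlock(iocb, exclusive);	/* shared case also ends uncached-io mode */
	return res;
}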