summaryrefslogtreecommitdiff
path: root/fs/netfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/netfs')
-rw-r--r--fs/netfs/Kconfig18
-rw-r--r--fs/netfs/Makefile10
-rw-r--r--fs/netfs/buffered_read.c623
-rw-r--r--fs/netfs/buffered_write.c1175
-rw-r--r--fs/netfs/direct_read.c166
-rw-r--r--fs/netfs/direct_write.c74
-rw-r--r--fs/netfs/fscache_cache.c2
-rw-r--r--fs/netfs/fscache_cookie.c6
-rw-r--r--fs/netfs/fscache_io.c15
-rw-r--r--fs/netfs/fscache_main.c1
-rw-r--r--fs/netfs/fscache_volume.c17
-rw-r--r--fs/netfs/internal.h117
-rw-r--r--fs/netfs/io.c787
-rw-r--r--fs/netfs/iterator.c50
-rw-r--r--fs/netfs/locking.c25
-rw-r--r--fs/netfs/main.c68
-rw-r--r--fs/netfs/misc.c219
-rw-r--r--fs/netfs/objects.c113
-rw-r--r--fs/netfs/output.c478
-rw-r--r--fs/netfs/read_collect.c706
-rw-r--r--fs/netfs/read_pgpriv2.c227
-rw-r--r--fs/netfs/read_retry.c313
-rw-r--r--fs/netfs/read_single.c195
-rw-r--r--fs/netfs/rolling_buffer.c222
-rw-r--r--fs/netfs/stats.c49
-rw-r--r--fs/netfs/write_collect.c550
-rw-r--r--fs/netfs/write_issue.c927
-rw-r--r--fs/netfs/write_retry.c234
28 files changed, 4734 insertions, 2653 deletions
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index bec805e0c44c..7701c037c328 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -22,6 +22,14 @@ config NETFS_STATS
between CPUs. On the other hand, the stats are very useful for
debugging purposes. Saying 'Y' here is recommended.
+config NETFS_DEBUG
+ bool "Enable dynamic debugging netfslib and FS-Cache"
+ depends on NETFS_SUPPORT
+ help
+ This permits debugging to be dynamically enabled in the local caching
+ management module. If this is set, the debugging output may be
+ enabled by setting bits in /sys/module/netfs/parameters/debug.
+
config FSCACHE
bool "General filesystem local caching manager"
depends on NETFS_SUPPORT
@@ -50,13 +58,3 @@ config FSCACHE_STATS
debugging purposes. Saying 'Y' here is recommended.
See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_DEBUG
- bool "Debug FS-Cache"
- depends on FSCACHE
- help
- This permits debugging to be dynamically enabled in the local caching
- management module. If this is set, the debugging output may be
- enabled by setting bits in /sys/modules/fscache/parameter/debug.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index d4d1d799819e..b43188d64bd8 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -5,13 +5,19 @@ netfs-y := \
buffered_write.o \
direct_read.o \
direct_write.o \
- io.o \
iterator.o \
locking.o \
main.o \
misc.o \
objects.o \
- output.o
+ read_collect.o \
+ read_pgpriv2.o \
+ read_retry.o \
+ read_single.o \
+ rolling_buffer.o \
+ write_collect.o \
+ write_issue.o \
+ write_retry.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 3298c29b5548..0d1b6d35ff3b 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -9,114 +9,10 @@
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
-/*
- * Unlock the folios in a read operation. We need to set PG_fscache on any
- * folios we're going to write back before we unlock them.
- */
-void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_folio *finfo;
- struct folio *folio;
- pgoff_t start_page = rreq->start / PAGE_SIZE;
- pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
- size_t account = 0;
- bool subreq_failed = false;
-
- XA_STATE(xas, &rreq->mapping->i_pages, start_page);
-
- if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
- __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
- }
- }
-
- /* Walk through the pagecache and the I/O request lists simultaneously.
- * We may have a mixture of cached and uncached sections and we only
- * really want to write out the uncached sections. This is slightly
- * complicated by the possibility that we might have huge pages with a
- * mixture inside.
- */
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- subreq_failed = (subreq->error < 0);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
-
- rcu_read_lock();
- xas_for_each(&xas, folio, last_page) {
- loff_t pg_end;
- bool pg_failed = false;
- bool folio_started;
-
- if (xas_retry(&xas, folio))
- continue;
-
- pg_end = folio_pos(folio) + folio_size(folio) - 1;
-
- folio_started = false;
- for (;;) {
- loff_t sreq_end;
-
- if (!subreq) {
- pg_failed = true;
- break;
- }
- if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
- trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
- folio_start_fscache(folio);
- folio_started = true;
- }
- pg_failed |= subreq_failed;
- sreq_end = subreq->start + subreq->len - 1;
- if (pg_end < sreq_end)
- break;
-
- account += subreq->transferred;
- if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- subreq = list_next_entry(subreq, rreq_link);
- subreq_failed = (subreq->error < 0);
- } else {
- subreq = NULL;
- subreq_failed = false;
- }
-
- if (pg_end == sreq_end)
- break;
- }
-
- if (!pg_failed) {
- flush_dcache_folio(folio);
- finfo = netfs_folio_info(folio);
- if (finfo) {
- trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
- if (finfo->netfs_group)
- folio_change_private(folio, finfo->netfs_group);
- else
- folio_detach_private(folio);
- kfree(finfo);
- }
- folio_mark_uptodate(folio);
- }
-
- if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio->index == rreq->no_unlock_folio &&
- test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
- _debug("no unlock");
- else
- folio_unlock(folio);
- }
- }
- rcu_read_unlock();
-
- task_io_account_read(account);
- if (rreq->netfs_ops->done)
- rreq->netfs_ops->done(rreq);
-}
-
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
- loff_t *_start, size_t *_len, loff_t i_size)
+ unsigned long long *_start,
+ unsigned long long *_len,
+ unsigned long long i_size)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
@@ -167,6 +63,262 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}
+/*
+ * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O
+ * @subreq: The subrequest to be set up
+ *
+ * Prepare the I/O iterator representing the read buffer on a subrequest for
+ * the filesystem to use for I/O (it can be passed directly to a socket). This
+ * is intended to be called from the ->issue_read() method once the filesystem
+ * has trimmed the request to the size it wants.
+ *
+ * Returns the limited size if successful and -ENOMEM if insufficient memory
+ * available.
+ *
+ * [!] NOTE: This must be run in the same thread as ->issue_read() was called
+ * in as we access the readahead_control struct.
+ */
+static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ size_t rsize = subreq->len;
+
+ if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER)
+ rsize = umin(rsize, rreq->io_streams[0].sreq_max_len);
+
+ if (rreq->ractl) {
+ /* If we don't have sufficient folios in the rolling buffer,
+ * extract a folioq's worth from the readahead region at a time
+ * into the buffer. Note that this acquires a ref on each page
+ * that we will need to release later - but we don't want to do
+ * that until after we've started the I/O.
+ */
+ struct folio_batch put_batch;
+
+ folio_batch_init(&put_batch);
+ while (rreq->submitted < subreq->start + rsize) {
+ ssize_t added;
+
+ added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl,
+ &put_batch);
+ if (added < 0)
+ return added;
+ rreq->submitted += added;
+ }
+ folio_batch_release(&put_batch);
+ }
+
+ subreq->len = rsize;
+ if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
+ size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize,
+ rreq->io_streams[0].sreq_max_segs);
+
+ if (limit < rsize) {
+ subreq->len = limit;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
+
+ subreq->io_iter = rreq->buffer.iter;
+
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ rolling_buffer_advance(&rreq->buffer, subreq->len);
+ return subreq->len;
+}
+
+static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq,
+ loff_t i_size)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ enum netfs_io_source source;
+
+ if (!cres->ops)
+ return NETFS_DOWNLOAD_FROM_SERVER;
+ source = cres->ops->prepare_read(subreq, i_size);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ return source;
+
+}
+
+/*
+ * Issue a read against the cache.
+ * - Eats the caller's ref on subreq.
+ */
+static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ netfs_stat(&netfs_n_rh_read);
+ cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_IGNORE,
+ netfs_cache_read_terminated, subreq);
+}
+
+static void netfs_queue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq,
+ bool last_subreq)
+{
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ /* We add to the end of the list whilst the collector may be walking
+ * the list. The collector only goes nextwards and uses the lock to
+ * remove entries off of the front.
+ */
+ spin_lock(&rreq->lock);
+ list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+ stream->front = subreq;
+ if (!stream->active) {
+ stream->collected_to = stream->front->start;
+ /* Store list pointers before active flag */
+ smp_store_release(&stream->active, true);
+ }
+ }
+
+ if (last_subreq) {
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ }
+
+ spin_unlock(&rreq->lock);
+}
+
+static void netfs_issue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ switch (subreq->source) {
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ rreq->netfs_ops->issue_read(subreq);
+ break;
+ case NETFS_READ_FROM_CACHE:
+ netfs_read_cache_to_pagecache(rreq, subreq);
+ break;
+ default:
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ subreq->error = 0;
+ iov_iter_zero(subreq->len, &subreq->io_iter);
+ subreq->transferred = subreq->len;
+ netfs_read_subreq_terminated(subreq);
+ break;
+ }
+}
+
+/*
+ * Perform a read to the pagecache from a series of sources of different types,
+ * slicing up the region to be read according to available cache blocks and
+ * network rsize.
+ */
+static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+{
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
+ unsigned long long start = rreq->start;
+ ssize_t size = rreq->len;
+ int ret = 0;
+
+ do {
+ struct netfs_io_subrequest *subreq;
+ enum netfs_io_source source = NETFS_SOURCE_UNKNOWN;
+ ssize_t slice;
+
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ subreq->start = start;
+ subreq->len = size;
+
+ source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size);
+ subreq->source = source;
+ if (source == NETFS_DOWNLOAD_FROM_SERVER) {
+ unsigned long long zp = umin(ictx->zero_point, rreq->i_size);
+ size_t len = subreq->len;
+
+ if (unlikely(rreq->origin == NETFS_READ_SINGLE))
+ zp = rreq->i_size;
+ if (subreq->start >= zp) {
+ subreq->source = source = NETFS_FILL_WITH_ZEROES;
+ goto fill_with_zeroes;
+ }
+
+ if (len > zp - subreq->start)
+ len = zp - subreq->start;
+ if (len == 0) {
+ pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx",
+ rreq->debug_id, subreq->debug_index,
+ subreq->len, size,
+ subreq->start, ictx->zero_point, rreq->i_size);
+ break;
+ }
+ subreq->len = len;
+
+ netfs_stat(&netfs_n_rh_download);
+ if (rreq->netfs_ops->prepare_read) {
+ ret = rreq->netfs_ops->prepare_read(subreq);
+ if (ret < 0) {
+ subreq->error = ret;
+ /* Not queued - release both refs. */
+ netfs_put_subrequest(subreq, false,
+ netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, false,
+ netfs_sreq_trace_put_cancel);
+ break;
+ }
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ }
+ goto issue;
+ }
+
+ fill_with_zeroes:
+ if (source == NETFS_FILL_WITH_ZEROES) {
+ subreq->source = NETFS_FILL_WITH_ZEROES;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ netfs_stat(&netfs_n_rh_zero);
+ goto issue;
+ }
+
+ if (source == NETFS_READ_FROM_CACHE) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ goto issue;
+ }
+
+ pr_err("Unexpected read source %u\n", source);
+ WARN_ON_ONCE(1);
+ break;
+
+ issue:
+ slice = netfs_prepare_read_iterator(subreq);
+ if (slice < 0) {
+ ret = slice;
+ subreq->error = ret;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_cancel);
+ /* Not queued - release both refs. */
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ break;
+ }
+ size -= slice;
+ start += slice;
+
+ netfs_queue_read(rreq, subreq, size <= 0);
+ netfs_issue_read(rreq, subreq);
+ cond_resched();
+ } while (size > 0);
+
+ if (unlikely(size > 0)) {
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ netfs_wake_read_collector(rreq);
+ }
+
+ /* Defer error return as we may need to wait for outstanding I/O. */
+ cmpxchg(&rreq->error, 0, ret);
+}
+
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -185,22 +337,19 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
void netfs_readahead(struct readahead_control *ractl)
{
struct netfs_io_request *rreq;
- struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
+ struct netfs_inode *ictx = netfs_inode(ractl->mapping->host);
+ unsigned long long start = readahead_pos(ractl);
+ size_t size = readahead_length(ractl);
int ret;
- _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
-
- if (readahead_count(ractl) == 0)
- return;
-
- rreq = netfs_alloc_request(ractl->mapping, ractl->file,
- readahead_pos(ractl),
- readahead_length(ractl),
+ rreq = netfs_alloc_request(ractl->mapping, ractl->file, start, size,
NETFS_READAHEAD);
if (IS_ERR(rreq))
return;
- ret = netfs_begin_cache_read(rreq, ctx);
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags);
+
+ ret = netfs_begin_cache_read(rreq, ictx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto cleanup_free;
@@ -210,18 +359,13 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
- /* Set up the output buffer */
- iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
- rreq->start, rreq->len);
-
- /* Drop the refs on the folios here rather than in the cache or
- * filesystem. The locks will be dropped in netfs_rreq_unlock().
- */
- while (readahead_folio(ractl))
- ;
+ rreq->ractl = ractl;
+ rreq->submitted = rreq->start;
+ if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0)
+ goto cleanup_free;
+ netfs_read_to_pagecache(rreq);
- netfs_begin_read(rreq, false);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
return;
cleanup_free:
@@ -230,6 +374,112 @@ cleanup_free:
}
EXPORT_SYMBOL(netfs_readahead);
+/*
+ * Create a rolling buffer with a single occupying folio.
+ */
+static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio,
+ unsigned int rollbuf_flags)
+{
+ ssize_t added;
+
+ if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0)
+ return -ENOMEM;
+
+ added = rolling_buffer_append(&rreq->buffer, folio, rollbuf_flags);
+ if (added < 0)
+ return added;
+ rreq->submitted = rreq->start + added;
+ rreq->ractl = (struct readahead_control *)1UL;
+ return 0;
+}
+
+/*
+ * Read into gaps in a folio partially filled by a streaming write.
+ */
+static int netfs_read_gaps(struct file *file, struct folio *folio)
+{
+ struct netfs_io_request *rreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ struct folio *sink = NULL;
+ struct bio_vec *bvec;
+ unsigned int from = finfo->dirty_offset;
+ unsigned int to = from + finfo->dirty_len;
+ unsigned int off = 0, i = 0;
+ size_t flen = folio_size(folio);
+ size_t nr_bvec = flen / PAGE_SIZE + 2;
+ size_t part;
+ int ret;
+
+ _enter("%lx", folio->index);
+
+ rreq = netfs_alloc_request(mapping, file, folio_pos(folio), flen, NETFS_READ_GAPS);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto alloc_error;
+ }
+
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
+
+ netfs_stat(&netfs_n_rh_read_folio);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_read_gaps);
+
+ /* Fiddle the buffer so that a gap at the beginning and/or a gap at the
+ * end get copied to, but the middle is discarded.
+ */
+ ret = -ENOMEM;
+ bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ goto discard;
+
+ sink = folio_alloc(GFP_KERNEL, 0);
+ if (!sink) {
+ kfree(bvec);
+ goto discard;
+ }
+
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+
+ rreq->direct_bv = bvec;
+ rreq->direct_bv_count = nr_bvec;
+ if (from > 0) {
+ bvec_set_folio(&bvec[i++], folio, from, 0);
+ off = from;
+ }
+ while (off < to) {
+ part = min_t(size_t, to - off, PAGE_SIZE);
+ bvec_set_folio(&bvec[i++], sink, part, 0);
+ off += part;
+ }
+ if (to < flen)
+ bvec_set_folio(&bvec[i++], folio, flen - to, to);
+ iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len);
+ rreq->submitted = rreq->start + flen;
+
+ netfs_read_to_pagecache(rreq);
+
+ if (sink)
+ folio_put(sink);
+
+ ret = netfs_wait_for_read(rreq);
+ if (ret >= 0) {
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ }
+ folio_unlock(folio);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret < 0 ? ret : 0;
+
+discard:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+alloc_error:
+ folio_unlock(folio);
+ return ret;
+}
+
/**
* netfs_read_folio - Helper to manage a read_folio request
* @file: The file to read from
@@ -249,13 +499,17 @@ int netfs_read_folio(struct file *file, struct folio *folio)
struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(mapping->host);
- struct folio *sink = NULL;
int ret;
+ if (folio_test_dirty(folio)) {
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+ return netfs_read_gaps(file, folio);
+ }
+
_enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
- folio_file_pos(folio), folio_size(folio),
+ folio_pos(folio), folio_size(folio),
NETFS_READPAGE);
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
@@ -266,58 +520,16 @@ int netfs_read_folio(struct file *file, struct folio *folio)
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto discard;
- netfs_stat(&netfs_n_rh_readpage);
+ netfs_stat(&netfs_n_rh_read_folio);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
/* Set up the output buffer */
- if (folio_test_dirty(folio)) {
- /* Handle someone trying to read from an unflushed streaming
- * write. We fiddle the buffer so that a gap at the beginning
- * and/or a gap at the end get copied to, but the middle is
- * discarded.
- */
- struct netfs_folio *finfo = netfs_folio_info(folio);
- struct bio_vec *bvec;
- unsigned int from = finfo->dirty_offset;
- unsigned int to = from + finfo->dirty_len;
- unsigned int off = 0, i = 0;
- size_t flen = folio_size(folio);
- size_t nr_bvec = flen / PAGE_SIZE + 2;
- size_t part;
-
- ret = -ENOMEM;
- bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
- if (!bvec)
- goto discard;
-
- sink = folio_alloc(GFP_KERNEL, 0);
- if (!sink)
- goto discard;
-
- trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
-
- rreq->direct_bv = bvec;
- rreq->direct_bv_count = nr_bvec;
- if (from > 0) {
- bvec_set_folio(&bvec[i++], folio, from, 0);
- off = from;
- }
- while (off < to) {
- part = min_t(size_t, to - off, PAGE_SIZE);
- bvec_set_folio(&bvec[i++], sink, part, 0);
- off += part;
- }
- if (to < flen)
- bvec_set_folio(&bvec[i++], folio, flen - to, to);
- iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
- } else {
- iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
- rreq->start, rreq->len);
- }
+ ret = netfs_create_singular_buffer(rreq, folio, 0);
+ if (ret < 0)
+ goto discard;
- ret = netfs_begin_read(rreq, true);
- if (sink)
- folio_put(sink);
+ netfs_read_to_pagecache(rreq);
+ ret = netfs_wait_for_read(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
@@ -355,7 +567,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
if (unlikely(always_fill)) {
if (pos - offset + len <= i_size)
return false; /* Page entirely before EOF */
- zero_user_segment(&folio->page, 0, plen);
+ folio_zero_segment(folio, 0, plen);
folio_mark_uptodate(folio);
return true;
}
@@ -374,12 +586,12 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
return false;
zero_out:
- zero_user_segments(&folio->page, 0, offset, offset + len, plen);
+ folio_zero_segments(folio, 0, offset, offset + len, plen);
return true;
}
/**
- * netfs_write_begin - Helper to prepare for writing
+ * netfs_write_begin - Helper to prepare for writing [DEPRECATED]
* @ctx: The netfs context
* @file: The file to read from
* @mapping: The mapping to read from
@@ -390,13 +602,10 @@ zero_out:
*
* Pre-read data for a write-begin request by drawing data from the cache if
* possible, or the netfs if not. Space beyond the EOF is zero-filled.
- * Multiple I/O requests from different sources will get munged together. If
- * necessary, the readahead window can be expanded in either direction to a
- * more convenient alighment for RPC efficiency or to make storage in the cache
- * feasible.
+ * Multiple I/O requests from different sources will get munged together.
*
* The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory.
+ * issue_read, is mandatory.
*
* The check_write_begin() operation can be provided to check for and flush
* conflicting writes once the folio is grabbed and locked. It is passed a
@@ -410,6 +619,9 @@ zero_out:
* inode before calling this.
*
* This is usable whether or not caching is enabled.
+ *
+ * Note that this should be considered deprecated and netfs_perform_write()
+ * used instead.
*/
int netfs_write_begin(struct netfs_inode *ctx,
struct file *file, struct address_space *mapping,
@@ -421,8 +633,6 @@ int netfs_write_begin(struct netfs_inode *ctx,
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
- DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
-
retry:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
@@ -443,7 +653,7 @@ retry:
if (folio_test_uptodate(folio))
goto have_folio;
- /* If the page is beyond the EOF, we want to clear it - unless it's
+ /* If the folio is beyond the EOF, we want to clear it - unless it's
* within the cache granule containing the EOF, in which case we need
* to preload the granule.
*/
@@ -454,7 +664,7 @@ retry:
}
rreq = netfs_alloc_request(mapping, file,
- folio_file_pos(folio), folio_size(folio),
+ folio_pos(folio), folio_size(folio),
NETFS_READ_FOR_WRITE);
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
@@ -470,28 +680,19 @@ retry:
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
- /* Expand the request to meet caching requirements and download
- * preferences.
- */
- ractl._nr_pages = folio_nr_pages(folio);
- netfs_rreq_expand(rreq, &ractl);
-
/* Set up the output buffer */
- iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
- rreq->start, rreq->len);
-
- /* We hold the folio locks, so we can drop the references */
- folio_get(folio);
- while (readahead_folio(&ractl))
- ;
+ ret = netfs_create_singular_buffer(rreq, folio, 0);
+ if (ret < 0)
+ goto error_put;
- ret = netfs_begin_read(rreq, true);
+ netfs_read_to_pagecache(rreq);
+ ret = netfs_wait_for_read(rreq);
if (ret < 0)
goto error;
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
- ret = folio_wait_fscache_killable(folio);
+ ret = folio_wait_private_2_killable(folio);
if (ret < 0)
goto error;
have_folio_no_wait:
@@ -512,7 +713,7 @@ error:
EXPORT_SYMBOL(netfs_write_begin);
/*
- * Preload the data into a page we're proposing to write into.
+ * Preload the data into a folio we're proposing to write into.
*/
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len)
@@ -545,12 +746,14 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
/* Set up the output buffer */
- iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
- rreq->start, rreq->len);
+ ret = netfs_create_singular_buffer(rreq, folio, NETFS_ROLLBUF_PAGECACHE_MARK);
+ if (ret < 0)
+ goto error_put;
- ret = netfs_begin_read(rreq, true);
+ netfs_read_to_pagecache(rreq);
+ ret = netfs_wait_for_read(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
- return ret;
+ return ret < 0 ? ret : 0;
error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 9a0d32e4b422..b4826360a411 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
+/* Network filesystem high-level buffered write support.
*
* Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -13,104 +13,22 @@
#include <linux/pagevec.h>
#include "internal.h"
-/*
- * Determined write method. Adjust netfs_folio_traces if this is changed.
- */
-enum netfs_how_to_modify {
- NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
- NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
- NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
- NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
- NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
- NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
- NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
-};
-
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
-
-static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
- if (netfs_group && !folio_get_private(folio))
+ if (netfs_group)
folio_attach_private(folio, netfs_get_group(netfs_group));
}
-#if IS_ENABLED(CONFIG_FSCACHE)
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
- if (caching)
- folio_start_fscache(folio);
-}
-#else
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
-}
-#endif
-
-/*
- * Decide how we should modify a folio. We might be attempting to do
- * write-streaming, in which case we don't want to a local RMW cycle if we can
- * avoid it. If we're doing local caching or content crypto, we award that
- * priority over avoiding RMW. If the file is open readably, then we also
- * assume that we may want to read what we wrote.
- */
-static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
- struct file *file,
- struct folio *folio,
- void *netfs_group,
- size_t flen,
- size_t offset,
- size_t len,
- bool maybe_trouble)
+static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
- struct netfs_folio *finfo = netfs_folio_info(folio);
- loff_t pos = folio_file_pos(folio);
-
- _enter("");
-
- if (netfs_folio_group(folio) != netfs_group)
- return NETFS_FLUSH_CONTENT;
-
- if (folio_test_uptodate(folio))
- return NETFS_FOLIO_IS_UPTODATE;
+ void *priv = folio_get_private(folio);
- if (pos >= ctx->zero_point)
- return NETFS_MODIFY_AND_CLEAR;
-
- if (!maybe_trouble && offset == 0 && len >= flen)
- return NETFS_WHOLE_FOLIO_MODIFY;
-
- if (file->f_mode & FMODE_READ)
- goto no_write_streaming;
- if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
- goto no_write_streaming;
-
- if (netfs_is_cache_enabled(ctx)) {
- /* We don't want to get a streaming write on a file that loses
- * caching service temporarily because the backing store got
- * culled.
- */
- if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
- set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
- goto no_write_streaming;
- }
-
- if (!finfo)
- return NETFS_STREAMING_WRITE;
-
- /* We can continue a streaming write only if it continues on from the
- * previous. If it overlaps, we must flush lest we suffer a partial
- * copy and disjoint dirty regions.
- */
- if (offset == finfo->dirty_offset + finfo->dirty_len)
- return NETFS_STREAMING_WRITE_CONT;
- return NETFS_FLUSH_CONTENT;
-
-no_write_streaming:
- if (finfo) {
- netfs_stat(&netfs_n_wh_wstream_conflict);
- return NETFS_FLUSH_CONTENT;
+ if (unlikely(priv != netfs_group)) {
+ if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+ else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+ folio_detach_private(folio);
}
- return NETFS_JUST_PREFETCH;
}
/*
@@ -130,17 +48,48 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
+/*
+ * Update i_size and estimate the update to i_blocks to reflect the additional
+ * data written into the pagecache until we can find out from the server what
+ * the values actually are.
+ */
+static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+ loff_t i_size, loff_t pos, size_t copied)
+{
+ blkcnt_t add;
+ size_t gap;
+
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ return;
+ }
+
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+
+ gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
+ if (copied > gap) {
+ add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
+
+ inode->i_blocks = min_t(blkcnt_t,
+ DIV_ROUND_UP(pos, SECTOR_SIZE),
+ inode->i_blocks + add);
+ }
+}
+
/**
* netfs_perform_write - Copy data into the pagecache.
* @iocb: The operation parameters
* @iter: The source buffer
- * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
*
- * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * Copy data into pagecache folios attached to the inode specified by @iocb.
* The caller must hold appropriate inode locks.
*
- * Dirty pages are tagged with a netfs_folio struct if they're not up to date
- * to indicate the range modified. Dirty pages may also be tagged with a
+ * Dirty folios are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty folios may also be tagged with a
* netfs-specific grouping such that data from an old group gets flushed before
* a new one is started.
*/
@@ -159,28 +108,24 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
.range_end = iocb->ki_pos + iter->count,
};
struct netfs_io_request *wreq = NULL;
- struct netfs_folio *finfo;
- struct folio *folio;
- enum netfs_how_to_modify howto;
- enum netfs_folio_trace trace;
- unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
- ssize_t written = 0, ret;
- loff_t i_size, pos = iocb->ki_pos, from, to;
- size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+ struct folio *folio = NULL, *writethrough = NULL;
+ unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
+ ssize_t written = 0, ret, ret2;
+ loff_t i_size, pos = iocb->ki_pos;
+ size_t max_chunk = mapping_max_folio_size(mapping);
bool maybe_trouble = false;
if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
) {
- if (pos < i_size_read(inode)) {
- ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
- if (ret < 0) {
- goto out;
- }
- }
-
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+ ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
+ if (ret < 0) {
+ wbc_detach_inode(&wbc);
+ goto out;
+ }
+
wreq = netfs_begin_writethrough(iocb, iter->count);
if (IS_ERR(wreq)) {
wbc_detach_inode(&wbc);
@@ -190,19 +135,20 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
if (!is_sync_kiocb(iocb))
wreq->iocb = iocb;
- wreq->cleanup = netfs_cleanup_buffered_write;
+ netfs_stat(&netfs_n_wh_writethrough);
+ } else {
+ netfs_stat(&netfs_n_wh_buffered_write);
}
do {
+ struct netfs_folio *finfo;
+ struct netfs_group *group;
+ unsigned long long fpos;
size_t flen;
size_t offset; /* Offset into pagecache folio */
size_t part; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
- ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
- if (unlikely(ret < 0))
- break;
-
offset = pos & (max_chunk - 1);
part = min(max_chunk - offset, iov_iter_count(iter));
@@ -228,95 +174,125 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
flen = folio_size(folio);
- offset = pos & (flen - 1);
+ fpos = folio_pos(folio);
+ offset = pos - fpos;
part = min_t(size_t, flen - offset, part);
+ /* Wait for writeback to complete. The writeback engine owns
+ * the info in folio->private and may change it until it
+ * removes the WB mark.
+ */
+ if (folio_get_private(folio) &&
+ folio_wait_writeback_killable(folio)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
if (signal_pending(current)) {
ret = written ? -EINTR : -ERESTARTSYS;
goto error_folio_unlock;
}
- /* See if we need to prefetch the area we're going to modify.
- * We need to do this before we get a lock on the folio in case
- * there's more than one writer competing for the same cache
- * block.
+ /* Decide how we should modify a folio. We might be attempting
+ * to do write-streaming, in which case we don't want to a
+ * local RMW cycle if we can avoid it. If we're doing local
+ * caching or content crypto, we award that priority over
+ * avoiding RMW. If the file is open readably, then we also
+ * assume that we may want to read what we wrote.
*/
- howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
- flen, offset, part, maybe_trouble);
- _debug("howto %u", howto);
- switch (howto) {
- case NETFS_JUST_PREFETCH:
- ret = netfs_prefetch_for_write(file, folio, offset, part);
- if (ret < 0) {
- _debug("prefetch = %zd", ret);
- goto error_folio_unlock;
- }
- break;
- case NETFS_FOLIO_IS_UPTODATE:
- case NETFS_WHOLE_FOLIO_MODIFY:
- case NETFS_STREAMING_WRITE_CONT:
- break;
- case NETFS_MODIFY_AND_CLEAR:
- zero_user_segment(&folio->page, 0, offset);
- break;
- case NETFS_STREAMING_WRITE:
- ret = -EIO;
- if (WARN_ON(folio_get_private(folio)))
- goto error_folio_unlock;
- break;
- case NETFS_FLUSH_CONTENT:
- trace_netfs_folio(folio, netfs_flush_content);
- from = folio_pos(folio);
- to = from + folio_size(folio) - 1;
- folio_unlock(folio);
- folio_put(folio);
- ret = filemap_write_and_wait_range(mapping, from, to);
- if (ret < 0)
- goto error_folio_unlock;
- continue;
+ finfo = netfs_folio_info(folio);
+ group = netfs_folio_group(folio);
+
+ if (unlikely(group != netfs_group) &&
+ group != NETFS_FOLIO_COPY_TO_CACHE)
+ goto flush_content;
+
+ if (folio_test_uptodate(folio)) {
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
+ netfs_set_group(folio, netfs_group);
+ trace_netfs_folio(folio, netfs_folio_is_uptodate);
+ goto copied;
}
- if (mapping_writably_mapped(mapping))
- flush_dcache_folio(folio);
-
- copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
-
- flush_dcache_folio(folio);
-
- /* Deal with a (partially) failed copy */
- if (copied == 0) {
- ret = -EFAULT;
- goto error_folio_unlock;
+ /* If the page is above the zero-point then we assume that the
+ * server would just return a block of zeros or a short read if
+ * we try to read it.
+ */
+ if (fpos >= ctx->zero_point) {
+ folio_zero_segment(folio, 0, offset);
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
+ folio_zero_segment(folio, offset + copied, flen);
+ __netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ trace_netfs_folio(folio, netfs_modify_and_clear);
+ goto copied;
}
- trace = (enum netfs_folio_trace)howto;
- switch (howto) {
- case NETFS_FOLIO_IS_UPTODATE:
- case NETFS_JUST_PREFETCH:
- netfs_set_group(folio, netfs_group);
- break;
- case NETFS_MODIFY_AND_CLEAR:
- zero_user_segment(&folio->page, offset + copied, flen);
- netfs_set_group(folio, netfs_group);
- folio_mark_uptodate(folio);
- break;
- case NETFS_WHOLE_FOLIO_MODIFY:
+ /* See if we can write a whole folio in one go. */
+ if (!maybe_trouble && offset == 0 && part >= flen) {
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
if (unlikely(copied < part)) {
maybe_trouble = true;
iov_iter_revert(iter, copied);
copied = 0;
+ folio_unlock(folio);
goto retry;
}
- netfs_set_group(folio, netfs_group);
+ __netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
- break;
- case NETFS_STREAMING_WRITE:
+ trace_netfs_folio(folio, netfs_whole_folio_modify);
+ goto copied;
+ }
+
+ /* We don't want to do a streaming write on a file that loses
+ * caching service temporarily because the backing store got
+ * culled and we don't really want to get a streaming write on
+ * a file that's open for reading as ->read_folio() then has to
+ * be able to flush it.
+ */
+ if ((file->f_mode & FMODE_READ) ||
+ netfs_is_cache_enabled(ctx)) {
+ if (finfo) {
+ netfs_stat(&netfs_n_wh_wstream_conflict);
+ goto flush_content;
+ }
+ ret = netfs_prefetch_for_write(file, folio, offset, part);
+ if (ret < 0) {
+ _debug("prefetch = %zd", ret);
+ goto error_folio_unlock;
+ }
+ /* Note that copy-to-cache may have been set. */
+
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
+ netfs_set_group(folio, netfs_group);
+ trace_netfs_folio(folio, netfs_just_prefetch);
+ goto copied;
+ }
+
+ if (!finfo) {
+ ret = -EIO;
+ if (WARN_ON(folio_get_private(folio)))
+ goto error_folio_unlock;
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
if (offset == 0 && copied == flen) {
- netfs_set_group(folio, netfs_group);
+ __netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
- trace = netfs_streaming_filled_page;
- break;
+ trace_netfs_folio(folio, netfs_streaming_filled_page);
+ goto copied;
}
+
finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
if (!finfo) {
iov_iter_revert(iter, copied);
@@ -328,9 +304,18 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
finfo->dirty_len = copied;
folio_attach_private(folio, (void *)((unsigned long)finfo |
NETFS_FOLIO_INFO));
- break;
- case NETFS_STREAMING_WRITE_CONT:
- finfo = netfs_folio_info(folio);
+ trace_netfs_folio(folio, netfs_streaming_write);
+ goto copied;
+ }
+
+ /* We can continue a streaming write only if it continues on
+ * from the previous. If it overlaps, we must flush lest we
+ * suffer a partial copy and disjoint dirty regions.
+ */
+ if (offset == finfo->dirty_offset + finfo->dirty_len) {
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
finfo->dirty_len += copied;
if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
if (finfo->netfs_group)
@@ -339,72 +324,78 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_detach_private(folio);
folio_mark_uptodate(folio);
kfree(finfo);
- trace = netfs_streaming_cont_filled_page;
+ trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
+ } else {
+ trace_netfs_folio(folio, netfs_streaming_write_cont);
}
- break;
- default:
- WARN(true, "Unexpected modify type %u ix=%lx\n",
- howto, folio->index);
- ret = -EIO;
- goto error_folio_unlock;
+ goto copied;
}
- trace_netfs_folio(folio, trace);
+ /* Incompatible write; flush the folio and try again. */
+ flush_content:
+ trace_netfs_folio(folio, netfs_flush_content);
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
+ if (ret < 0)
+ goto error_folio_unlock;
+ continue;
+
+ copied:
+ flush_dcache_folio(folio);
/* Update the inode size if we moved the EOF marker */
- i_size = i_size_read(inode);
pos += copied;
- if (pos > i_size) {
- if (ctx->ops->update_i_size) {
- ctx->ops->update_i_size(inode, pos);
- } else {
- i_size_write(inode, pos);
-#if IS_ENABLED(CONFIG_FSCACHE)
- fscache_update_cookie(ctx->cache, NULL, &pos);
-#endif
- }
- }
+ i_size = i_size_read(inode);
+ if (pos > i_size)
+ netfs_update_i_size(ctx, inode, i_size, pos, copied);
written += copied;
if (likely(!wreq)) {
folio_mark_dirty(folio);
+ folio_unlock(folio);
} else {
- if (folio_test_dirty(folio))
- /* Sigh. mmap. */
- folio_clear_dirty_for_io(folio);
- /* We make multiple writes to the folio... */
- if (!folio_test_writeback(folio)) {
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
- folio_start_fscache(folio);
- if (wreq->iter.count == 0)
- trace_netfs_folio(folio, netfs_folio_trace_wthru);
- else
- trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
- }
- netfs_advance_writethrough(wreq, copied,
- offset + copied == flen);
+ netfs_advance_writethrough(wreq, &wbc, folio, copied,
+ offset + copied == flen,
+ &writethrough);
+ /* Folio unlocked */
}
retry:
- folio_unlock(folio);
folio_put(folio);
folio = NULL;
+ ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
+ if (unlikely(ret < 0))
+ break;
+
cond_resched();
} while (iov_iter_count(iter));
out:
+ if (likely(written)) {
+ /* Set indication that ctime and mtime got updated in case
+ * close is deferred.
+ */
+ set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
+ if (unlikely(ctx->ops->post_modify))
+ ctx->ops->post_modify(inode);
+ }
+
if (unlikely(wreq)) {
- ret = netfs_end_writethrough(wreq, iocb);
+ ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
wbc_detach_inode(&wbc);
- if (ret == -EIOCBQUEUED)
- return ret;
+ if (ret2 == -EIOCBQUEUED)
+ return ret2;
+ if (ret == 0)
+ ret = ret2;
}
iocb->ki_pos += written;
_leave(" = %zd [%zd]", written, ret);
return written ? written : ret;
+copy_failed:
+ ret = -EFAULT;
error_folio_unlock:
folio_unlock(folio);
folio_put(folio);
@@ -416,7 +407,7 @@ EXPORT_SYMBOL(netfs_perform_write);
* netfs_buffered_write_iter_locked - write data to a file
* @iocb: IO state structure (file, offset, etc.)
* @from: iov_iter with data to write
- * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
@@ -500,37 +491,44 @@ EXPORT_SYMBOL(netfs_file_write_iter);
/*
* Notification that a previously read-only page is about to become writable.
- * Note that the caller indicates a single page of a multipage folio.
+ * The caller indicates the precise page that needs to be written to, but
+ * we only track group on a per-folio basis, so we block more often than
+ * we might otherwise.
*/
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
+ struct netfs_group *group;
struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file);
- vm_fault_t ret = VM_FAULT_RETRY;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ vm_fault_t ret = VM_FAULT_NOPAGE;
int err;
_enter("%lx", folio->index);
sb_start_pagefault(inode->i_sb);
- if (folio_wait_writeback_killable(folio))
- goto out;
-
if (folio_lock_killable(folio) < 0)
goto out;
+ if (folio->mapping != mapping)
+ goto unlock;
+ if (folio_wait_writeback_killable(folio) < 0)
+ goto unlock;
/* Can we see a streaming write here? */
if (WARN_ON(!folio_test_uptodate(folio))) {
- ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
- goto out;
+ ret = VM_FAULT_SIGBUS;
+ goto unlock;
}
- if (netfs_folio_group(folio) != netfs_group) {
+ group = netfs_folio_group(folio);
+ if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
folio_unlock(folio);
- err = filemap_fdatawait_range(inode->i_mapping,
- folio_pos(folio),
- folio_pos(folio) + folio_size(folio));
+ err = filemap_fdatawrite_range(mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
switch (err) {
case 0:
ret = VM_FAULT_RETRY;
@@ -550,708 +548,15 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
netfs_set_group(folio, netfs_group);
file_update_time(file);
+ set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags);
+ if (ictx->ops->post_modify)
+ ictx->ops->post_modify(inode);
ret = VM_FAULT_LOCKED;
out:
sb_end_pagefault(inode->i_sb);
return ret;
-}
-EXPORT_SYMBOL(netfs_page_mkwrite);
-
-/*
- * Kill all the pages in the given range
- */
-static void netfs_kill_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("%llx-%llx", start, start + len - 1);
-
- do {
- _debug("kill %lx (to %lx)", index, last);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
-
- trace_netfs_folio(folio, netfs_folio_trace_kill);
- folio_clear_uptodate(folio);
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- folio_end_writeback(folio);
- folio_lock(folio);
- generic_error_remove_folio(mapping, folio);
- folio_unlock(folio);
- folio_put(folio);
-
- } while (index = next, index <= last);
-
- _leave("");
-}
-
-/*
- * Redirty all the pages in a given range.
- */
-static void netfs_redirty_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("%llx-%llx", start, start + len - 1);
-
- do {
- _debug("redirty %llx @%llx", len, start);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
- trace_netfs_folio(folio, netfs_folio_trace_redirty);
- filemap_dirty_folio(mapping, folio);
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- folio_end_writeback(folio);
- folio_put(folio);
- } while (index = next, index <= last);
-
- balance_dirty_pages_ratelimited(mapping);
-
- _leave("");
-}
-
-/*
- * Completion of write to server
- */
-static void netfs_pages_written_back(struct netfs_io_request *wreq)
-{
- struct address_space *mapping = wreq->mapping;
- struct netfs_folio *finfo;
- struct netfs_group *group = NULL;
- struct folio *folio;
- pgoff_t last;
- int gcount = 0;
-
- XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
-
- _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
-
- rcu_read_lock();
-
- last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, last) {
- WARN(!folio_test_writeback(folio),
- "bad %zx @%llx page %lx %lx\n",
- wreq->len, wreq->start, folio->index, last);
-
- if ((finfo = netfs_folio_info(folio))) {
- /* Streaming writes cannot be redirtied whilst under
- * writeback, so discard the streaming record.
- */
- folio_detach_private(folio);
- group = finfo->netfs_group;
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_s);
- kfree(finfo);
- } else if ((group = netfs_folio_group(folio))) {
- /* Need to detach the group pointer if the page didn't
- * get redirtied. If it has been redirtied, then it
- * must be within the same group.
- */
- if (folio_test_dirty(folio)) {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- goto end_wb;
- }
- if (folio_trylock(folio)) {
- if (!folio_test_dirty(folio)) {
- folio_detach_private(folio);
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_g);
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- }
- folio_unlock(folio);
- goto end_wb;
- }
-
- xas_pause(&xas);
- rcu_read_unlock();
- folio_lock(folio);
- if (!folio_test_dirty(folio)) {
- folio_detach_private(folio);
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_g);
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- }
- folio_unlock(folio);
- rcu_read_lock();
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_clear);
- }
- end_wb:
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- xas_advance(&xas, folio_next_index(folio) - 1);
- folio_end_writeback(folio);
- }
-
- rcu_read_unlock();
- netfs_put_group_many(group, gcount);
- _leave("");
-}
-
-/*
- * Deal with the disposition of the folios that are under writeback to close
- * out the operation.
- */
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
-{
- struct address_space *mapping = wreq->mapping;
-
- _enter("");
-
- switch (wreq->error) {
- case 0:
- netfs_pages_written_back(wreq);
- break;
-
- default:
- pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
- fallthrough;
- case -EACCES:
- case -EPERM:
- case -ENOKEY:
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EKEYREVOKED:
- case -ENETRESET:
- case -EDQUOT:
- case -ENOSPC:
- netfs_redirty_pages(mapping, wreq->start, wreq->len);
- break;
-
- case -EROFS:
- case -EIO:
- case -EREMOTEIO:
- case -EFBIG:
- case -ENOENT:
- case -ENOMEDIUM:
- case -ENXIO:
- netfs_kill_pages(mapping, wreq->start, wreq->len);
- break;
- }
-
- if (wreq->error)
- mapping_set_error(mapping, wreq->error);
- if (wreq->netfs_ops->done)
- wreq->netfs_ops->done(wreq);
-}
-
-/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
- *
- * If this page holds new content, then we can include filler zeros in the
- * writeback.
- */
-static void netfs_extend_writeback(struct address_space *mapping,
- struct netfs_group *group,
- struct xa_state *xas,
- long *_count,
- loff_t start,
- loff_t max_len,
- bool caching,
- size_t *_len,
- size_t *_top)
-{
- struct netfs_folio *finfo;
- struct folio_batch fbatch;
- struct folio *folio;
- unsigned int i;
- pgoff_t index = (start + *_len) / PAGE_SIZE;
- size_t len;
- void *priv;
- bool stop = true;
-
- folio_batch_init(&fbatch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio->index != index) {
- xas_reset(xas);
- break;
- }
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- /* Has the folio moved or been split? */
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- stop = false;
- len = folio_size(folio);
- priv = folio_get_private(folio);
- if ((const struct netfs_group *)priv != group) {
- stop = true;
- finfo = netfs_folio_info(folio);
- if (finfo->netfs_group != group ||
- finfo->dirty_offset > 0) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- len = finfo->dirty_len;
- }
-
- *_top += folio_size(folio);
- index += folio_nr_pages(folio);
- *_count -= folio_nr_pages(folio);
- *_len += len;
- if (*_len >= max_len || *_count <= 0)
- stop = true;
-
- if (!folio_batch_add(&fbatch, folio))
- break;
- if (stop)
- break;
- }
-
- xas_pause(xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any folios, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&fbatch))
- break;
-
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- folio = fbatch.folios[i];
- trace_netfs_folio(folio, netfs_folio_trace_store_plus);
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- netfs_folio_start_fscache(caching, folio);
- folio_unlock(folio);
- }
-
- folio_batch_release(&fbatch);
- cond_resched();
- } while (!stop);
-}
-
-/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
- */
-static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- struct xa_state *xas,
- struct folio *folio,
- unsigned long long start,
- unsigned long long end)
-{
- struct netfs_io_request *wreq;
- struct netfs_folio *finfo;
- struct netfs_inode *ctx = netfs_inode(mapping->host);
- unsigned long long i_size = i_size_read(&ctx->inode);
- size_t len, max_len;
- bool caching = netfs_is_cache_enabled(ctx);
- long count = wbc->nr_to_write;
- int ret;
-
- _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
-
- wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
- NETFS_WRITEBACK);
- if (IS_ERR(wreq)) {
- folio_unlock(folio);
- return PTR_ERR(wreq);
- }
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- netfs_folio_start_fscache(caching, folio);
-
- count -= folio_nr_pages(folio);
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- trace_netfs_folio(folio, netfs_folio_trace_store);
-
- len = wreq->len;
- finfo = netfs_folio_info(folio);
- if (finfo) {
- start += finfo->dirty_offset;
- if (finfo->dirty_offset + finfo->dirty_len != len) {
- len = finfo->dirty_len;
- goto cant_expand;
- }
- len = finfo->dirty_len;
- }
-
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = 65536 * 4096;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len)
- netfs_extend_writeback(mapping, group, xas, &count, start,
- max_len, caching, &len, &wreq->upper_len);
- }
-
-cant_expand:
- len = min_t(unsigned long long, len, i_size - start);
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
+unlock:
folio_unlock(folio);
- wreq->start = start;
- wreq->len = len;
-
- if (start < i_size) {
- _debug("write back %zx @%llx [%llx]", len, start, i_size);
-
- /* Speculatively write to the cache. We have to fix this up
- * later if the store fails.
- */
- wreq->cleanup = netfs_cleanup_buffered_write;
-
- iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
- wreq->upper_len);
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
- if (ret == 0 || ret == -EIOCBQUEUED)
- wbc->nr_to_write -= len / PAGE_SIZE;
- } else {
- _debug("write discard %zx @%llx [%llx]", len, start, i_size);
-
- /* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(mapping, start, len, caching);
- netfs_pages_written_back(wreq);
- ret = 0;
- }
-
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- _leave(" = 1");
- return 1;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static ssize_t netfs_writepages_begin(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- struct xa_state *xas,
- unsigned long long *_start,
- unsigned long long end)
-{
- const struct netfs_folio *finfo;
- struct folio *folio;
- unsigned long long start = *_start;
- ssize_t ret;
- void *priv;
- int skips = 0;
-
- _enter("%llx,%llx,", start, end);
-
-search_again:
- /* Find the first dirty page in the group. */
- rcu_read_lock();
-
- for (;;) {
- folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
- if (xas_retry(xas, folio) || xa_is_value(folio))
- continue;
- if (!folio)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- continue;
- }
-
- /* Skip any dirty folio that's not in the group of interest. */
- priv = folio_get_private(folio);
- if ((const struct netfs_group *)priv != group) {
- finfo = netfs_folio_info(folio);
- if (finfo->netfs_group != group) {
- folio_put(folio);
- continue;
- }
- }
-
- xas_pause(xas);
- break;
- }
- rcu_read_unlock();
- if (!folio)
- return 0;
-
- start = folio_pos(folio); /* May regress with THPs */
-
- _debug("wback %lx", folio->index);
-
- /* At this point we hold neither the i_pages lock nor the page lock:
- * the page may be truncated or invalidated (changing page->mapping to
- * NULL), or even swizzled back from swapper_space to tmpfs file
- * mapping
- */
-lock_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0)
- return ret;
- } else {
- if (!folio_trylock(folio))
- goto search_again;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- goto search_again;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto lock_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- ret = 0;
- goto out;
- }
- skips++;
- }
- goto search_again;
- }
-
- ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
- folio, start, end);
-out:
- if (ret > 0)
- *_start = start + ret;
- _leave(" = %zd [%llx]", ret, *_start);
- return ret;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static int netfs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- unsigned long long *_start,
- unsigned long long end)
-{
- ssize_t ret;
-
- XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
-
- do {
- ret = netfs_writepages_begin(mapping, wbc, group, &xas,
- _start, end);
- if (ret > 0 && wbc->nr_to_write > 0)
- cond_resched();
- } while (ret > 0 && wbc->nr_to_write > 0);
-
- return ret > 0 ? 0 : ret;
-}
-
-/*
- * write some of the pending data back to the server
- */
-int netfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct netfs_group *group = NULL;
- loff_t start, end;
- int ret;
-
- _enter("");
-
- /* We have to be careful as we can end up racing with setattr()
- * truncating the pagecache since the caller doesn't take a lock here
- * to prevent it.
- */
-
- if (wbc->range_cyclic && mapping->writeback_index) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, LLONG_MAX);
- if (ret < 0)
- goto out;
-
- if (wbc->nr_to_write <= 0) {
- mapping->writeback_index = start / PAGE_SIZE;
- goto out;
- }
-
- start = 0;
- end = mapping->writeback_index * PAGE_SIZE;
- mapping->writeback_index = 0;
- ret = netfs_writepages_region(mapping, wbc, group, &start, end);
- if (ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- start = 0;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, LLONG_MAX);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else {
- start = wbc->range_start;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, wbc->range_end);
- }
-
-out:
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_writepages);
-
-/*
- * Deal with the disposition of a laundered folio.
- */
-static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
-{
- if (wreq->error) {
- pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
- mapping_set_error(wreq->mapping, wreq->error);
- }
-}
-
-/**
- * netfs_launder_folio - Clean up a dirty folio that's being invalidated
- * @folio: The folio to clean
- *
- * This is called to write back a folio that's being invalidated when an inode
- * is getting torn down. Ideally, writepages would be used instead.
- */
-int netfs_launder_folio(struct folio *folio)
-{
- struct netfs_io_request *wreq;
- struct address_space *mapping = folio->mapping;
- struct netfs_folio *finfo = netfs_folio_info(folio);
- struct netfs_group *group = netfs_folio_group(folio);
- struct bio_vec bvec;
- unsigned long long i_size = i_size_read(mapping->host);
- unsigned long long start = folio_pos(folio);
- size_t offset = 0, len;
- int ret = 0;
-
- if (finfo) {
- offset = finfo->dirty_offset;
- start += offset;
- len = finfo->dirty_len;
- } else {
- len = folio_size(folio);
- }
- len = min_t(unsigned long long, len, i_size - start);
-
- wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
- if (IS_ERR(wreq)) {
- ret = PTR_ERR(wreq);
- goto out;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- goto out_put;
-
- trace_netfs_folio(folio, netfs_folio_trace_launder);
-
- _debug("launder %llx-%llx", start, start + len - 1);
-
- /* Speculatively write to the cache. We have to fix this up later if
- * the store fails.
- */
- wreq->cleanup = netfs_cleanup_launder_folio;
-
- bvec_set_folio(&bvec, folio, len, offset);
- iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
-
-out_put:
- folio_detach_private(folio);
- netfs_put_group(group);
- kfree(finfo);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
-out:
- folio_wait_fscache(folio);
- _leave(" = %d", ret);
- return ret;
+ goto out;
}
-EXPORT_SYMBOL(netfs_launder_folio);
+EXPORT_SYMBOL(netfs_page_mkwrite);
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index ad4370b3935d..5e3f0aeb51f3 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -16,6 +16,149 @@
#include <linux/netfs.h>
#include "internal.h"
+static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ size_t rsize;
+
+ rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len);
+ subreq->len = rsize;
+
+ if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
+ size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize,
+ rreq->io_streams[0].sreq_max_segs);
+
+ if (limit < rsize) {
+ subreq->len = limit;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ subreq->io_iter = rreq->buffer.iter;
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ iov_iter_advance(&rreq->buffer.iter, subreq->len);
+}
+
+/*
+ * Perform a read to a buffer from the server, slicing up the region to be read
+ * according to the network rsize.
+ */
+static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
+{
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ unsigned long long start = rreq->start;
+ ssize_t size = rreq->len;
+ int ret = 0;
+
+ do {
+ struct netfs_io_subrequest *subreq;
+ ssize_t slice;
+
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->start = start;
+ subreq->len = size;
+
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ spin_lock(&rreq->lock);
+ list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+ stream->front = subreq;
+ if (!stream->active) {
+ stream->collected_to = stream->front->start;
+ /* Store list pointers before active flag */
+ smp_store_release(&stream->active, true);
+ }
+ }
+ trace_netfs_sreq(subreq, netfs_sreq_trace_added);
+ spin_unlock(&rreq->lock);
+
+ netfs_stat(&netfs_n_rh_download);
+ if (rreq->netfs_ops->prepare_read) {
+ ret = rreq->netfs_ops->prepare_read(subreq);
+ if (ret < 0) {
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ break;
+ }
+ }
+
+ netfs_prepare_dio_read_iterator(subreq);
+ slice = subreq->len;
+ size -= slice;
+ start += slice;
+ rreq->submitted += slice;
+ if (size <= 0) {
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ }
+
+ rreq->netfs_ops->issue_read(subreq);
+
+ if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
+ netfs_wait_for_pause(rreq);
+ if (test_bit(NETFS_RREQ_FAILED, &rreq->flags))
+ break;
+ if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
+ break;
+ cond_resched();
+ } while (size > 0);
+
+ if (unlikely(size > 0)) {
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ netfs_wake_read_collector(rreq);
+ }
+
+ return ret;
+}
+
+/*
+ * Perform a read to an application buffer, bypassing the pagecache and the
+ * local disk cache.
+ */
+static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
+{
+ ssize_t ret;
+
+ _enter("R=%x %llx-%llx",
+ rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
+
+ if (rreq->len == 0) {
+ pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
+ return -EIO;
+ }
+
+ // TODO: Use bounce buffer if requested
+
+ inode_dio_begin(rreq->inode);
+
+ ret = netfs_dispatch_unbuffered_reads(rreq);
+
+ if (!rreq->submitted) {
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ inode_dio_end(rreq->inode);
+ ret = 0;
+ goto out;
+ }
+
+ if (sync)
+ ret = netfs_wait_for_read(rreq);
+ else
+ ret = -EIOCBQUEUED;
+out:
+ _leave(" = %zd", ret);
+ return ret;
+}
+
/**
* netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
* @iocb: The I/O control descriptor describing the read
@@ -26,12 +169,12 @@
*
* The caller must hold any appropriate locks.
*/
-static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
{
struct netfs_io_request *rreq;
ssize_t ret;
size_t orig_count = iov_iter_count(iter);
- bool async = !is_sync_kiocb(iocb);
+ bool sync = is_sync_kiocb(iocb);
_enter("");
@@ -62,15 +205,15 @@ static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_
* the request.
*/
if (user_backed_iter(iter)) {
- ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0);
+ ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0);
if (ret < 0)
goto out;
- rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec;
+ rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec;
rreq->direct_bv_count = ret;
rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
- rreq->len = iov_iter_count(&rreq->iter);
+ rreq->len = iov_iter_count(&rreq->buffer.iter);
} else {
- rreq->iter = *iter;
+ rreq->buffer.iter = *iter;
rreq->len = orig_count;
rreq->direct_bv_unpin = false;
iov_iter_advance(iter, orig_count);
@@ -78,13 +221,15 @@ static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_
// TODO: Set up bounce buffer if needed
- if (async)
+ if (!sync) {
rreq->iocb = iocb;
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags);
+ }
- ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+ ret = netfs_unbuffered_read(rreq, sync);
if (ret < 0)
goto out; /* May be -EIOCBQUEUED */
- if (!async) {
+ if (sync) {
// TODO: Copy from bounce buffer
iocb->ki_pos += rreq->transferred;
ret = rreq->transferred;
@@ -94,10 +239,9 @@ out:
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
if (ret > 0)
orig_count -= ret;
- if (ret != -EIOCBQUEUED)
- iov_iter_revert(iter, orig_count - iov_iter_count(iter));
return ret;
}
+EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked);
/**
* netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index bee047e20f5d..42ce53cc216e 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -12,7 +12,7 @@
static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
{
struct inode *inode = wreq->inode;
- unsigned long long end = wreq->start + wreq->len;
+ unsigned long long end = wreq->start + wreq->transferred;
if (!wreq->error &&
i_size_read(inode) < end) {
@@ -27,13 +27,14 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
* Perform an unbuffered write where we may have to do an RMW operation on an
* encrypted file. This can also be used for direct I/O writes.
*/
-static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
struct netfs_group *netfs_group)
{
struct netfs_io_request *wreq;
unsigned long long start = iocb->ki_pos;
unsigned long long end = start + iov_iter_count(iter);
ssize_t ret, n;
+ size_t len = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
_enter("");
@@ -46,13 +47,17 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
_debug("uw %llx-%llx", start, end);
- wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
- start, end - start,
- iocb->ki_flags & IOCB_DIRECT ?
- NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
if (IS_ERR(wreq))
return PTR_ERR(wreq);
+ wreq->io_streams[0].avail = true;
+ trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write));
+
{
/* If this is an async op and we're not using a bounce buffer,
* we have to save the source buffer as the iterator is only
@@ -62,23 +67,27 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
* allocate a sufficiently large bvec array and may shorten the
* request.
*/
- if (async || user_backed_iter(iter)) {
- n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ if (user_backed_iter(iter)) {
+ n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
if (n < 0) {
ret = n;
goto out;
}
- wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
+ wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
wreq->direct_bv_count = n;
wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
- wreq->len = iov_iter_count(&wreq->iter);
} else {
- wreq->iter = *iter;
+ /* If this is a kernel-generated async DIO request,
+ * assume that any resources the iterator points to
+ * (eg. a bio_vec array) will persist till the end of
+ * the op.
+ */
+ wreq->buffer.iter = *iter;
}
-
- wreq->io_iter = wreq->iter;
}
+ __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
+
/* Copy the data into the bounce buffer and encrypt it. */
// TODO
@@ -86,11 +95,9 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
if (async)
wreq->iocb = iocb;
+ wreq->len = iov_iter_count(&wreq->buffer.iter);
wreq->cleanup = netfs_cleanup_dio_write;
- ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
- iocb->ki_flags & IOCB_DIRECT ?
- netfs_write_trace_dio_write :
- netfs_write_trace_unbuffered_write);
+ ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
if (ret < 0) {
_debug("begin = %zd", ret);
goto out;
@@ -100,9 +107,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
-
ret = wreq->error;
- _debug("waited = %zd", ret);
if (ret == 0) {
ret = wreq->transferred;
iocb->ki_pos += ret;
@@ -115,6 +120,7 @@ out:
netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
return ret;
}
+EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
/**
* netfs_unbuffered_write_iter - Unbuffered write to a file
@@ -132,18 +138,20 @@ out:
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
struct netfs_inode *ictx = netfs_inode(inode);
- unsigned long long end;
ssize_t ret;
+ loff_t pos = iocb->ki_pos;
+ unsigned long long end = pos + iov_iter_count(from) - 1;
- _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+ _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
trace_netfs_write_iter(iocb, from);
- netfs_stat(&netfs_n_rh_dio_write);
+ netfs_stat(&netfs_n_wh_dio_write);
ret = netfs_start_io_direct(inode);
if (ret < 0)
@@ -157,7 +165,25 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = file_update_time(file);
if (ret < 0)
goto out;
- ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* We could block if there are any pages in the range. */
+ ret = -EAGAIN;
+ if (filemap_range_has_page(mapping, pos, end))
+ if (filemap_invalidate_inode(inode, true, pos, end))
+ goto out;
+ } else {
+ ret = filemap_write_and_wait_range(mapping, pos, end);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ ret = filemap_invalidate_inode(inode, true, pos, end);
if (ret < 0)
goto out;
end = iocb->ki_pos + iov_iter_count(from);
diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c
index 9397ed39b0b4..8f70f8da064b 100644
--- a/fs/netfs/fscache_cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -372,7 +372,7 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
EXPORT_SYMBOL(fscache_withdraw_cache);
#ifdef CONFIG_PROC_FS
-static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW";
+static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] __nonstring = "-PAEW";
/*
* Generate a list of caches in /proc/fs/fscache/caches
diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186d0..3d56fc73435f 100644
--- a/fs/netfs/fscache_cookie.c
+++ b/fs/netfs/fscache_cookie.c
@@ -29,7 +29,7 @@ static LIST_HEAD(fscache_cookie_lru);
static DEFINE_SPINLOCK(fscache_cookie_lru_lock);
DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out);
static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker);
-static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD";
+static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] __nonstring = "-LCAIFUWRD";
static unsigned int fscache_lru_cookie_timeout = 10 * HZ;
void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
@@ -741,6 +741,10 @@ again_locked:
spin_lock(&cookie->lock);
}
if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) {
+ if (atomic_read(&cookie->n_accesses) != 0)
+ /* still being accessed: postpone it */
+ break;
+
__fscache_set_cookie_state(cookie,
FSCACHE_COOKIE_STATE_LRU_DISCARDING);
wake = true;
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 43a651ed8264..b1722a82c03d 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -9,7 +9,6 @@
#include <linux/uio.h>
#include <linux/bvec.h>
#include <linux/slab.h>
-#include <linux/uio.h>
#include "internal.h"
/**
@@ -166,6 +165,7 @@ struct fscache_write_request {
loff_t start;
size_t len;
bool set_bits;
+ bool using_pgpriv2;
netfs_io_terminated_t term_func;
void *term_func_priv;
};
@@ -182,7 +182,7 @@ void __fscache_clear_page_bits(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, last) {
- end_page_fscache(page);
+ folio_end_private_2(page_folio(page));
}
rcu_read_unlock();
}
@@ -197,8 +197,9 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
{
struct fscache_write_request *wreq = priv;
- fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
- wreq->set_bits);
+ if (wreq->using_pgpriv2)
+ fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
+ wreq->set_bits);
if (wreq->term_func)
wreq->term_func(wreq->term_func_priv, transferred_or_error,
@@ -212,7 +213,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
loff_t start, size_t len, loff_t i_size,
netfs_io_terminated_t term_func,
void *term_func_priv,
- bool cond)
+ bool using_pgpriv2, bool cond)
{
struct fscache_write_request *wreq;
struct netfs_cache_resources *cres;
@@ -230,6 +231,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
wreq->mapping = mapping;
wreq->start = start;
wreq->len = len;
+ wreq->using_pgpriv2 = using_pgpriv2;
wreq->set_bits = cond;
wreq->term_func = term_func;
wreq->term_func_priv = term_func_priv;
@@ -257,7 +259,8 @@ abandon_end:
abandon_free:
kfree(wreq);
abandon:
- fscache_clear_page_bits(mapping, start, len, cond);
+ if (using_pgpriv2)
+ fscache_clear_page_bits(mapping, start, len, cond);
if (term_func)
term_func(term_func_priv, ret, false);
}
diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c
index 42e98bb523e3..49849005eb7c 100644
--- a/fs/netfs/fscache_main.c
+++ b/fs/netfs/fscache_main.c
@@ -103,6 +103,7 @@ void __exit fscache_exit(void)
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
+ timer_shutdown_sync(&fscache_cookie_lru_timer);
destroy_workqueue(fscache_wq);
pr_notice("FS-Cache unloaded\n");
}
diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c
index cdf991bdd9de..ced14ac78cc1 100644
--- a/fs/netfs/fscache_volume.c
+++ b/fs/netfs/fscache_volume.c
@@ -27,6 +27,19 @@ struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
return volume;
}
+struct fscache_volume *fscache_try_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where)
+{
+ int ref;
+
+ if (!__refcount_inc_not_zero(&volume->ref, &ref))
+ return NULL;
+
+ trace_fscache_volume(volume->debug_id, ref + 1, where);
+ return volume;
+}
+EXPORT_SYMBOL(fscache_try_get_volume);
+
static void fscache_see_volume(struct fscache_volume *volume,
enum fscache_volume_trace where)
{
@@ -309,8 +322,7 @@ maybe_wait:
}
return;
no_wait:
- clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags);
- wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING);
+ clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags);
}
/*
@@ -420,6 +432,7 @@ void fscache_put_volume(struct fscache_volume *volume,
fscache_free_volume(volume);
}
}
+EXPORT_SYMBOL(fscache_put_volume);
/*
* Relinquish a volume representation cookie.
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index ec7045d24400..1c4f953c3d68 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/seq_file.h>
+#include <linux/folio_queue.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
#include <linux/fscache-cache.h>
@@ -22,21 +23,18 @@
/*
* buffered_read.c
*/
-void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async);
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len);
/*
- * io.c
- */
-int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
-
-/*
* main.c
*/
extern unsigned int netfs_debug;
extern struct list_head netfs_io_requests;
extern spinlock_t netfs_proc_lock;
+extern mempool_t netfs_request_pool;
+extern mempool_t netfs_subrequest_pool;
#ifdef CONFIG_PROC_FS
static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
@@ -61,15 +59,9 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/*
* misc.c
*/
-#define NETFS_FLAG_PUT_MARK BIT(0)
-#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
-int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
- struct folio *folio, unsigned int flags,
- gfp_t gfp_mask);
-int netfs_add_folios_to_buffer(struct xarray *buffer,
- struct address_space *mapping,
- pgoff_t index, pgoff_t to, gfp_t gfp_mask);
-void netfs_clear_buffer(struct xarray *buffer);
+struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq,
+ enum netfs_folioq_trace trace);
+void netfs_reset_iter(struct netfs_io_subrequest *subreq);
/*
* objects.c
@@ -90,23 +82,43 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what);
}
+static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq,
+ enum netfs_sreq_ref_trace what)
+{
+ trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref), what);
+}
+
/*
- * output.c
+ * read_collect.c
*/
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
- enum netfs_write_trace what);
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
+void netfs_read_collection_worker(struct work_struct *work);
+void netfs_wake_read_collector(struct netfs_io_request *rreq);
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async);
+ssize_t netfs_wait_for_read(struct netfs_io_request *rreq);
+void netfs_wait_for_pause(struct netfs_io_request *rreq);
+
+/*
+ * read_pgpriv2.c
+ */
+void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio);
+void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq);
+bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq);
+
+/*
+ * read_retry.c
+ */
+void netfs_retry_reads(struct netfs_io_request *rreq);
+void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq);
/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
extern atomic_t netfs_n_rh_dio_read;
-extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
-extern atomic_t netfs_n_rh_readpage;
+extern atomic_t netfs_n_rh_read_folio;
+extern atomic_t netfs_n_rh_read_single;
extern atomic_t netfs_n_rh_rreq;
extern atomic_t netfs_n_rh_sreq;
extern atomic_t netfs_n_rh_download;
@@ -123,6 +135,13 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_rh_retry_read_req;
+extern atomic_t netfs_n_rh_retry_read_subreq;
+extern atomic_t netfs_n_wh_buffered_write;
+extern atomic_t netfs_n_wh_writethrough;
+extern atomic_t netfs_n_wh_dio_write;
+extern atomic_t netfs_n_wh_writepages;
+extern atomic_t netfs_n_wh_copy_to_cache;
extern atomic_t netfs_n_wh_wstream_conflict;
extern atomic_t netfs_n_wh_upload;
extern atomic_t netfs_n_wh_upload_done;
@@ -130,6 +149,11 @@ extern atomic_t netfs_n_wh_upload_failed;
extern atomic_t netfs_n_wh_write;
extern atomic_t netfs_n_wh_write_done;
extern atomic_t netfs_n_wh_write_failed;
+extern atomic_t netfs_n_wh_retry_write_req;
+extern atomic_t netfs_n_wh_retry_write_subreq;
+extern atomic_t netfs_n_wb_lock_skip;
+extern atomic_t netfs_n_wb_lock_wait;
+extern atomic_t netfs_n_folioq;
int netfs_stats_show(struct seq_file *m, void *v);
@@ -149,6 +173,41 @@ static inline void netfs_stat_d(atomic_t *stat)
#endif
/*
+ * write_collect.c
+ */
+int netfs_folio_written_back(struct folio *folio);
+void netfs_write_collection_worker(struct work_struct *work);
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async);
+
+/*
+ * write_issue.c
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+ struct file *file,
+ loff_t start,
+ enum netfs_io_origin origin);
+void netfs_reissue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *source);
+void netfs_issue_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream);
+size_t netfs_advance_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start, size_t len, bool to_eof);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *folio, size_t copied, bool to_page_end,
+ struct folio **writethrough_cache);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache);
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
+
+/*
+ * write_retry.c
+ */
+void netfs_retry_writes(struct netfs_io_request *wreq);
+
+/*
* Miscellaneous functions.
*/
static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
@@ -168,7 +227,7 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
*/
static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
{
- if (netfs_group)
+ if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
refcount_inc(&netfs_group->ref);
return netfs_group;
}
@@ -178,7 +237,9 @@ static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_grou
*/
static inline void netfs_put_group(struct netfs_group *netfs_group)
{
- if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ if (netfs_group &&
+ netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+ refcount_dec_and_test(&netfs_group->ref))
netfs_group->free(netfs_group);
}
@@ -187,7 +248,9 @@ static inline void netfs_put_group(struct netfs_group *netfs_group)
*/
static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
{
- if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ if (netfs_group &&
+ netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+ refcount_sub_and_test(nr, &netfs_group->ref))
netfs_group->free(netfs_group);
}
@@ -326,8 +389,6 @@ extern const struct seq_operations fscache_volumes_seq_ops;
struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
enum fscache_volume_trace where);
-void fscache_put_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
bool fscache_begin_volume_access(struct fscache_volume *volume,
struct fscache_cookie *cookie,
enum fscache_access_trace why);
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
deleted file mode 100644
index 4261ad6c55b6..000000000000
--- a/fs/netfs/io.c
+++ /dev/null
@@ -1,787 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Network filesystem high-level read support.
- *
- * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/module.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uio.h>
-#include <linux/sched/mm.h>
-#include <linux/task_io_accounting_ops.h>
-#include "internal.h"
-
-/*
- * Clear the unread part of an I/O request.
- */
-static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
-{
- iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
-}
-
-static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = priv;
-
- netfs_subreq_terminated(subreq, transferred_or_error, was_async);
-}
-
-/*
- * Issue a read against the cache.
- * - Eats the caller's ref on subreq.
- */
-static void netfs_read_from_cache(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq,
- enum netfs_read_from_hole read_hole)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
-
- netfs_stat(&netfs_n_rh_read);
- cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
- netfs_cache_read_terminated, subreq);
-}
-
-/*
- * Fill a subrequest region with zeroes.
- */
-static void netfs_fill_with_zeroes(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
-{
- netfs_stat(&netfs_n_rh_zero);
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- netfs_subreq_terminated(subreq, 0, false);
-}
-
-/*
- * Ask the netfs to issue a read request to the server for us.
- *
- * The netfs is expected to read from subreq->pos + subreq->transferred to
- * subreq->pos + subreq->len - 1. It may not backtrack and write data into the
- * buffer prior to the transferred point as it might clobber dirty data
- * obtained from the cache.
- *
- * Alternatively, the netfs is allowed to indicate one of two things:
- *
- * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
- * make progress.
- *
- * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
- * cleared.
- */
-static void netfs_read_from_server(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
-{
- netfs_stat(&netfs_n_rh_download);
-
- if (rreq->origin != NETFS_DIO_READ &&
- iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
- pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
- rreq->debug_id, subreq->debug_index,
- iov_iter_count(&subreq->io_iter), subreq->len,
- subreq->transferred, subreq->flags);
- rreq->netfs_ops->issue_read(subreq);
-}
-
-/*
- * Release those waiting.
- */
-static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
-{
- trace_netfs_rreq(rreq, netfs_rreq_trace_done);
- netfs_clear_subrequests(rreq, was_async);
- netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete);
-}
-
-/*
- * Deal with the completion of writing the data to the cache. We have to clear
- * the PG_fscache bits on the folios involved and release the caller's ref.
- *
- * May be called in softirq mode and we inherit a ref from the caller.
- */
-static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq;
- struct folio *folio;
- pgoff_t unlocked = 0;
- bool have_unlocked = false;
-
- rcu_read_lock();
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
-
- xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
- if (xas_retry(&xas, folio))
- continue;
-
- /* We might have multiple writes from the same huge
- * folio, but we mustn't unlock a folio more than once.
- */
- if (have_unlocked && folio->index <= unlocked)
- continue;
- unlocked = folio_next_index(folio) - 1;
- trace_netfs_folio(folio, netfs_folio_trace_end_copy);
- folio_end_fscache(folio);
- have_unlocked = true;
- }
- }
-
- rcu_read_unlock();
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = priv;
- struct netfs_io_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- netfs_stat(&netfs_n_rh_write_failed);
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_copy_to_cache);
- } else {
- netfs_stat(&netfs_n_rh_write_done);
- }
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
-
- /* If we decrement nr_copy_ops to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_copy_ops))
- netfs_rreq_unmark_after_write(rreq, was_async);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
-}
-
-/*
- * Perform any outstanding writes to the cache. We inherit a ref from the
- * caller.
- */
-static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct netfs_io_subrequest *subreq, *next, *p;
- struct iov_iter iter;
- int ret;
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_copy);
-
- /* We don't want terminating writes trying to wake us up whilst we're
- * still going through the list.
- */
- atomic_inc(&rreq->nr_copy_ops);
-
- list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
- if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
- list_del_init(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false,
- netfs_sreq_trace_put_no_copy);
- }
- }
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- /* Amalgamate adjacent writes */
- while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- next = list_next_entry(subreq, rreq_link);
- if (next->start != subreq->start + subreq->len)
- break;
- subreq->len += next->len;
- list_del_init(&next->rreq_link);
- netfs_put_subrequest(next, false,
- netfs_sreq_trace_put_merged);
- }
-
- ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- subreq->len, rreq->i_size, true);
- if (ret < 0) {
- trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
- continue;
- }
-
- iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages,
- subreq->start, subreq->len);
-
- atomic_inc(&rreq->nr_copy_ops);
- netfs_stat(&netfs_n_rh_write);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write);
- cres->ops->write(cres, subreq->start, &iter,
- netfs_rreq_copy_terminated, subreq);
- }
-
- /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_copy_ops))
- netfs_rreq_unmark_after_write(rreq, false);
-}
-
-static void netfs_rreq_write_to_cache_work(struct work_struct *work)
-{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, work);
-
- netfs_rreq_do_write_to_cache(rreq);
-}
-
-static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
-{
- rreq->work.func = netfs_rreq_write_to_cache_work;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
-}
-
-/*
- * Handle a short read.
- */
-static void netfs_rreq_short_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
-{
- __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
- __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
-
- netfs_stat(&netfs_n_rh_short_read);
- trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
-
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read);
- atomic_inc(&rreq->nr_outstanding);
- if (subreq->source == NETFS_READ_FROM_CACHE)
- netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);
- else
- netfs_read_from_server(rreq, subreq);
-}
-
-/*
- * Reset the subrequest iterator prior to resubmission.
- */
-static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
-{
- size_t remaining = subreq->len - subreq->transferred;
- size_t count = iov_iter_count(&subreq->io_iter);
-
- if (count == remaining)
- return;
-
- _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
- rreq->debug_id, subreq->debug_index,
- iov_iter_count(&subreq->io_iter), subreq->transferred,
- subreq->len, rreq->i_size,
- subreq->io_iter.iter_type);
-
- if (count < remaining)
- iov_iter_revert(&subreq->io_iter, remaining - count);
- else
- iov_iter_advance(&subreq->io_iter, count - remaining);
-}
-
-/*
- * Resubmit any short or failed operations. Returns true if we got the rreq
- * ref back.
- */
-static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
-
- WARN_ON(in_interrupt());
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
-
- /* We don't want terminating submissions trying to wake us up whilst
- * we're still going through the list.
- */
- atomic_inc(&rreq->nr_outstanding);
-
- __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->error) {
- if (subreq->source != NETFS_READ_FROM_CACHE)
- break;
- subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
- subreq->error = 0;
- netfs_stat(&netfs_n_rh_download_instead);
- trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
- atomic_inc(&rreq->nr_outstanding);
- netfs_reset_subreq_iter(rreq, subreq);
- netfs_read_from_server(rreq, subreq);
- } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
- netfs_rreq_short_read(rreq, subreq);
- }
- }
-
- /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- return true;
-
- wake_up_var(&rreq->nr_outstanding);
- return false;
-}
-
-/*
- * Check to see if the data read is still valid.
- */
-static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
-
- if (!rreq->netfs_ops->is_still_valid ||
- rreq->netfs_ops->is_still_valid(rreq))
- return;
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->source == NETFS_READ_FROM_CACHE) {
- subreq->error = -ESTALE;
- __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- }
- }
-}
-
-/*
- * Determine how much we can admit to having read from a DIO read.
- */
-static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
- unsigned int i;
- size_t transferred = 0;
-
- for (i = 0; i < rreq->direct_bv_count; i++)
- flush_dcache_page(rreq->direct_bv[i].bv_page);
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->error || subreq->transferred == 0)
- break;
- transferred += subreq->transferred;
- if (subreq->transferred < subreq->len)
- break;
- }
-
- for (i = 0; i < rreq->direct_bv_count; i++)
- flush_dcache_page(rreq->direct_bv[i].bv_page);
-
- rreq->transferred = transferred;
- task_io_account_read(transferred);
-
- if (rreq->iocb) {
- rreq->iocb->ki_pos += transferred;
- if (rreq->iocb->ki_complete)
- rreq->iocb->ki_complete(
- rreq->iocb, rreq->error ? rreq->error : transferred);
- }
- if (rreq->netfs_ops->done)
- rreq->netfs_ops->done(rreq);
- inode_dio_end(rreq->inode);
-}
-
-/*
- * Assess the state of a read request and decide what to do next.
- *
- * Note that we could be in an ordinary kernel thread, on a workqueue or in
- * softirq context at this point. We inherit a ref from the caller.
- */
-static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async)
-{
- trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
-
-again:
- netfs_rreq_is_still_valid(rreq);
-
- if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
- test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
- if (netfs_rreq_perform_resubmissions(rreq))
- goto again;
- return;
- }
-
- if (rreq->origin != NETFS_DIO_READ)
- netfs_rreq_unlock_folios(rreq);
- else
- netfs_rreq_assess_dio(rreq);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
-
- if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags))
- return netfs_rreq_write_to_cache(rreq);
-
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_work(struct work_struct *work)
-{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, work);
- netfs_rreq_assess(rreq, false);
-}
-
-/*
- * Handle the completion of all outstanding I/O operations on a read request.
- * We inherit a ref from the caller.
- */
-static void netfs_rreq_terminated(struct netfs_io_request *rreq,
- bool was_async)
-{
- if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
- was_async) {
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_rreq_assess(rreq, was_async);
- }
-}
-
-/**
- * netfs_subreq_terminated - Note the termination of an I/O operation.
- * @subreq: The I/O request that has terminated.
- * @transferred_or_error: The amount of data transferred or an error code.
- * @was_async: The termination was asynchronous
- *
- * This tells the read helper that a contributory I/O operation has terminated,
- * one way or another, and that it should integrate the results.
- *
- * The caller indicates in @transferred_or_error the outcome of the operation,
- * supplying a positive value to indicate the number of bytes transferred, 0 to
- * indicate a failure to transfer anything that should be retried or a negative
- * error code. The helper will look after reissuing I/O operations as
- * appropriate and writing downloaded data to the cache.
- *
- * If @was_async is true, the caller might be running in softirq or interrupt
- * context and we can't sleep.
- */
-void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
- ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_request *rreq = subreq->rreq;
- int u;
-
- _enter("R=%x[%x]{%llx,%lx},%zd",
- rreq->debug_id, subreq->debug_index,
- subreq->start, subreq->flags, transferred_or_error);
-
- switch (subreq->source) {
- case NETFS_READ_FROM_CACHE:
- netfs_stat(&netfs_n_rh_read_done);
- break;
- case NETFS_DOWNLOAD_FROM_SERVER:
- netfs_stat(&netfs_n_rh_download_done);
- break;
- default:
- break;
- }
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- subreq->error = transferred_or_error;
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_read);
- goto failed;
- }
-
- if (WARN(transferred_or_error > subreq->len - subreq->transferred,
- "Subreq overread: R%x[%x] %zd > %zu - %zu",
- rreq->debug_id, subreq->debug_index,
- transferred_or_error, subreq->len, subreq->transferred))
- transferred_or_error = subreq->len - subreq->transferred;
-
- subreq->error = 0;
- subreq->transferred += transferred_or_error;
- if (subreq->transferred < subreq->len)
- goto incomplete;
-
-complete:
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
- set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
-
-out:
- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- /* If we decrement nr_outstanding to 0, the ref belongs to us. */
- u = atomic_dec_return(&rreq->nr_outstanding);
- if (u == 0)
- netfs_rreq_terminated(rreq, was_async);
- else if (u == 1)
- wake_up_var(&rreq->nr_outstanding);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
- return;
-
-incomplete:
- if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
- netfs_clear_unread(subreq);
- subreq->transferred = subreq->len;
- goto complete;
- }
-
- if (transferred_or_error == 0) {
- if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
- subreq->error = -ENODATA;
- goto failed;
- }
- } else {
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- }
-
- __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- goto out;
-
-failed:
- if (subreq->source == NETFS_READ_FROM_CACHE) {
- netfs_stat(&netfs_n_rh_read_failed);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- } else {
- netfs_stat(&netfs_n_rh_download_failed);
- set_bit(NETFS_RREQ_FAILED, &rreq->flags);
- rreq->error = subreq->error;
- }
- goto out;
-}
-EXPORT_SYMBOL(netfs_subreq_terminated);
-
-static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq,
- loff_t i_size)
-{
- struct netfs_io_request *rreq = subreq->rreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
-
- if (cres->ops)
- return cres->ops->prepare_read(subreq, i_size);
- if (subreq->start >= rreq->i_size)
- return NETFS_FILL_WITH_ZEROES;
- return NETFS_DOWNLOAD_FROM_SERVER;
-}
-
-/*
- * Work out what sort of subrequest the next one will be.
- */
-static enum netfs_io_source
-netfs_rreq_prepare_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq,
- struct iov_iter *io_iter)
-{
- enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
- struct netfs_inode *ictx = netfs_inode(rreq->inode);
- size_t lsize;
-
- _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
-
- if (rreq->origin != NETFS_DIO_READ) {
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
- }
-
- if (source == NETFS_DOWNLOAD_FROM_SERVER) {
- /* Call out to the netfs to let it shrink the request to fit
- * its own I/O sizes and boundaries. If it shinks it here, it
- * will be called again to make simultaneous calls; if it wants
- * to make serial calls, it can indicate a short read and then
- * we will call it again.
- */
- if (rreq->origin != NETFS_DIO_READ) {
- if (subreq->start >= ictx->zero_point) {
- source = NETFS_FILL_WITH_ZEROES;
- goto set;
- }
- if (subreq->len > ictx->zero_point - subreq->start)
- subreq->len = ictx->zero_point - subreq->start;
- }
- if (subreq->len > rreq->i_size - subreq->start)
- subreq->len = rreq->i_size - subreq->start;
- if (rreq->rsize && subreq->len > rreq->rsize)
- subreq->len = rreq->rsize;
-
- if (rreq->netfs_ops->clamp_length &&
- !rreq->netfs_ops->clamp_length(subreq)) {
- source = NETFS_INVALID_READ;
- goto out;
- }
-
- if (subreq->max_nr_segs) {
- lsize = netfs_limit_iter(io_iter, 0, subreq->len,
- subreq->max_nr_segs);
- if (subreq->len > lsize) {
- subreq->len = lsize;
- trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
- }
- }
- }
-
-set:
- if (subreq->len > rreq->len)
- pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
- rreq->debug_id, subreq->debug_index,
- subreq->len, rreq->len);
-
- if (WARN_ON(subreq->len == 0)) {
- source = NETFS_INVALID_READ;
- goto out;
- }
-
- subreq->source = source;
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
-
- subreq->io_iter = *io_iter;
- iov_iter_truncate(&subreq->io_iter, subreq->len);
- iov_iter_advance(io_iter, subreq->len);
-out:
- subreq->source = source;
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
- return source;
-}
-
-/*
- * Slice off a piece of a read request and submit an I/O request for it.
- */
-static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
- struct iov_iter *io_iter,
- unsigned int *_debug_index)
-{
- struct netfs_io_subrequest *subreq;
- enum netfs_io_source source;
-
- subreq = netfs_alloc_subrequest(rreq);
- if (!subreq)
- return false;
-
- subreq->debug_index = (*_debug_index)++;
- subreq->start = rreq->start + rreq->submitted;
- subreq->len = io_iter->count;
-
- _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
- list_add_tail(&subreq->rreq_link, &rreq->subrequests);
-
- /* Call out to the cache to find out what it can do with the remaining
- * subset. It tells us in subreq->flags what it decided should be done
- * and adjusts subreq->len down if the subset crosses a cache boundary.
- *
- * Then when we hand the subset, it can choose to take a subset of that
- * (the starts must coincide), in which case, we go around the loop
- * again and ask it to download the next piece.
- */
- source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
- if (source == NETFS_INVALID_READ)
- goto subreq_failed;
-
- atomic_inc(&rreq->nr_outstanding);
-
- rreq->submitted += subreq->len;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- switch (source) {
- case NETFS_FILL_WITH_ZEROES:
- netfs_fill_with_zeroes(rreq, subreq);
- break;
- case NETFS_DOWNLOAD_FROM_SERVER:
- netfs_read_from_server(rreq, subreq);
- break;
- case NETFS_READ_FROM_CACHE:
- netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);
- break;
- default:
- BUG();
- }
-
- return true;
-
-subreq_failed:
- rreq->error = subreq->error;
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed);
- return false;
-}
-
-/*
- * Begin the process of reading in a chunk of data, where that data may be
- * stitched together from multiple sources, including multiple servers and the
- * local cache.
- */
-int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
-{
- struct iov_iter io_iter;
- unsigned int debug_index = 0;
- int ret;
-
- _enter("R=%x %llx-%llx",
- rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
-
- if (rreq->len == 0) {
- pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
- return -EIO;
- }
-
- if (rreq->origin == NETFS_DIO_READ)
- inode_dio_begin(rreq->inode);
-
- // TODO: Use bounce buffer if requested
- rreq->io_iter = rreq->iter;
-
- INIT_WORK(&rreq->work, netfs_rreq_work);
-
- /* Chop the read into slices according to what the cache and the netfs
- * want and submit each one.
- */
- netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&rreq->nr_outstanding, 1);
- io_iter = rreq->io_iter;
- do {
- _debug("submit %llx + %zx >= %llx",
- rreq->start, rreq->submitted, rreq->i_size);
- if (rreq->origin == NETFS_DIO_READ &&
- rreq->start + rreq->submitted >= rreq->i_size)
- break;
- if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
- break;
- if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
- test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
- break;
-
- } while (rreq->submitted < rreq->len);
-
- if (!rreq->submitted) {
- netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
- if (rreq->origin == NETFS_DIO_READ)
- inode_dio_end(rreq->inode);
- ret = 0;
- goto out;
- }
-
- if (sync) {
- /* Keep nr_outstanding incremented so that the ref always
- * belongs to us, and the service code isn't punted off to a
- * random thread pool to process. Note that this might start
- * further work, such as writing to the cache.
- */
- wait_var_event(&rreq->nr_outstanding,
- atomic_read(&rreq->nr_outstanding) == 1);
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- netfs_rreq_assess(rreq, false);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
- wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
-
- ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len &&
- rreq->origin != NETFS_DIO_READ) {
- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
- ret = -EIO;
- }
- } else {
- /* If we decrement nr_outstanding to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- netfs_rreq_assess(rreq, false);
- ret = -EIOCBQUEUED;
- }
-
-out:
- return ret;
-}
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index b781bbbf1d8d..72a435e5fc6d 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -188,9 +188,59 @@ static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offse
return min(span, max_size);
}
+/*
+ * Select the span of a folio queue iterator we're going to use. Limit it by
+ * both maximum size and maximum number of segments. Returns the size of the
+ * span in bytes.
+ */
+static size_t netfs_limit_folioq(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ const struct folio_queue *folioq = iter->folioq;
+ unsigned int nsegs = 0;
+ unsigned int slot = iter->folioq_slot;
+ size_t span = 0, n = iter->count;
+
+ if (WARN_ON(!iov_iter_is_folioq(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+ max_size = umin(max_size, n - start_offset);
+
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = folioq->next;
+ slot = 0;
+ }
+
+ start_offset += iter->iov_offset;
+ do {
+ size_t flen = folioq_folio_size(folioq, slot);
+
+ if (start_offset < flen) {
+ span += flen - start_offset;
+ nsegs++;
+ start_offset = 0;
+ } else {
+ start_offset -= flen;
+ }
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = folioq->next;
+ slot = 0;
+ }
+ } while (folioq);
+
+ return umin(span, max_size);
+}
+
size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
size_t max_size, size_t max_segs)
{
+ if (iov_iter_is_folioq(iter))
+ return netfs_limit_folioq(iter, start_offset, max_size, max_segs);
if (iov_iter_is_bvec(iter))
return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
if (iov_iter_is_xarray(iter))
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
index 75dc52a49b3a..2249ecd09d0a 100644
--- a/fs/netfs/locking.c
+++ b/fs/netfs/locking.c
@@ -19,25 +19,13 @@
* Must be called under a lock that serializes taking new references
* to i_dio_count, usually by inode->i_mutex.
*/
-static int inode_dio_wait_interruptible(struct inode *inode)
+static int netfs_inode_dio_wait_interruptible(struct inode *inode)
{
- if (!atomic_read(&inode->i_dio_count))
+ if (inode_dio_finished(inode))
return 0;
- wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
- DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
-
- for (;;) {
- prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
- if (!atomic_read(&inode->i_dio_count))
- break;
- if (signal_pending(current))
- break;
- schedule();
- }
- finish_wait(wq, &q.wq_entry);
-
- return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+ inode_dio_wait_interruptible(inode);
+ return !inode_dio_finished(inode) ? -ERESTARTSYS : 0;
}
/* Call with exclusively locked inode->i_rwsem */
@@ -46,7 +34,7 @@ static int netfs_block_o_direct(struct netfs_inode *ictx)
if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
return 0;
clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
- return inode_dio_wait_interruptible(&ictx->inode);
+ return netfs_inode_dio_wait_interruptible(&ictx->inode);
}
/**
@@ -121,6 +109,7 @@ int netfs_start_io_write(struct inode *inode)
up_write(&inode->i_rwsem);
return -ERESTARTSYS;
}
+ downgrade_write(&inode->i_rwsem);
return 0;
}
EXPORT_SYMBOL(netfs_start_io_write);
@@ -135,7 +124,7 @@ EXPORT_SYMBOL(netfs_start_io_write);
void netfs_end_io_write(struct inode *inode)
__releases(inode->i_rwsem)
{
- up_write(&inode->i_rwsem);
+ up_read(&inode->i_rwsem);
}
EXPORT_SYMBOL(netfs_end_io_write);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5e77618a7940..70ecc8f5f210 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/mempool.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"
@@ -23,6 +24,11 @@ unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+static struct kmem_cache *netfs_request_slab;
+static struct kmem_cache *netfs_subrequest_slab;
+mempool_t netfs_request_pool;
+mempool_t netfs_subrequest_pool;
+
#ifdef CONFIG_PROC_FS
LIST_HEAD(netfs_io_requests);
DEFINE_SPINLOCK(netfs_proc_lock);
@@ -30,13 +36,16 @@ DEFINE_SPINLOCK(netfs_proc_lock);
static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READAHEAD] = "RA",
[NETFS_READPAGE] = "RP",
+ [NETFS_READ_GAPS] = "RG",
+ [NETFS_READ_SINGLE] = "R1",
[NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_DIO_READ] = "DR",
[NETFS_WRITEBACK] = "WB",
+ [NETFS_WRITEBACK_SINGLE] = "W1",
[NETFS_WRITETHROUGH] = "WT",
- [NETFS_LAUNDER_WRITE] = "LW",
[NETFS_UNBUFFERED_WRITE] = "UW",
- [NETFS_DIO_READ] = "DR",
[NETFS_DIO_WRITE] = "DW",
+ [NETFS_PGPRIV2_COPY_TO_CACHE] = "2C",
};
/*
@@ -56,13 +65,13 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
rreq = list_entry(v, struct netfs_io_request, proc_link);
seq_printf(m,
- "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx",
rreq->debug_id,
netfs_origins[rreq->origin],
refcount_read(&rreq->ref),
rreq->flags,
rreq->error,
- atomic_read(&rreq->nr_outstanding),
+ 0,
rreq->start, rreq->submitted, rreq->len);
seq_putc(m, '\n');
return 0;
@@ -98,25 +107,58 @@ static int __init netfs_init(void)
{
int ret = -ENOMEM;
+ netfs_request_slab = kmem_cache_create("netfs_request",
+ sizeof(struct netfs_io_request), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+ NULL);
+ if (!netfs_request_slab)
+ goto error_req;
+
+ if (mempool_init_slab_pool(&netfs_request_pool, 100, netfs_request_slab) < 0)
+ goto error_reqpool;
+
+ netfs_subrequest_slab = kmem_cache_create("netfs_subrequest",
+ sizeof(struct netfs_io_subrequest) + 16, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+ NULL);
+ if (!netfs_subrequest_slab)
+ goto error_subreq;
+
+ if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)
+ goto error_subreqpool;
+
+#ifdef CONFIG_PROC_FS
if (!proc_mkdir("fs/netfs", NULL))
- goto error;
+ goto error_proc;
if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
&netfs_requests_seq_ops))
- goto error_proc;
+ goto error_procfile;
+#endif
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
netfs_stats_show))
- goto error_proc;
+ goto error_procfile;
#endif
ret = fscache_init();
if (ret < 0)
- goto error_proc;
+ goto error_fscache;
return 0;
+error_fscache:
+#ifdef CONFIG_PROC_FS
+error_procfile:
+ remove_proc_subtree("fs/netfs", NULL);
error_proc:
- remove_proc_entry("fs/netfs", NULL);
-error:
+#endif
+ mempool_exit(&netfs_subrequest_pool);
+error_subreqpool:
+ kmem_cache_destroy(netfs_subrequest_slab);
+error_subreq:
+ mempool_exit(&netfs_request_pool);
+error_reqpool:
+ kmem_cache_destroy(netfs_request_slab);
+error_req:
return ret;
}
fs_initcall(netfs_init);
@@ -124,6 +166,10 @@ fs_initcall(netfs_init);
static void __exit netfs_exit(void)
{
fscache_exit();
- remove_proc_entry("fs/netfs", NULL);
+ remove_proc_subtree("fs/netfs", NULL);
+ mempool_exit(&netfs_subrequest_pool);
+ kmem_cache_destroy(netfs_subrequest_slab);
+ mempool_exit(&netfs_request_pool);
+ kmem_cache_destroy(netfs_request_slab);
}
module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 90051ced8e2a..7099aa07737a 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -8,85 +8,118 @@
#include <linux/swap.h>
#include "internal.h"
-/*
- * Attach a folio to the buffer and maybe set marks on it to say that we need
- * to put the folio later and twiddle the pagecache flags.
- */
-int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
- struct folio *folio, unsigned int flags,
- gfp_t gfp_mask)
-{
- XA_STATE_ORDER(xas, xa, index, folio_order(folio));
-
-retry:
- xas_lock(&xas);
- for (;;) {
- xas_store(&xas, folio);
- if (!xas_error(&xas))
- break;
- xas_unlock(&xas);
- if (!xas_nomem(&xas, gfp_mask))
- return xas_error(&xas);
- goto retry;
- }
-
- if (flags & NETFS_FLAG_PUT_MARK)
- xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
- if (flags & NETFS_FLAG_PAGECACHE_MARK)
- xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
- xas_unlock(&xas);
- return xas_error(&xas);
-}
-
-/*
- * Create the specified range of folios in the buffer attached to the read
- * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
- * these need freeing later.
+/**
+ * netfs_alloc_folioq_buffer - Allocate buffer space into a folio queue
+ * @mapping: Address space to set on the folio (or NULL).
+ * @_buffer: Pointer to the folio queue to add to (may point to a NULL; updated).
+ * @_cur_size: Current size of the buffer (updated).
+ * @size: Target size of the buffer.
+ * @gfp: The allocation constraints.
*/
-int netfs_add_folios_to_buffer(struct xarray *buffer,
- struct address_space *mapping,
- pgoff_t index, pgoff_t to, gfp_t gfp_mask)
+int netfs_alloc_folioq_buffer(struct address_space *mapping,
+ struct folio_queue **_buffer,
+ size_t *_cur_size, ssize_t size, gfp_t gfp)
{
- struct folio *folio;
- int ret;
+ struct folio_queue *tail = *_buffer, *p;
- if (to + 1 == index) /* Page range is inclusive */
+ size = round_up(size, PAGE_SIZE);
+ if (*_cur_size >= size)
return 0;
+ if (tail)
+ while (tail->next)
+ tail = tail->next;
+
do {
- /* TODO: Figure out what order folio can be allocated here */
- folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
+ struct folio *folio;
+ int order = 0, slot;
+
+ if (!tail || folioq_full(tail)) {
+ p = netfs_folioq_alloc(0, GFP_NOFS, netfs_trace_folioq_alloc_buffer);
+ if (!p)
+ return -ENOMEM;
+ if (tail) {
+ tail->next = p;
+ p->prev = tail;
+ } else {
+ *_buffer = p;
+ }
+ tail = p;
+ }
+
+ if (size - *_cur_size > PAGE_SIZE)
+ order = umin(ilog2(size - *_cur_size) - PAGE_SHIFT,
+ MAX_PAGECACHE_ORDER);
+
+ folio = folio_alloc(gfp, order);
+ if (!folio && order > 0)
+ folio = folio_alloc(gfp, 0);
if (!folio)
return -ENOMEM;
- folio->index = index;
- ret = netfs_xa_store_and_mark(buffer, index, folio,
- NETFS_FLAG_PUT_MARK, gfp_mask);
- if (ret < 0) {
- folio_put(folio);
- return ret;
- }
- index += folio_nr_pages(folio);
- } while (index <= to && index != 0);
+ folio->mapping = mapping;
+ folio->index = *_cur_size / PAGE_SIZE;
+ trace_netfs_folio(folio, netfs_folio_trace_alloc_buffer);
+ slot = folioq_append_mark(tail, folio);
+ *_cur_size += folioq_folio_size(tail, slot);
+ } while (*_cur_size < size);
return 0;
}
+EXPORT_SYMBOL(netfs_alloc_folioq_buffer);
-/*
- * Clear an xarray buffer, putting a ref on the folios that have
- * NETFS_BUF_PUT_MARK set.
+/**
+ * netfs_free_folioq_buffer - Free a folio queue.
+ * @fq: The start of the folio queue to free
+ *
+ * Free up a chain of folio_queues and, if marked, the marked folios they point
+ * to.
*/
-void netfs_clear_buffer(struct xarray *buffer)
+void netfs_free_folioq_buffer(struct folio_queue *fq)
{
- struct folio *folio;
- XA_STATE(xas, buffer, 0);
+ struct folio_queue *next;
+ struct folio_batch fbatch;
+
+ folio_batch_init(&fbatch);
- rcu_read_lock();
- xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
- folio_put(folio);
+ for (; fq; fq = next) {
+ for (int slot = 0; slot < folioq_count(fq); slot++) {
+ struct folio *folio = folioq_folio(fq, slot);
+
+ if (!folio ||
+ !folioq_is_marked(fq, slot))
+ continue;
+
+ trace_netfs_folio(folio, netfs_folio_trace_put);
+ if (folio_batch_add(&fbatch, folio))
+ folio_batch_release(&fbatch);
+ }
+
+ netfs_stat_d(&netfs_n_folioq);
+ next = fq->next;
+ kfree(fq);
}
- rcu_read_unlock();
- xa_destroy(buffer);
+
+ folio_batch_release(&fbatch);
+}
+EXPORT_SYMBOL(netfs_free_folioq_buffer);
+
+/*
+ * Reset the subrequest iterator to refer just to the region remaining to be
+ * read. The iterator may or may not have been advanced by socket ops or
+ * extraction ops to an extent that may or may not match the amount actually
+ * read.
+ */
+void netfs_reset_iter(struct netfs_io_subrequest *subreq)
+{
+ struct iov_iter *io_iter = &subreq->io_iter;
+ size_t remain = subreq->len - subreq->transferred;
+
+ if (io_iter->count > remain)
+ iov_iter_advance(io_iter, io_iter->count - remain);
+ else if (io_iter->count < remain)
+ iov_iter_revert(io_iter, remain - io_iter->count);
+ iov_iter_truncate(&subreq->io_iter, remain);
}
/**
@@ -177,12 +210,22 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback);
*/
void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- struct netfs_folio *finfo = NULL;
+ struct netfs_folio *finfo;
+ struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
size_t flen = folio_size(folio);
_enter("{%lx},%zx,%zx", folio->index, offset, length);
- folio_wait_fscache(folio);
+ if (offset == 0 && length == flen) {
+ unsigned long long i_size = i_size_read(&ctx->inode);
+ unsigned long long fpos = folio_pos(folio), end;
+
+ end = umin(fpos + flen, i_size);
+ if (fpos < i_size && end > ctx->zero_point)
+ ctx->zero_point = end;
+ }
+
+ folio_wait_private_2(folio); /* [DEPRECATED] */
if (!folio_test_private(folio))
return;
@@ -196,18 +239,34 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
/* We have a partially uptodate page from a streaming write. */
unsigned int fstart = finfo->dirty_offset;
unsigned int fend = fstart + finfo->dirty_len;
- unsigned int end = offset + length;
+ unsigned int iend = offset + length;
if (offset >= fend)
return;
- if (end <= fstart)
+ if (iend <= fstart)
+ return;
+
+ /* The invalidation region overlaps the data. If the region
+ * covers the start of the data, we either move along the start
+ * or just erase the data entirely.
+ */
+ if (offset <= fstart) {
+ if (iend >= fend)
+ goto erase_completely;
+ /* Move the start of the data. */
+ finfo->dirty_len = fend - iend;
+ finfo->dirty_offset = offset;
+ return;
+ }
+
+ /* Reduce the length of the data if the invalidation region
+ * covers the tail part.
+ */
+ if (iend >= fend) {
+ finfo->dirty_len = offset - fstart;
return;
- if (offset <= fstart && end >= fend)
- goto erase_completely;
- if (offset <= fstart && end > fstart)
- goto reduce_len;
- if (offset > fstart && end >= fend)
- goto move_start;
+ }
+
/* A partial write was split. The caller has already zeroed
* it, so just absorb the hole.
*/
@@ -220,12 +279,6 @@ erase_completely:
folio_clear_uptodate(folio);
kfree(finfo);
return;
-reduce_len:
- finfo->dirty_len = offset + length - finfo->dirty_offset;
- return;
-move_start:
- finfo->dirty_len -= offset - finfo->dirty_offset;
- finfo->dirty_offset = offset;
}
EXPORT_SYMBOL(netfs_invalidate_folio);
@@ -242,18 +295,20 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
unsigned long long end;
- end = folio_pos(folio) + folio_size(folio);
+ if (folio_test_dirty(folio))
+ return false;
+
+ end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode));
if (end > ctx->zero_point)
ctx->zero_point = end;
if (folio_test_private(folio))
return false;
- if (folio_test_fscache(folio)) {
+ if (unlikely(folio_test_private_2(folio))) { /* [DEPRECATED] */
if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio);
}
-
fscache_note_page_release(netfs_i_cookie(ctx));
return true;
}
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 610ceb5bd86c..dc6b41ef18b0 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -6,6 +6,8 @@
*/
#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/delay.h>
#include "internal.h"
/*
@@ -20,43 +22,59 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
- bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
- origin == NETFS_DIO_READ ||
- origin == NETFS_DIO_WRITE);
- bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
+ mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool;
+ struct kmem_cache *cache = mempool->pool_data;
int ret;
- rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
- GFP_KERNEL);
- if (!rreq)
- return ERR_PTR(-ENOMEM);
+ for (;;) {
+ rreq = mempool_alloc(mempool, GFP_KERNEL);
+ if (rreq)
+ break;
+ msleep(10);
+ }
+ memset(rreq, 0, kmem_cache_size(cache));
rreq->start = start;
rreq->len = len;
- rreq->upper_len = len;
rreq->origin = origin;
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
rreq->inode = inode;
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
- INIT_LIST_HEAD(&rreq->subrequests);
- INIT_WORK(&rreq->work, NULL);
+ rreq->wsize = INT_MAX;
+ rreq->io_streams[0].sreq_max_len = ULONG_MAX;
+ rreq->io_streams[0].sreq_max_segs = 0;
+ spin_lock_init(&rreq->lock);
+ INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
+ INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
+ init_waitqueue_head(&rreq->waitq);
refcount_set(&rreq->ref, 1);
+ if (origin == NETFS_READAHEAD ||
+ origin == NETFS_READPAGE ||
+ origin == NETFS_READ_GAPS ||
+ origin == NETFS_READ_SINGLE ||
+ origin == NETFS_READ_FOR_WRITE ||
+ origin == NETFS_DIO_READ) {
+ INIT_WORK(&rreq->work, netfs_read_collection_worker);
+ rreq->io_streams[0].avail = true;
+ } else {
+ INIT_WORK(&rreq->work, netfs_write_collection_worker);
+ }
+
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- if (cached)
- __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
if (file && file->f_flags & O_NONBLOCK)
__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
- kfree(rreq);
+ mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
return ERR_PTR(ret);
}
}
+ atomic_inc(&ctx->io_count);
trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
@@ -74,20 +92,34 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
{
struct netfs_io_subrequest *subreq;
-
- while (!list_empty(&rreq->subrequests)) {
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, was_async,
- netfs_sreq_trace_put_clear);
+ struct netfs_io_stream *stream;
+ int s;
+
+ for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) {
+ stream = &rreq->io_streams[s];
+ while (!list_empty(&stream->subrequests)) {
+ subreq = list_first_entry(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, was_async,
+ netfs_sreq_trace_put_clear);
+ }
}
}
+static void netfs_free_request_rcu(struct rcu_head *rcu)
+{
+ struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu);
+
+ mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
+ netfs_stat_d(&netfs_n_rh_rreq);
+}
+
static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
@@ -106,8 +138,11 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
- kfree_rcu(rreq, rcu);
- netfs_stat_d(&netfs_n_rh_rreq);
+ rolling_buffer_clear(&rreq->buffer);
+
+ if (atomic_dec_and_test(&ictx->io_count))
+ wake_up_var(&ictx->io_count);
+ call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
@@ -125,7 +160,7 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
if (was_async) {
rreq->work.func = netfs_free_request;
if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
+ WARN_ON(1);
} else {
netfs_free_request(&rreq->work);
}
@@ -139,19 +174,25 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
-
- subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
- sizeof(struct netfs_io_subrequest),
- GFP_KERNEL);
- if (subreq) {
- INIT_WORK(&subreq->work, NULL);
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->ref, 2);
- subreq->rreq = rreq;
- netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
- netfs_stat(&netfs_n_rh_sreq);
+ mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool;
+ struct kmem_cache *cache = mempool->pool_data;
+
+ for (;;) {
+ subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool,
+ GFP_KERNEL);
+ if (subreq)
+ break;
+ msleep(10);
}
+ memset(subreq, 0, kmem_cache_size(cache));
+ INIT_WORK(&subreq->work, NULL);
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
+ netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
+ netfs_stat(&netfs_n_rh_sreq);
return subreq;
}
@@ -173,7 +214,7 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
if (rreq->netfs_ops->free_subrequest)
rreq->netfs_ops->free_subrequest(subreq);
- kfree(subreq);
+ mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
}
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
deleted file mode 100644
index 625eb68f3e5a..000000000000
--- a/fs/netfs/output.c
+++ /dev/null
@@ -1,478 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
- *
- * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/writeback.h>
-#include <linux/pagevec.h>
-#include "internal.h"
-
-/**
- * netfs_create_write_request - Create a write operation.
- * @wreq: The write request this is storing from.
- * @dest: The destination type
- * @start: Start of the region this write will modify
- * @len: Length of the modification
- * @worker: The worker function to handle the write(s)
- *
- * Allocate a write operation, set it up and add it to the list on a write
- * request.
- */
-struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
- enum netfs_io_source dest,
- loff_t start, size_t len,
- work_func_t worker)
-{
- struct netfs_io_subrequest *subreq;
-
- subreq = netfs_alloc_subrequest(wreq);
- if (subreq) {
- INIT_WORK(&subreq->work, worker);
- subreq->source = dest;
- subreq->start = start;
- subreq->len = len;
- subreq->debug_index = wreq->subreq_counter++;
-
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload);
- break;
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write);
- break;
- default:
- BUG();
- }
-
- subreq->io_iter = wreq->io_iter;
- iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
- iov_iter_truncate(&subreq->io_iter, subreq->len);
-
- trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
- refcount_read(&subreq->ref),
- netfs_sreq_trace_new);
- atomic_inc(&wreq->nr_outstanding);
- list_add_tail(&subreq->rreq_link, &wreq->subrequests);
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
- }
-
- return subreq;
-}
-EXPORT_SYMBOL(netfs_create_write_request);
-
-/*
- * Process a completed write request once all the component operations have
- * been completed.
- */
-static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
- size_t transferred = 0;
-
- _enter("R=%x[]", wreq->debug_id);
-
- trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
-
- list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
- if (subreq->error || subreq->transferred == 0)
- break;
- transferred += subreq->transferred;
- if (subreq->transferred < subreq->len)
- break;
- }
- wreq->transferred = transferred;
-
- list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
- if (!subreq->error)
- continue;
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- /* Depending on the type of failure, this may prevent
- * writeback completion unless we're in disconnected
- * mode.
- */
- if (!wreq->error)
- wreq->error = subreq->error;
- break;
-
- case NETFS_WRITE_TO_CACHE:
- /* Failure doesn't prevent writeback completion unless
- * we're in disconnected mode.
- */
- if (subreq->error != -ENOBUFS)
- ctx->ops->invalidate_cache(wreq);
- break;
-
- default:
- WARN_ON_ONCE(1);
- if (!wreq->error)
- wreq->error = -EIO;
- return;
- }
- }
-
- wreq->cleanup(wreq);
-
- if (wreq->origin == NETFS_DIO_WRITE &&
- wreq->mapping->nrpages) {
- pgoff_t first = wreq->start >> PAGE_SHIFT;
- pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
- invalidate_inode_pages2_range(wreq->mapping, first, last);
- }
-
- if (wreq->origin == NETFS_DIO_WRITE)
- inode_dio_end(wreq->inode);
-
- _debug("finished");
- trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
- wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
-
- if (wreq->iocb) {
- wreq->iocb->ki_pos += transferred;
- if (wreq->iocb->ki_complete)
- wreq->iocb->ki_complete(
- wreq->iocb, wreq->error ? wreq->error : transferred);
- }
-
- netfs_clear_subrequests(wreq, was_async);
- netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
-}
-
-/*
- * Deal with the completion of writing the data to the cache.
- */
-void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = _op;
- struct netfs_io_request *wreq = subreq->rreq;
- unsigned int u;
-
- _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
-
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload_done);
- break;
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write_done);
- break;
- case NETFS_INVALID_WRITE:
- break;
- default:
- BUG();
- }
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- subreq->error = transferred_or_error;
- trace_netfs_failure(wreq, subreq, transferred_or_error,
- netfs_fail_write);
- goto failed;
- }
-
- if (WARN(transferred_or_error > subreq->len - subreq->transferred,
- "Subreq excess write: R%x[%x] %zd > %zu - %zu",
- wreq->debug_id, subreq->debug_index,
- transferred_or_error, subreq->len, subreq->transferred))
- transferred_or_error = subreq->len - subreq->transferred;
-
- subreq->error = 0;
- subreq->transferred += transferred_or_error;
-
- if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
- pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
- wreq->debug_id, subreq->debug_index,
- iov_iter_count(&subreq->io_iter), subreq->len,
- subreq->transferred, subreq->io_iter.iter_type);
-
- if (subreq->transferred < subreq->len)
- goto incomplete;
-
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
-out:
- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- /* If we decrement nr_outstanding to 0, the ref belongs to us. */
- u = atomic_dec_return(&wreq->nr_outstanding);
- if (u == 0)
- netfs_write_terminated(wreq, was_async);
- else if (u == 1)
- wake_up_var(&wreq->nr_outstanding);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
- return;
-
-incomplete:
- if (transferred_or_error == 0) {
- if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
- subreq->error = -ENODATA;
- goto failed;
- }
- } else {
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- }
-
- __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
- goto out;
-
-failed:
- switch (subreq->source) {
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write_failed);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
- break;
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload_failed);
- set_bit(NETFS_RREQ_FAILED, &wreq->flags);
- wreq->error = subreq->error;
- break;
- default:
- break;
- }
- goto out;
-}
-EXPORT_SYMBOL(netfs_write_subrequest_terminated);
-
-static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
-{
- struct netfs_io_request *wreq = subreq->rreq;
- struct netfs_cache_resources *cres = &wreq->cache_resources;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
-
- cres->ops->write(cres, subreq->start, &subreq->io_iter,
- netfs_write_subrequest_terminated, subreq);
-}
-
-static void netfs_write_to_cache_op_worker(struct work_struct *work)
-{
- struct netfs_io_subrequest *subreq =
- container_of(work, struct netfs_io_subrequest, work);
-
- netfs_write_to_cache_op(subreq);
-}
-
-/**
- * netfs_queue_write_request - Queue a write request for attention
- * @subreq: The write request to be queued
- *
- * Queue the specified write request for processing by a worker thread. We
- * pass the caller's ref on the request to the worker thread.
- */
-void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
-{
- if (!queue_work(system_unbound_wq, &subreq->work))
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
-}
-EXPORT_SYMBOL(netfs_queue_write_request);
-
-/*
- * Set up a op for writing to the cache.
- */
-static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
-{
- struct netfs_cache_resources *cres = &wreq->cache_resources;
- struct netfs_io_subrequest *subreq;
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
- struct fscache_cookie *cookie = netfs_i_cookie(ctx);
- loff_t start = wreq->start;
- size_t len = wreq->len;
- int ret;
-
- if (!fscache_cookie_enabled(cookie)) {
- clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
- return;
- }
-
- _debug("write to cache");
- ret = fscache_begin_write_operation(cres, cookie);
- if (ret < 0)
- return;
-
- ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
- i_size_read(wreq->inode), true);
- if (ret < 0)
- return;
-
- subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
- netfs_write_to_cache_op_worker);
- if (!subreq)
- return;
-
- netfs_write_to_cache_op(subreq);
-}
-
-/*
- * Begin the process of writing out a chunk of data.
- *
- * We are given a write request that holds a series of dirty regions and
- * (partially) covers a sequence of folios, all of which are present. The
- * pages must have been marked as writeback as appropriate.
- *
- * We need to perform the following steps:
- *
- * (1) If encrypting, create an output buffer and encrypt each block of the
- * data into it, otherwise the output buffer will point to the original
- * folios.
- *
- * (2) If the data is to be cached, set up a write op for the entire output
- * buffer to the cache, if the cache wants to accept it.
- *
- * (3) If the data is to be uploaded (ie. not merely cached):
- *
- * (a) If the data is to be compressed, create a compression buffer and
- * compress the data into it.
- *
- * (b) For each destination we want to upload to, set up write ops to write
- * to that destination. We may need multiple writes if the data is not
- * contiguous or the span exceeds wsize for a server.
- */
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
- enum netfs_write_trace what)
-{
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
-
- _enter("R=%x %llx-%llx f=%lx",
- wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
- wreq->flags);
-
- trace_netfs_write(wreq, what);
- if (wreq->len == 0 || wreq->iter.count == 0) {
- pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
- return -EIO;
- }
-
- if (wreq->origin == NETFS_DIO_WRITE)
- inode_dio_begin(wreq->inode);
-
- wreq->io_iter = wreq->iter;
-
- /* ->outstanding > 0 carries a ref */
- netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&wreq->nr_outstanding, 1);
-
- /* Start the encryption/compression going. We can do that in the
- * background whilst we generate a list of write ops that we want to
- * perform.
- */
- // TODO: Encrypt or compress the region as appropriate
-
- /* We need to write all of the region to the cache */
- if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
- netfs_set_up_write_to_cache(wreq);
-
- /* However, we don't necessarily write all of the region to the server.
- * Caching of reads is being managed this way also.
- */
- if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
- ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
-
- if (atomic_dec_and_test(&wreq->nr_outstanding))
- netfs_write_terminated(wreq, false);
-
- if (!may_wait)
- return -EIOCBQUEUED;
-
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- return wreq->error;
-}
-
-/*
- * Begin a write operation for writing through the pagecache.
- */
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
-{
- struct netfs_io_request *wreq;
- struct file *file = iocb->ki_filp;
-
- wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
- NETFS_WRITETHROUGH);
- if (IS_ERR(wreq))
- return wreq;
-
- trace_netfs_write(wreq, netfs_write_trace_writethrough);
-
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
- wreq->io_iter = wreq->iter;
-
- /* ->outstanding > 0 carries a ref */
- netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&wreq->nr_outstanding, 1);
- return wreq;
-}
-
-static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
-{
- struct netfs_inode *ictx = netfs_inode(wreq->inode);
- unsigned long long start;
- size_t len;
-
- if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
- return;
-
- start = wreq->start + wreq->submitted;
- len = wreq->iter.count - wreq->submitted;
- if (!final) {
- len /= wreq->wsize; /* Round to number of maximum packets */
- len *= wreq->wsize;
- }
-
- ictx->ops->create_write_requests(wreq, start, len);
- wreq->submitted += len;
-}
-
-/*
- * Advance the state of the write operation used when writing through the
- * pagecache. Data has been copied into the pagecache that we need to append
- * to the request. If we've added more than wsize then we need to create a new
- * subrequest.
- */
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
-{
- _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
- wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
-
- wreq->iter.count += copied;
- wreq->io_iter.count += copied;
- if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
- netfs_submit_writethrough(wreq, false);
-
- return wreq->error;
-}
-
-/*
- * End a write operation used when writing through the pagecache.
- */
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
-{
- int ret = -EIOCBQUEUED;
-
- _enter("ic=%zu sb=%zu ws=%u",
- wreq->iter.count, wreq->submitted, wreq->wsize);
-
- if (wreq->submitted < wreq->io_iter.count)
- netfs_submit_writethrough(wreq, true);
-
- if (atomic_dec_and_test(&wreq->nr_outstanding))
- netfs_write_terminated(wreq, false);
-
- if (is_sync_kiocb(iocb)) {
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- ret = wreq->error;
- }
-
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- return ret;
-}
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
new file mode 100644
index 000000000000..23c75755ad4e
--- /dev/null
+++ b/fs/netfs/read_collect.c
@@ -0,0 +1,706 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem read subrequest result collection, assessment and
+ * retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/* Notes made in the collector */
+#define HIT_PENDING 0x01 /* A front op was still pending */
+#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */
+#define BUFFERED 0x08 /* The pagecache needs cleaning up */
+#define NEED_RETRY 0x10 /* A front op requests retrying */
+#define COPY_TO_CACHE 0x40 /* Need to copy subrequest to cache */
+#define ABANDON_SREQ 0x80 /* Need to abandon untransferred part of subrequest */
+
+/*
+ * Clear the unread part of an I/O request.
+ */
+static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
+{
+ netfs_reset_iter(subreq);
+ WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
+ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
+ if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
+ __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
+}
+
+/*
+ * Flush, mark and unlock a folio that's now completely read. If we want to
+ * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
+ * dirty and let writeback handle it.
+ */
+static void netfs_unlock_read_folio(struct netfs_io_request *rreq,
+ struct folio_queue *folioq,
+ int slot)
+{
+ struct netfs_folio *finfo;
+ struct folio *folio = folioq_folio(folioq, slot);
+
+ if (unlikely(folio_pos(folio) < rreq->abandon_to)) {
+ trace_netfs_folio(folio, netfs_folio_trace_abandon);
+ goto just_unlock;
+ }
+
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+
+ if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
+
+ if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags)) {
+ if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
+ folio_mark_dirty(folio);
+ }
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_done);
+ }
+
+ folioq_clear(folioq, slot);
+ } else {
+ // TODO: Use of PG_private_2 is deprecated.
+ if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags))
+ netfs_pgpriv2_copy_to_cache(rreq, folio);
+ }
+
+just_unlock:
+ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+ if (folio->index == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
+ _debug("no unlock");
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
+ folio_unlock(folio);
+ }
+ }
+
+ folioq_clear(folioq, slot);
+}
+
+/*
+ * Unlock any folios we've finished with.
+ */
+static void netfs_read_unlock_folios(struct netfs_io_request *rreq,
+ unsigned int *notes)
+{
+ struct folio_queue *folioq = rreq->buffer.tail;
+ unsigned long long collected_to = rreq->collected_to;
+ unsigned int slot = rreq->buffer.first_tail_slot;
+
+ if (rreq->cleaned_to >= rreq->collected_to)
+ return;
+
+ // TODO: Begin decryption
+
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = rolling_buffer_delete_spent(&rreq->buffer);
+ if (!folioq) {
+ rreq->front_folio_order = 0;
+ return;
+ }
+ slot = 0;
+ }
+
+ for (;;) {
+ struct folio *folio;
+ unsigned long long fpos, fend;
+ unsigned int order;
+ size_t fsize;
+
+ if (*notes & COPY_TO_CACHE)
+ set_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
+
+ folio = folioq_folio(folioq, slot);
+ if (WARN_ONCE(!folio_test_locked(folio),
+ "R=%08x: folio %lx is not locked\n",
+ rreq->debug_id, folio->index))
+ trace_netfs_folio(folio, netfs_folio_trace_not_locked);
+
+ order = folioq_folio_order(folioq, slot);
+ rreq->front_folio_order = order;
+ fsize = PAGE_SIZE << order;
+ fpos = folio_pos(folio);
+ fend = umin(fpos + fsize, rreq->i_size);
+
+ trace_netfs_collect_folio(rreq, folio, fend, collected_to);
+
+ /* Unlock any folio we've transferred all of. */
+ if (collected_to < fend)
+ break;
+
+ netfs_unlock_read_folio(rreq, folioq, slot);
+ WRITE_ONCE(rreq->cleaned_to, fpos + fsize);
+ *notes |= MADE_PROGRESS;
+
+ clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
+
+ /* Clean up the head folioq. If we clear an entire folioq, then
+ * we can get rid of it provided it's not also the tail folioq
+ * being filled by the issuer.
+ */
+ folioq_clear(folioq, slot);
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = rolling_buffer_delete_spent(&rreq->buffer);
+ if (!folioq)
+ goto done;
+ slot = 0;
+ trace_netfs_folioq(folioq, netfs_trace_folioq_read_progress);
+ }
+
+ if (fpos + fsize >= collected_to)
+ break;
+ }
+
+ rreq->buffer.tail = folioq;
+done:
+ rreq->buffer.first_tail_slot = slot;
+}
+
+/*
+ * Collect and assess the results of various read subrequests. We may need to
+ * retry some of the results.
+ *
+ * Note that we have a sequence of subrequests, which may be drawing on
+ * different sources and may or may not be the same size or starting position
+ * and may not even correspond in boundary alignment.
+ */
+static void netfs_collect_read_results(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *front, *remove;
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ unsigned int notes;
+
+ _enter("%llx-%llx", rreq->start, rreq->start + rreq->len);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_collect);
+ trace_netfs_collect(rreq);
+
+reassess:
+ if (rreq->origin == NETFS_READAHEAD ||
+ rreq->origin == NETFS_READPAGE ||
+ rreq->origin == NETFS_READ_FOR_WRITE)
+ notes = BUFFERED;
+ else
+ notes = 0;
+
+ /* Remove completed subrequests from the front of the stream and
+ * advance the completion point. We stop when we hit something that's
+ * in progress. The issuer thread may be adding stuff to the tail
+ * whilst we're doing this.
+ */
+ front = READ_ONCE(stream->front);
+ while (front) {
+ size_t transferred;
+
+ trace_netfs_collect_sreq(rreq, front);
+ _debug("sreq [%x] %llx %zx/%zx",
+ front->debug_index, front->start, front->transferred, front->len);
+
+ if (stream->collected_to < front->start) {
+ trace_netfs_collect_gap(rreq, stream, front->start, 'F');
+ stream->collected_to = front->start;
+ }
+
+ if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags))
+ notes |= HIT_PENDING;
+ smp_rmb(); /* Read counters after IN_PROGRESS flag. */
+ transferred = READ_ONCE(front->transferred);
+
+ /* If we can now collect the next folio, do so. We don't want
+ * to defer this as we have to decide whether we need to copy
+ * to the cache or not, and that may differ between adjacent
+ * subreqs.
+ */
+ if (notes & BUFFERED) {
+ size_t fsize = PAGE_SIZE << rreq->front_folio_order;
+
+ /* Clear the tail of a short read. */
+ if (!(notes & HIT_PENDING) &&
+ front->error == 0 &&
+ transferred < front->len &&
+ (test_bit(NETFS_SREQ_HIT_EOF, &front->flags) ||
+ test_bit(NETFS_SREQ_CLEAR_TAIL, &front->flags))) {
+ netfs_clear_unread(front);
+ transferred = front->transferred = front->len;
+ trace_netfs_sreq(front, netfs_sreq_trace_clear);
+ }
+
+ stream->collected_to = front->start + transferred;
+ rreq->collected_to = stream->collected_to;
+
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &front->flags))
+ notes |= COPY_TO_CACHE;
+
+ if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
+ rreq->abandon_to = front->start + front->len;
+ front->transferred = front->len;
+ transferred = front->len;
+ trace_netfs_rreq(rreq, netfs_rreq_trace_set_abandon);
+ }
+ if (front->start + transferred >= rreq->cleaned_to + fsize ||
+ test_bit(NETFS_SREQ_HIT_EOF, &front->flags))
+ netfs_read_unlock_folios(rreq, &notes);
+ } else {
+ stream->collected_to = front->start + transferred;
+ rreq->collected_to = stream->collected_to;
+ }
+
+ /* Stall if the front is still undergoing I/O. */
+ if (notes & HIT_PENDING)
+ break;
+
+ if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
+ if (!stream->failed) {
+ stream->error = front->error;
+ rreq->error = front->error;
+ set_bit(NETFS_RREQ_FAILED, &rreq->flags);
+ stream->failed = true;
+ }
+ notes |= MADE_PROGRESS | ABANDON_SREQ;
+ } else if (test_bit(NETFS_SREQ_NEED_RETRY, &front->flags)) {
+ stream->need_retry = true;
+ notes |= NEED_RETRY | MADE_PROGRESS;
+ break;
+ } else {
+ if (!stream->failed)
+ stream->transferred = stream->collected_to - rreq->start;
+ notes |= MADE_PROGRESS;
+ }
+
+ /* Remove if completely consumed. */
+ stream->source = front->source;
+ spin_lock(&rreq->lock);
+
+ remove = front;
+ trace_netfs_sreq(front, netfs_sreq_trace_discard);
+ list_del_init(&front->rreq_link);
+ front = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ stream->front = front;
+ spin_unlock(&rreq->lock);
+ netfs_put_subrequest(remove, false,
+ notes & ABANDON_SREQ ?
+ netfs_sreq_trace_put_abandon :
+ netfs_sreq_trace_put_done);
+ }
+
+ trace_netfs_collect_stream(rreq, stream);
+ trace_netfs_collect_state(rreq, rreq->collected_to, notes);
+
+ if (!(notes & BUFFERED))
+ rreq->cleaned_to = rreq->collected_to;
+
+ if (notes & NEED_RETRY)
+ goto need_retry;
+ if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_unpause);
+ clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags);
+ smp_mb__after_atomic(); /* Set PAUSE before task state */
+ wake_up(&rreq->waitq);
+ }
+
+ if (notes & MADE_PROGRESS) {
+ //cond_resched();
+ goto reassess;
+ }
+
+out:
+ _leave(" = %x", notes);
+ return;
+
+need_retry:
+ /* Okay... We're going to have to retry parts of the stream. Note
+ * that any partially completed op will have had any wholly transferred
+ * folios removed from it.
+ */
+ _debug("retry");
+ netfs_retry_reads(rreq);
+ goto out;
+}
+
+/*
+ * Do page flushing and suchlike after DIO.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ unsigned int i;
+
+ /* Collect unbuffered reads and direct reads, adding up the transfer
+ * sizes until we find the first short or failed subrequest.
+ */
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ rreq->transferred += subreq->transferred;
+
+ if (subreq->transferred < subreq->len ||
+ test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
+ rreq->error = subreq->error;
+ break;
+ }
+ }
+
+ if (rreq->origin == NETFS_DIO_READ) {
+ for (i = 0; i < rreq->direct_bv_count; i++) {
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+ // TODO: cifs marks pages in the destination buffer
+ // dirty under some circumstances after a read. Do we
+ // need to do that too?
+ set_page_dirty(rreq->direct_bv[i].bv_page);
+ }
+ }
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += rreq->transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_end(rreq->inode);
+}
+
+/*
+ * Do processing after reading a monolithic single object.
+ */
+static void netfs_rreq_assess_single(struct netfs_io_request *rreq)
+{
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+
+ if (!rreq->error && stream->source == NETFS_DOWNLOAD_FROM_SERVER &&
+ fscache_resources_valid(&rreq->cache_resources)) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_dirty);
+ netfs_single_mark_inode_dirty(rreq->inode);
+ }
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += rreq->transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+}
+
+/*
+ * Perform the collection of subrequests and folios.
+ *
+ * Note that we're in normal kernel thread context at this point, possibly
+ * running on a workqueue.
+ */
+static void netfs_read_collection(struct netfs_io_request *rreq)
+{
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+
+ netfs_collect_read_results(rreq);
+
+ /* We're done when the app thread has finished posting subreqs and the
+ * queue is empty.
+ */
+ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags))
+ return;
+ smp_rmb(); /* Read ALL_QUEUED before subreq lists. */
+
+ if (!list_empty(&stream->subrequests))
+ return;
+
+ /* Okay, declare that all I/O is complete. */
+ rreq->transferred = stream->transferred;
+ trace_netfs_rreq(rreq, netfs_rreq_trace_complete);
+
+ //netfs_rreq_is_still_valid(rreq);
+
+ switch (rreq->origin) {
+ case NETFS_DIO_READ:
+ case NETFS_READ_GAPS:
+ netfs_rreq_assess_dio(rreq);
+ break;
+ case NETFS_READ_SINGLE:
+ netfs_rreq_assess_single(rreq);
+ break;
+ default:
+ break;
+ }
+ task_io_account_read(rreq->transferred);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
+ clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_done);
+ netfs_clear_subrequests(rreq, false);
+ netfs_unlock_abandoned_read_pages(rreq);
+ if (unlikely(rreq->copy_to_cache))
+ netfs_pgpriv2_end_copy_to_cache(rreq);
+}
+
+void netfs_read_collection_worker(struct work_struct *work)
+{
+ struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);
+
+ netfs_see_request(rreq, netfs_rreq_trace_see_work);
+ if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
+ netfs_read_collection(rreq);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_work);
+}
+
+/*
+ * Wake the collection work item.
+ */
+void netfs_wake_read_collector(struct netfs_io_request *rreq)
+{
+ if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
+ !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
+ if (!work_pending(&rreq->work)) {
+ netfs_get_request(rreq, netfs_rreq_trace_get_work);
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq);
+ }
+ } else {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
+ wake_up(&rreq->waitq);
+ }
+}
+
+/**
+ * netfs_read_subreq_progress - Note progress of a read operation.
+ * @subreq: The read request that has terminated.
+ *
+ * This tells the read side of netfs lib that a contributory I/O operation has
+ * made some progress and that it may be possible to unlock some folios.
+ *
+ * Before calling, the filesystem should update subreq->transferred to track
+ * the amount of data copied into the output buffer.
+ */
+void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ size_t fsize = PAGE_SIZE << rreq->front_folio_order;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_progress);
+
+ /* If we are at the head of the queue, wake up the collector,
+ * getting a ref to it if we were the ones to do so.
+ */
+ if (subreq->start + subreq->transferred > rreq->cleaned_to + fsize &&
+ (rreq->origin == NETFS_READAHEAD ||
+ rreq->origin == NETFS_READPAGE ||
+ rreq->origin == NETFS_READ_FOR_WRITE) &&
+ list_is_first(&subreq->rreq_link, &stream->subrequests)
+ ) {
+ __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+ netfs_wake_read_collector(rreq);
+ }
+}
+EXPORT_SYMBOL(netfs_read_subreq_progress);
+
+/**
+ * netfs_read_subreq_terminated - Note the termination of an I/O operation.
+ * @subreq: The I/O request that has terminated.
+ *
+ * This tells the read helper that a contributory I/O operation has terminated,
+ * one way or another, and that it should integrate the results.
+ *
+ * The caller indicates the outcome of the operation through @subreq->error,
+ * supplying 0 to indicate a successful or retryable transfer (if
+ * NETFS_SREQ_NEED_RETRY is set) or a negative error code. The helper will
+ * look after reissuing I/O operations as appropriate and writing downloaded
+ * data to the cache.
+ *
+ * Before calling, the filesystem should update subreq->transferred to track
+ * the amount of data copied into the output buffer.
+ */
+void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+
+ switch (subreq->source) {
+ case NETFS_READ_FROM_CACHE:
+ netfs_stat(&netfs_n_rh_read_done);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_stat(&netfs_n_rh_download_done);
+ break;
+ default:
+ break;
+ }
+
+ /* Deal with retry requests, short reads and errors. If we retry
+ * but don't make progress, we abandon the attempt.
+ */
+ if (!subreq->error && subreq->transferred < subreq->len) {
+ if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
+ } else if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_need_clear);
+ } else if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_need_retry);
+ } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
+ __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_partial_read);
+ } else {
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ subreq->error = -ENODATA;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_short);
+ }
+ }
+
+ if (unlikely(subreq->error < 0)) {
+ trace_netfs_failure(rreq, subreq, subreq->error, netfs_fail_read);
+ if (subreq->source == NETFS_READ_FROM_CACHE) {
+ netfs_stat(&netfs_n_rh_read_failed);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ } else {
+ netfs_stat(&netfs_n_rh_download_failed);
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ }
+ trace_netfs_rreq(rreq, netfs_rreq_trace_set_pause);
+ set_bit(NETFS_RREQ_PAUSE, &rreq->flags);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */
+
+ /* If we are at the head of the queue, wake up the collector. */
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests) ||
+ test_bit(NETFS_RREQ_RETRYING, &rreq->flags))
+ netfs_wake_read_collector(rreq);
+
+ netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(netfs_read_subreq_terminated);
+
+/*
+ * Handle termination of a read from the cache.
+ */
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async)
+{
+ struct netfs_io_subrequest *subreq = priv;
+
+ if (transferred_or_error > 0) {
+ subreq->error = 0;
+ if (transferred_or_error > 0) {
+ subreq->transferred += transferred_or_error;
+ __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+ }
+ } else {
+ subreq->error = transferred_or_error;
+ }
+ netfs_read_subreq_terminated(subreq);
+}
+
+/*
+ * Wait for the read operation to complete, successfully or otherwise.
+ */
+ssize_t netfs_wait_for_read(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ DEFINE_WAIT(myself);
+ ssize_t ret;
+
+ for (;;) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ subreq = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ if (subreq &&
+ (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
+ test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
+ __set_current_state(TASK_RUNNING);
+ netfs_read_collection(rreq);
+ continue;
+ }
+
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
+ break;
+
+ schedule();
+ trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
+ }
+
+ finish_wait(&rreq->waitq, &myself);
+
+ ret = rreq->error;
+ if (ret == 0) {
+ ret = rreq->transferred;
+ switch (rreq->origin) {
+ case NETFS_DIO_READ:
+ case NETFS_READ_SINGLE:
+ ret = rreq->transferred;
+ break;
+ default:
+ if (rreq->submitted < rreq->len) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+ ret = -EIO;
+ }
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Wait for a paused read operation to unpause or complete in some manner.
+ */
+void netfs_wait_for_pause(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ DEFINE_WAIT(myself);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);
+
+ for (;;) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
+ subreq = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ if (subreq &&
+ (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
+ test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
+ __set_current_state(TASK_RUNNING);
+ netfs_read_collection(rreq);
+ continue;
+ }
+ }
+
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) ||
+ !test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
+ break;
+
+ schedule();
+ trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
+ }
+
+ finish_wait(&rreq->waitq, &myself);
+}
diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
new file mode 100644
index 000000000000..cf7727060215
--- /dev/null
+++ b/fs/netfs/read_pgpriv2.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Read with PG_private_2 [DEPRECATED].
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * [DEPRECATED] Copy a folio to the cache with PG_private_2 set.
+ */
+static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio *folio)
+{
+ struct netfs_io_stream *cache = &creq->io_streams[1];
+ size_t fsize = folio_size(folio), flen = fsize;
+ loff_t fpos = folio_pos(folio), i_size;
+ bool to_eof = false;
+
+ _enter("");
+
+ /* netfs_perform_write() may shift i_size around the page or from out
+ * of the page to beyond it, but cannot move i_size into or through the
+ * page since we have it locked.
+ */
+ i_size = i_size_read(creq->inode);
+
+ if (fpos >= i_size) {
+ /* mmap beyond eof. */
+ _debug("beyond eof");
+ folio_end_private_2(folio);
+ return;
+ }
+
+ if (fpos + fsize > creq->i_size)
+ creq->i_size = i_size;
+
+ if (flen > i_size - fpos) {
+ flen = i_size - fpos;
+ to_eof = true;
+ } else if (flen == i_size - fpos) {
+ to_eof = true;
+ }
+
+ _debug("folio %zx %zx", flen, fsize);
+
+ trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+
+ /* Attach the folio to the rolling buffer. */
+ if (rolling_buffer_append(&creq->buffer, folio, 0) < 0) {
+ clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &creq->flags);
+ return;
+ }
+
+ cache->submit_extendable_to = fsize;
+ cache->submit_off = 0;
+ cache->submit_len = flen;
+
+ /* Attach the folio to one or more subrequests. For a big folio, we
+ * could end up with thousands of subrequests if the wsize is small -
+ * but we might need to wait during the creation of subrequests for
+ * network resources (eg. SMB credits).
+ */
+ do {
+ ssize_t part;
+
+ creq->buffer.iter.iov_offset = cache->submit_off;
+
+ atomic64_set(&creq->issued_to, fpos + cache->submit_off);
+ cache->submit_extendable_to = fsize - cache->submit_off;
+ part = netfs_advance_write(creq, cache, fpos + cache->submit_off,
+ cache->submit_len, to_eof);
+ cache->submit_off += part;
+ if (part > cache->submit_len)
+ cache->submit_len = 0;
+ else
+ cache->submit_len -= part;
+ } while (cache->submit_len > 0);
+
+ creq->buffer.iter.iov_offset = 0;
+ rolling_buffer_advance(&creq->buffer, fsize);
+ atomic64_set(&creq->issued_to, fpos + fsize);
+
+ if (flen < fsize)
+ netfs_issue_write(creq, cache);
+}
+
+/*
+ * [DEPRECATED] Set up copying to the cache.
+ */
+static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(
+ struct netfs_io_request *rreq, struct folio *folio)
+{
+ struct netfs_io_request *creq;
+
+ if (!fscache_resources_valid(&rreq->cache_resources))
+ goto cancel;
+
+ creq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio),
+ NETFS_PGPRIV2_COPY_TO_CACHE);
+ if (IS_ERR(creq))
+ goto cancel;
+
+ if (!creq->io_streams[1].avail)
+ goto cancel_put;
+
+ trace_netfs_write(creq, netfs_write_trace_copy_to_cache);
+ netfs_stat(&netfs_n_wh_copy_to_cache);
+ rreq->copy_to_cache = creq;
+ return creq;
+
+cancel_put:
+ netfs_put_request(creq, false, netfs_rreq_trace_put_return);
+cancel:
+ rreq->copy_to_cache = ERR_PTR(-ENOBUFS);
+ clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
+ return ERR_PTR(-ENOBUFS);
+}
+
+/*
+ * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2 and add
+ * it to the copy write request.
+ */
+void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio)
+{
+ struct netfs_io_request *creq = rreq->copy_to_cache;
+
+ if (!creq)
+ creq = netfs_pgpriv2_begin_copy_to_cache(rreq, folio);
+ if (IS_ERR(creq))
+ return;
+
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_start_private_2(folio);
+ netfs_pgpriv2_copy_folio(creq, folio);
+}
+
+/*
+ * [DEPRECATED] End writing to the cache, flushing out any outstanding writes.
+ */
+void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq)
+{
+ struct netfs_io_request *creq = rreq->copy_to_cache;
+
+ if (IS_ERR_OR_NULL(creq))
+ return;
+
+ netfs_issue_write(creq, &creq->io_streams[1]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags);
+
+ netfs_put_request(creq, false, netfs_rreq_trace_put_return);
+ creq->copy_to_cache = NULL;
+}
+
+/*
+ * [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished
+ * copying.
+ */
+bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *creq)
+{
+ struct folio_queue *folioq = creq->buffer.tail;
+ unsigned long long collected_to = creq->collected_to;
+ unsigned int slot = creq->buffer.first_tail_slot;
+ bool made_progress = false;
+
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = rolling_buffer_delete_spent(&creq->buffer);
+ slot = 0;
+ }
+
+ for (;;) {
+ struct folio *folio;
+ unsigned long long fpos, fend;
+ size_t fsize, flen;
+
+ folio = folioq_folio(folioq, slot);
+ if (WARN_ONCE(!folio_test_private_2(folio),
+ "R=%08x: folio %lx is not marked private_2\n",
+ creq->debug_id, folio->index))
+ trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
+
+ fpos = folio_pos(folio);
+ fsize = folio_size(folio);
+ flen = fsize;
+
+ fend = min_t(unsigned long long, fpos + flen, creq->i_size);
+
+ trace_netfs_collect_folio(creq, folio, fend, collected_to);
+
+ /* Unlock any folio we've transferred all of. */
+ if (collected_to < fend)
+ break;
+
+ trace_netfs_folio(folio, netfs_folio_trace_end_copy);
+ folio_end_private_2(folio);
+ creq->cleaned_to = fpos + fsize;
+ made_progress = true;
+
+ /* Clean up the head folioq. If we clear an entire folioq, then
+ * we can get rid of it provided it's not also the tail folioq
+ * being filled by the issuer.
+ */
+ folioq_clear(folioq, slot);
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = rolling_buffer_delete_spent(&creq->buffer);
+ if (!folioq)
+ goto done;
+ slot = 0;
+ }
+
+ if (fpos + fsize >= collected_to)
+ break;
+ }
+
+ creq->buffer.tail = folioq;
+done:
+ creq->buffer.first_tail_slot = slot;
+ return made_progress;
+}
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
new file mode 100644
index 000000000000..0f294b26e08c
--- /dev/null
+++ b/fs/netfs/read_retry.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem read subrequest retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static void netfs_reissue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_stat(&netfs_n_rh_retry_read_subreq);
+ subreq->rreq->netfs_ops->issue_read(subreq);
+}
+
+/*
+ * Go through the list of failed/short reads, retrying all retryable ones. We
+ * need to switch failed cache reads to network downloads.
+ */
+static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ struct list_head *next;
+
+ _enter("R=%x", rreq->debug_id);
+
+ if (list_empty(&stream->subrequests))
+ return;
+
+ if (rreq->netfs_ops->retry_request)
+ rreq->netfs_ops->retry_request(rreq, NULL);
+
+ /* If there's no renegotiation to do, just resend each retryable subreq
+ * up to the first permanently failed one.
+ */
+ if (!rreq->netfs_ops->prepare_read &&
+ !rreq->cache_resources.ops) {
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ break;
+ if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+ __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+ subreq->retry_count++;
+ netfs_reset_iter(subreq);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_read(rreq, subreq);
+ }
+ }
+ return;
+ }
+
+ /* Okay, we need to renegotiate all the download requests and flip any
+ * failed cache reads over to being download requests and negotiate
+ * those also. All fully successful subreqs have been removed from the
+ * list and any spare data from those has been donated.
+ *
+ * What we do is decant the list and rebuild it one subreq at a time so
+ * that we don't end up with donations jumping over a gap we're busy
+ * populating with smaller subrequests. In the event that the subreq
+ * we just launched finishes before we insert the next subreq, it'll
+ * fill in rreq->prev_donated instead.
+ *
+ * Note: Alternatively, we could split the tail subrequest right before
+ * we reissue it and fix up the donations under lock.
+ */
+ next = stream->subrequests.next;
+
+ do {
+ struct netfs_io_subrequest *from, *to, *tmp;
+ struct iov_iter source;
+ unsigned long long start, len;
+ size_t part;
+ bool boundary = false, subreq_superfluous = false;
+
+ /* Go through the subreqs and find the next span of contiguous
+ * buffer that we then rejig (cifs, for example, needs the
+ * rsize renegotiating) and reissue.
+ */
+ from = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ to = from;
+ start = from->start + from->transferred;
+ len = from->len - from->transferred;
+
+ _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx",
+ rreq->debug_id, from->debug_index,
+ from->start, from->transferred, from->len);
+
+ if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
+ goto abandon;
+
+ list_for_each_continue(next, &stream->subrequests) {
+ subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ if (subreq->start + subreq->transferred != start + len ||
+ test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
+ break;
+ to = subreq;
+ len += to->len;
+ }
+
+ _debug(" - range: %llx-%llx %llx", start, start + len - 1, len);
+
+ /* Determine the set of buffers we're going to use. Each
+ * subreq gets a subset of a single overall contiguous buffer.
+ */
+ netfs_reset_iter(from);
+ source = from->io_iter;
+ source.count = len;
+
+ /* Work through the sublist. */
+ subreq = from;
+ list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
+ if (!len) {
+ subreq_superfluous = true;
+ break;
+ }
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->start = start - subreq->transferred;
+ subreq->len = len + subreq->transferred;
+ __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+ subreq->retry_count++;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+
+ /* Renegotiate max_len (rsize) */
+ stream->sreq_max_len = subreq->len;
+ if (rreq->netfs_ops->prepare_read &&
+ rreq->netfs_ops->prepare_read(subreq) < 0) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed);
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ goto abandon;
+ }
+
+ part = umin(len, stream->sreq_max_len);
+ if (unlikely(stream->sreq_max_segs))
+ part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs);
+ subreq->len = subreq->transferred + part;
+ subreq->io_iter = source;
+ iov_iter_truncate(&subreq->io_iter, part);
+ iov_iter_advance(&source, part);
+ len -= part;
+ start += part;
+ if (!len) {
+ if (boundary)
+ __set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
+ } else {
+ __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
+ }
+
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_read(rreq, subreq);
+ if (subreq == to) {
+ subreq_superfluous = false;
+ break;
+ }
+ }
+
+ /* If we managed to use fewer subreqs, we can discard the
+ * excess; if we used the same number, then we're done.
+ */
+ if (!len) {
+ if (!subreq_superfluous)
+ continue;
+ list_for_each_entry_safe_from(subreq, tmp,
+ &stream->subrequests, rreq_link) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ if (subreq == to)
+ break;
+ }
+ continue;
+ }
+
+ /* We ran out of subrequests, so we need to allocate some more
+ * and insert them after.
+ */
+ do {
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq) {
+ subreq = to;
+ goto abandon_after;
+ }
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->start = start;
+ subreq->len = len;
+ subreq->stream_nr = stream->stream_nr;
+ subreq->retry_count = 1;
+
+ trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+
+ list_add(&subreq->rreq_link, &to->rreq_link);
+ to = list_next_entry(to, rreq_link);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+
+ stream->sreq_max_len = umin(len, rreq->rsize);
+ stream->sreq_max_segs = 0;
+ if (unlikely(stream->sreq_max_segs))
+ part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs);
+
+ netfs_stat(&netfs_n_rh_download);
+ if (rreq->netfs_ops->prepare_read(subreq) < 0) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed);
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ goto abandon;
+ }
+
+ part = umin(len, stream->sreq_max_len);
+ subreq->len = subreq->transferred + part;
+ subreq->io_iter = source;
+ iov_iter_truncate(&subreq->io_iter, part);
+ iov_iter_advance(&source, part);
+
+ len -= part;
+ start += part;
+ if (!len && boundary) {
+ __set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
+ boundary = false;
+ }
+
+ netfs_reissue_read(rreq, subreq);
+ } while (len);
+
+ } while (!list_is_head(next, &stream->subrequests));
+
+ return;
+
+ /* If we hit an error, fail all remaining incomplete subrequests */
+abandon_after:
+ if (list_is_last(&subreq->rreq_link, &stream->subrequests))
+ return;
+ subreq = list_next_entry(subreq, rreq_link);
+abandon:
+ list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
+ if (!subreq->error &&
+ !test_bit(NETFS_SREQ_FAILED, &subreq->flags) &&
+ !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
+ continue;
+ subreq->error = -ENOMEM;
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ }
+}
+
+/*
+ * Retry reads.
+ */
+void netfs_retry_reads(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ DEFINE_WAIT(myself);
+
+ netfs_stat(&netfs_n_rh_retry_read_req);
+
+ set_bit(NETFS_RREQ_RETRYING, &rreq->flags);
+
+ /* Wait for all outstanding I/O to quiesce before performing retries as
+ * we may need to renegotiate the I/O sizes.
+ */
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags))
+ continue;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
+ for (;;) {
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags))
+ break;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for);
+ schedule();
+ trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
+ }
+
+ finish_wait(&rreq->waitq, &myself);
+ }
+ clear_bit(NETFS_RREQ_RETRYING, &rreq->flags);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
+ netfs_retry_read_subrequests(rreq);
+}
+
+/*
+ * Unlock any the pages that haven't been unlocked yet due to abandoned
+ * subrequests.
+ */
+void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq)
+{
+ struct folio_queue *p;
+
+ for (p = rreq->buffer.tail; p; p = p->next) {
+ for (int slot = 0; slot < folioq_count(p); slot++) {
+ struct folio *folio = folioq_folio(p, slot);
+
+ if (folio && !folioq_is_marked2(p, slot)) {
+ trace_netfs_folio(folio, netfs_folio_trace_abandon);
+ folio_unlock(folio);
+ }
+ }
+ }
+}
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
new file mode 100644
index 000000000000..fea0ecdecc53
--- /dev/null
+++ b/fs/netfs/read_single.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Single, monolithic object support (e.g. AFS directory).
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_single_mark_inode_dirty - Mark a single, monolithic object inode dirty
+ * @inode: The inode to mark
+ *
+ * Mark an inode that contains a single, monolithic object as dirty so that its
+ * writepages op will get called. If set, the SINGLE_NO_UPLOAD flag indicates
+ * that the object will only be written to the cache and not uploaded (e.g. AFS
+ * directory contents).
+ */
+void netfs_single_mark_inode_dirty(struct inode *inode)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ bool cache_only = test_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &ictx->flags);
+ bool caching = fscache_cookie_enabled(netfs_i_cookie(netfs_inode(inode)));
+
+ if (cache_only && !caching)
+ return;
+
+ mark_inode_dirty(inode);
+
+ if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) {
+ bool need_use = false;
+
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ inode->i_state |= I_PINNING_NETFS_WB;
+ need_use = true;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (need_use)
+ fscache_use_cookie(netfs_i_cookie(ictx), true);
+ }
+
+}
+EXPORT_SYMBOL(netfs_single_mark_inode_dirty);
+
+static int netfs_single_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
+{
+ return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
+}
+
+static void netfs_single_cache_prepare_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ if (!cres->ops) {
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ return;
+ }
+ subreq->source = cres->ops->prepare_read(subreq, rreq->i_size);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+}
+
+static void netfs_single_read_cache(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ _enter("R=%08x[%x]", rreq->debug_id, subreq->debug_index);
+ netfs_stat(&netfs_n_rh_read);
+ cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_FAIL,
+ netfs_cache_read_terminated, subreq);
+}
+
+/*
+ * Perform a read to a buffer from the cache or the server. Only a single
+ * subreq is permitted as the object must be fetched in a single transaction.
+ */
+static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
+{
+ struct netfs_io_stream *stream = &rreq->io_streams[0];
+ struct netfs_io_subrequest *subreq;
+ int ret = 0;
+
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq)
+ return -ENOMEM;
+
+ subreq->source = NETFS_SOURCE_UNKNOWN;
+ subreq->start = 0;
+ subreq->len = rreq->len;
+ subreq->io_iter = rreq->buffer.iter;
+
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ spin_lock(&rreq->lock);
+ list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_added);
+ stream->front = subreq;
+ /* Store list pointers before active flag */
+ smp_store_release(&stream->active, true);
+ spin_unlock(&rreq->lock);
+
+ netfs_single_cache_prepare_read(rreq, subreq);
+ switch (subreq->source) {
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_stat(&netfs_n_rh_download);
+ if (rreq->netfs_ops->prepare_read) {
+ ret = rreq->netfs_ops->prepare_read(subreq);
+ if (ret < 0)
+ goto cancel;
+ }
+
+ rreq->netfs_ops->issue_read(subreq);
+ rreq->submitted += subreq->len;
+ break;
+ case NETFS_READ_FROM_CACHE:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ netfs_single_read_cache(rreq, subreq);
+ rreq->submitted += subreq->len;
+ ret = 0;
+ break;
+ default:
+ pr_warn("Unexpected single-read source %u\n", subreq->source);
+ WARN_ON_ONCE(true);
+ ret = -EIO;
+ break;
+ }
+
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ return ret;
+cancel:
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ return ret;
+}
+
+/**
+ * netfs_read_single - Synchronously read a single blob of pages.
+ * @inode: The inode to read from.
+ * @file: The file we're using to read or NULL.
+ * @iter: The buffer we're reading into.
+ *
+ * Fulfil a read request for a single monolithic object by drawing data from
+ * the cache if possible, or the netfs if not. The buffer may be larger than
+ * the file content; unused beyond the EOF will be zero-filled. The content
+ * will be read with a single I/O request (though this may be retried).
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled. If caching is enabled,
+ * the data will be stored as a single object into the cache.
+ */
+ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter)
+{
+ struct netfs_io_request *rreq;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ rreq = netfs_alloc_request(inode->i_mapping, file, 0, iov_iter_count(iter),
+ NETFS_READ_SINGLE);
+ if (IS_ERR(rreq))
+ return PTR_ERR(rreq);
+
+ ret = netfs_single_begin_cache_read(rreq, ictx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
+
+ netfs_stat(&netfs_n_rh_read_single);
+ trace_netfs_read(rreq, 0, rreq->len, netfs_read_trace_read_single);
+
+ rreq->buffer.iter = *iter;
+ netfs_single_dispatch_read(rreq);
+
+ ret = netfs_wait_for_read(rreq);
+ netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
+ return ret;
+
+cleanup_free:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_read_single);
diff --git a/fs/netfs/rolling_buffer.c b/fs/netfs/rolling_buffer.c
new file mode 100644
index 000000000000..207b6a326651
--- /dev/null
+++ b/fs/netfs/rolling_buffer.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Rolling buffer helpers
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/bitops.h>
+#include <linux/pagemap.h>
+#include <linux/rolling_buffer.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static atomic_t debug_ids;
+
+/**
+ * netfs_folioq_alloc - Allocate a folio_queue struct
+ * @rreq_id: Associated debugging ID for tracing purposes
+ * @gfp: Allocation constraints
+ * @trace: Trace tag to indicate the purpose of the allocation
+ *
+ * Allocate, initialise and account the folio_queue struct and log a trace line
+ * to mark the allocation.
+ */
+struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp,
+ unsigned int /*enum netfs_folioq_trace*/ trace)
+{
+ struct folio_queue *fq;
+
+ fq = kmalloc(sizeof(*fq), gfp);
+ if (fq) {
+ netfs_stat(&netfs_n_folioq);
+ folioq_init(fq, rreq_id);
+ fq->debug_id = atomic_inc_return(&debug_ids);
+ trace_netfs_folioq(fq, trace);
+ }
+ return fq;
+}
+EXPORT_SYMBOL(netfs_folioq_alloc);
+
+/**
+ * netfs_folioq_free - Free a folio_queue struct
+ * @folioq: The object to free
+ * @trace: Trace tag to indicate which free
+ *
+ * Free and unaccount the folio_queue struct.
+ */
+void netfs_folioq_free(struct folio_queue *folioq,
+ unsigned int /*enum netfs_trace_folioq*/ trace)
+{
+ trace_netfs_folioq(folioq, trace);
+ netfs_stat_d(&netfs_n_folioq);
+ kfree(folioq);
+}
+EXPORT_SYMBOL(netfs_folioq_free);
+
+/*
+ * Initialise a rolling buffer. We allocate an empty folio queue struct to so
+ * that the pointers can be independently driven by the producer and the
+ * consumer.
+ */
+int rolling_buffer_init(struct rolling_buffer *roll, unsigned int rreq_id,
+ unsigned int direction)
+{
+ struct folio_queue *fq;
+
+ fq = netfs_folioq_alloc(rreq_id, GFP_NOFS, netfs_trace_folioq_rollbuf_init);
+ if (!fq)
+ return -ENOMEM;
+
+ roll->head = fq;
+ roll->tail = fq;
+ iov_iter_folio_queue(&roll->iter, direction, fq, 0, 0, 0);
+ return 0;
+}
+
+/*
+ * Add another folio_queue to a rolling buffer if there's no space left.
+ */
+int rolling_buffer_make_space(struct rolling_buffer *roll)
+{
+ struct folio_queue *fq, *head = roll->head;
+
+ if (!folioq_full(head))
+ return 0;
+
+ fq = netfs_folioq_alloc(head->rreq_id, GFP_NOFS, netfs_trace_folioq_make_space);
+ if (!fq)
+ return -ENOMEM;
+ fq->prev = head;
+
+ roll->head = fq;
+ if (folioq_full(head)) {
+ /* Make sure we don't leave the master iterator pointing to a
+ * block that might get immediately consumed.
+ */
+ if (roll->iter.folioq == head &&
+ roll->iter.folioq_slot == folioq_nr_slots(head)) {
+ roll->iter.folioq = fq;
+ roll->iter.folioq_slot = 0;
+ }
+ }
+
+ /* Make sure the initialisation is stored before the next pointer.
+ *
+ * [!] NOTE: After we set head->next, the consumer is at liberty to
+ * immediately delete the old head.
+ */
+ smp_store_release(&head->next, fq);
+ return 0;
+}
+
+/*
+ * Decant the list of folios to read into a rolling buffer.
+ */
+ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll,
+ struct readahead_control *ractl,
+ struct folio_batch *put_batch)
+{
+ struct folio_queue *fq;
+ struct page **vec;
+ int nr, ix, to;
+ ssize_t size = 0;
+
+ if (rolling_buffer_make_space(roll) < 0)
+ return -ENOMEM;
+
+ fq = roll->head;
+ vec = (struct page **)fq->vec.folios;
+ nr = __readahead_batch(ractl, vec + folio_batch_count(&fq->vec),
+ folio_batch_space(&fq->vec));
+ ix = fq->vec.nr;
+ to = ix + nr;
+ fq->vec.nr = to;
+ for (; ix < to; ix++) {
+ struct folio *folio = folioq_folio(fq, ix);
+ unsigned int order = folio_order(folio);
+
+ fq->orders[ix] = order;
+ size += PAGE_SIZE << order;
+ trace_netfs_folio(folio, netfs_folio_trace_read);
+ if (!folio_batch_add(put_batch, folio))
+ folio_batch_release(put_batch);
+ }
+ WRITE_ONCE(roll->iter.count, roll->iter.count + size);
+
+ /* Store the counter after setting the slot. */
+ smp_store_release(&roll->next_head_slot, to);
+ return size;
+}
+
+/*
+ * Append a folio to the rolling buffer.
+ */
+ssize_t rolling_buffer_append(struct rolling_buffer *roll, struct folio *folio,
+ unsigned int flags)
+{
+ ssize_t size = folio_size(folio);
+ int slot;
+
+ if (rolling_buffer_make_space(roll) < 0)
+ return -ENOMEM;
+
+ slot = folioq_append(roll->head, folio);
+ if (flags & ROLLBUF_MARK_1)
+ folioq_mark(roll->head, slot);
+ if (flags & ROLLBUF_MARK_2)
+ folioq_mark2(roll->head, slot);
+
+ WRITE_ONCE(roll->iter.count, roll->iter.count + size);
+
+ /* Store the counter after setting the slot. */
+ smp_store_release(&roll->next_head_slot, slot);
+ return size;
+}
+
+/*
+ * Delete a spent buffer from a rolling queue and return the next in line. We
+ * don't return the last buffer to keep the pointers independent, but return
+ * NULL instead.
+ */
+struct folio_queue *rolling_buffer_delete_spent(struct rolling_buffer *roll)
+{
+ struct folio_queue *spent = roll->tail, *next = READ_ONCE(spent->next);
+
+ if (!next)
+ return NULL;
+ next->prev = NULL;
+ netfs_folioq_free(spent, netfs_trace_folioq_delete);
+ roll->tail = next;
+ return next;
+}
+
+/*
+ * Clear out a rolling queue. Folios that have mark 1 set are put.
+ */
+void rolling_buffer_clear(struct rolling_buffer *roll)
+{
+ struct folio_batch fbatch;
+ struct folio_queue *p;
+
+ folio_batch_init(&fbatch);
+
+ while ((p = roll->tail)) {
+ roll->tail = p->next;
+ for (int slot = 0; slot < folioq_count(p); slot++) {
+ struct folio *folio = folioq_folio(p, slot);
+
+ if (!folio)
+ continue;
+ if (folioq_is_marked(p, slot)) {
+ trace_netfs_folio(folio, netfs_folio_trace_put);
+ if (!folio_batch_add(&fbatch, folio))
+ folio_batch_release(&fbatch);
+ }
+ }
+
+ netfs_folioq_free(p, netfs_trace_folioq_clear);
+ }
+
+ folio_batch_release(&fbatch);
+}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index deeba9f9dcf5..ab6b916addc4 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -10,9 +10,9 @@
#include "internal.h"
atomic_t netfs_n_rh_dio_read;
-atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
-atomic_t netfs_n_rh_readpage;
+atomic_t netfs_n_rh_read_folio;
+atomic_t netfs_n_rh_read_single;
atomic_t netfs_n_rh_rreq;
atomic_t netfs_n_rh_sreq;
atomic_t netfs_n_rh_download;
@@ -29,6 +29,13 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_rh_retry_read_req;
+atomic_t netfs_n_rh_retry_read_subreq;
+atomic_t netfs_n_wh_buffered_write;
+atomic_t netfs_n_wh_writethrough;
+atomic_t netfs_n_wh_dio_write;
+atomic_t netfs_n_wh_writepages;
+atomic_t netfs_n_wh_copy_to_cache;
atomic_t netfs_n_wh_wstream_conflict;
atomic_t netfs_n_wh_upload;
atomic_t netfs_n_wh_upload_done;
@@ -36,41 +43,61 @@ atomic_t netfs_n_wh_upload_failed;
atomic_t netfs_n_wh_write;
atomic_t netfs_n_wh_write_done;
atomic_t netfs_n_wh_write_failed;
+atomic_t netfs_n_wh_retry_write_req;
+atomic_t netfs_n_wh_retry_write_subreq;
+atomic_t netfs_n_wb_lock_skip;
+atomic_t netfs_n_wb_lock_wait;
+atomic_t netfs_n_folioq;
int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ seq_printf(m, "Reads : DR=%u RA=%u RF=%u RS=%u WB=%u WBZ=%u\n",
atomic_read(&netfs_n_rh_dio_read),
- atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
- atomic_read(&netfs_n_rh_readpage),
+ atomic_read(&netfs_n_rh_read_folio),
+ atomic_read(&netfs_n_rh_read_single),
atomic_read(&netfs_n_rh_write_begin),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
+ seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n",
+ atomic_read(&netfs_n_wh_buffered_write),
+ atomic_read(&netfs_n_wh_writethrough),
+ atomic_read(&netfs_n_wh_dio_write),
+ atomic_read(&netfs_n_wh_writepages),
+ atomic_read(&netfs_n_wh_copy_to_cache));
+ seq_printf(m, "ZeroOps: ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n",
+ seq_printf(m, "DownOps: DL=%u ds=%u df=%u di=%u\n",
atomic_read(&netfs_n_rh_download),
atomic_read(&netfs_n_rh_download_done),
atomic_read(&netfs_n_rh_download_failed),
atomic_read(&netfs_n_rh_download_instead));
- seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n",
+ seq_printf(m, "CaRdOps: RD=%u rs=%u rf=%u\n",
atomic_read(&netfs_n_rh_read),
atomic_read(&netfs_n_rh_read_done),
atomic_read(&netfs_n_rh_read_failed));
- seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n",
+ seq_printf(m, "UpldOps: UL=%u us=%u uf=%u\n",
atomic_read(&netfs_n_wh_upload),
atomic_read(&netfs_n_wh_upload_done),
atomic_read(&netfs_n_wh_upload_failed));
- seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n",
+ seq_printf(m, "CaWrOps: WR=%u ws=%u wf=%u\n",
atomic_read(&netfs_n_wh_write),
atomic_read(&netfs_n_wh_write_done),
atomic_read(&netfs_n_wh_write_failed));
- seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n",
+ seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n",
+ atomic_read(&netfs_n_rh_retry_read_req),
+ atomic_read(&netfs_n_rh_retry_read_subreq),
+ atomic_read(&netfs_n_wh_retry_write_req),
+ atomic_read(&netfs_n_wh_retry_write_subreq));
+ seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n",
atomic_read(&netfs_n_rh_rreq),
atomic_read(&netfs_n_rh_sreq),
+ atomic_read(&netfs_n_folioq),
atomic_read(&netfs_n_wh_wstream_conflict));
+ seq_printf(m, "WbLock : skip=%u wait=%u\n",
+ atomic_read(&netfs_n_wb_lock_skip),
+ atomic_read(&netfs_n_wb_lock_wait));
return fscache_stats_show(m);
}
EXPORT_SYMBOL(netfs_stats_show);
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
new file mode 100644
index 000000000000..3fca59e6475d
--- /dev/null
+++ b/fs/netfs/write_collect.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem write subrequest result collection, assessment
+ * and retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/* Notes made in the collector */
+#define HIT_PENDING 0x01 /* A front op was still pending */
+#define NEED_REASSESS 0x02 /* Need to loop round and reassess */
+#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */
+#define NEED_UNLOCK 0x08 /* The pagecache needs unlocking */
+#define NEED_RETRY 0x10 /* A front op requests retrying */
+#define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */
+
+static void netfs_dump_request(const struct netfs_io_request *rreq)
+{
+ pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n",
+ rreq->debug_id, refcount_read(&rreq->ref), rreq->flags,
+ rreq->origin, rreq->error);
+ pr_err(" st=%llx tsl=%zx/%llx/%llx\n",
+ rreq->start, rreq->transferred, rreq->submitted, rreq->len);
+ pr_err(" cci=%llx/%llx/%llx\n",
+ rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to));
+ pr_err(" iw=%pSR\n", rreq->netfs_ops->issue_write);
+ for (int i = 0; i < NR_IO_STREAMS; i++) {
+ const struct netfs_io_subrequest *sreq;
+ const struct netfs_io_stream *s = &rreq->io_streams[i];
+
+ pr_err(" str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n",
+ s->stream_nr, s->source, s->error,
+ s->avail, s->active, s->need_retry, s->failed);
+ pr_err(" str[%x] ct=%llx t=%zx\n",
+ s->stream_nr, s->collected_to, s->transferred);
+ list_for_each_entry(sreq, &s->subrequests, rreq_link) {
+ pr_err(" sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n",
+ sreq->stream_nr, sreq->debug_index, sreq->source,
+ sreq->start, sreq->transferred, sreq->len,
+ refcount_read(&sreq->ref), sreq->flags);
+ }
+ }
+}
+
+/*
+ * Successful completion of write of a folio to the server and/or cache. Note
+ * that we are not allowed to lock the folio here on pain of deadlocking with
+ * truncate.
+ */
+int netfs_folio_written_back(struct folio *folio)
+{
+ enum netfs_folio_trace why = netfs_folio_trace_clear;
+ struct netfs_inode *ictx = netfs_inode(folio->mapping->host);
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ int gcount = 0;
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under writeback,
+ * so discard the streaming record.
+ */
+ unsigned long long fend;
+
+ fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len;
+ if (fend > ictx->zero_point)
+ ictx->zero_point = fend;
+
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ kfree(finfo);
+ why = netfs_folio_trace_clear_s;
+ goto end_wb;
+ }
+
+ if ((group = netfs_folio_group(folio))) {
+ if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+ why = netfs_folio_trace_clear_cc;
+ folio_detach_private(folio);
+ goto end_wb;
+ }
+
+ /* Need to detach the group pointer if the page didn't get
+ * redirtied. If it has been redirtied, then it must be within
+ * the same group.
+ */
+ why = netfs_folio_trace_redirtied;
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ why = netfs_folio_trace_clear_g;
+ }
+ }
+
+end_wb:
+ trace_netfs_folio(folio, why);
+ folio_end_writeback(folio);
+ return gcount;
+}
+
+/*
+ * Unlock any folios we've finished with.
+ */
+static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
+ unsigned int *notes)
+{
+ struct folio_queue *folioq = wreq->buffer.tail;
+ unsigned long long collected_to = wreq->collected_to;
+ unsigned int slot = wreq->buffer.first_tail_slot;
+
+ if (WARN_ON_ONCE(!folioq)) {
+ pr_err("[!] Writeback unlock found empty rolling buffer!\n");
+ netfs_dump_request(wreq);
+ return;
+ }
+
+ if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
+ if (netfs_pgpriv2_unlock_copied_folios(wreq))
+ *notes |= MADE_PROGRESS;
+ return;
+ }
+
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = rolling_buffer_delete_spent(&wreq->buffer);
+ if (!folioq)
+ return;
+ slot = 0;
+ }
+
+ for (;;) {
+ struct folio *folio;
+ struct netfs_folio *finfo;
+ unsigned long long fpos, fend;
+ size_t fsize, flen;
+
+ folio = folioq_folio(folioq, slot);
+ if (WARN_ONCE(!folio_test_writeback(folio),
+ "R=%08x: folio %lx is not under writeback\n",
+ wreq->debug_id, folio->index))
+ trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
+
+ fpos = folio_pos(folio);
+ fsize = folio_size(folio);
+ finfo = netfs_folio_info(folio);
+ flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;
+
+ fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
+
+ trace_netfs_collect_folio(wreq, folio, fend, collected_to);
+
+ /* Unlock any folio we've transferred all of. */
+ if (collected_to < fend)
+ break;
+
+ wreq->nr_group_rel += netfs_folio_written_back(folio);
+ wreq->cleaned_to = fpos + fsize;
+ *notes |= MADE_PROGRESS;
+
+ /* Clean up the head folioq. If we clear an entire folioq, then
+ * we can get rid of it provided it's not also the tail folioq
+ * being filled by the issuer.
+ */
+ folioq_clear(folioq, slot);
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = rolling_buffer_delete_spent(&wreq->buffer);
+ if (!folioq)
+ goto done;
+ slot = 0;
+ }
+
+ if (fpos + fsize >= collected_to)
+ break;
+ }
+
+ wreq->buffer.tail = folioq;
+done:
+ wreq->buffer.first_tail_slot = slot;
+}
+
+/*
+ * Collect and assess the results of various write subrequests. We may need to
+ * retry some of the results - or even do an RMW cycle for content crypto.
+ *
+ * Note that we have a number of parallel, overlapping lists of subrequests,
+ * one to the server and one to the local cache for example, which may not be
+ * the same size or starting position and may not even correspond in boundary
+ * alignment.
+ */
+static void netfs_collect_write_results(struct netfs_io_request *wreq)
+{
+ struct netfs_io_subrequest *front, *remove;
+ struct netfs_io_stream *stream;
+ unsigned long long collected_to, issued_to;
+ unsigned int notes;
+ int s;
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+ trace_netfs_collect(wreq);
+ trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
+
+reassess_streams:
+ issued_to = atomic64_read(&wreq->issued_to);
+ smp_rmb();
+ collected_to = ULLONG_MAX;
+ if (wreq->origin == NETFS_WRITEBACK ||
+ wreq->origin == NETFS_WRITETHROUGH ||
+ wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
+ notes = NEED_UNLOCK;
+ else
+ notes = 0;
+
+ /* Remove completed subrequests from the front of the streams and
+ * advance the completion point on each stream. We stop when we hit
+ * something that's in progress. The issuer thread may be adding stuff
+ * to the tail whilst we're doing this.
+ */
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ /* Read active flag before list pointers */
+ if (!smp_load_acquire(&stream->active))
+ continue;
+
+ front = stream->front;
+ while (front) {
+ trace_netfs_collect_sreq(wreq, front);
+ //_debug("sreq [%x] %llx %zx/%zx",
+ // front->debug_index, front->start, front->transferred, front->len);
+
+ if (stream->collected_to < front->start) {
+ trace_netfs_collect_gap(wreq, stream, issued_to, 'F');
+ stream->collected_to = front->start;
+ }
+
+ /* Stall if the front is still undergoing I/O. */
+ if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
+ notes |= HIT_PENDING;
+ break;
+ }
+ smp_rmb(); /* Read counters after I-P flag. */
+
+ if (stream->failed) {
+ stream->collected_to = front->start + front->len;
+ notes |= MADE_PROGRESS | SAW_FAILURE;
+ goto cancel;
+ }
+ if (front->start + front->transferred > stream->collected_to) {
+ stream->collected_to = front->start + front->transferred;
+ stream->transferred = stream->collected_to - wreq->start;
+ notes |= MADE_PROGRESS;
+ }
+ if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
+ stream->failed = true;
+ stream->error = front->error;
+ if (stream->source == NETFS_UPLOAD_TO_SERVER)
+ mapping_set_error(wreq->mapping, front->error);
+ notes |= NEED_REASSESS | SAW_FAILURE;
+ break;
+ }
+ if (front->transferred < front->len) {
+ stream->need_retry = true;
+ notes |= NEED_RETRY | MADE_PROGRESS;
+ break;
+ }
+
+ cancel:
+ /* Remove if completely consumed. */
+ spin_lock(&wreq->lock);
+
+ remove = front;
+ list_del_init(&front->rreq_link);
+ front = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ stream->front = front;
+ spin_unlock(&wreq->lock);
+ netfs_put_subrequest(remove, false,
+ notes & SAW_FAILURE ?
+ netfs_sreq_trace_put_cancel :
+ netfs_sreq_trace_put_done);
+ }
+
+ /* If we have an empty stream, we need to jump it forward
+ * otherwise the collection point will never advance.
+ */
+ if (!front && issued_to > stream->collected_to) {
+ trace_netfs_collect_gap(wreq, stream, issued_to, 'E');
+ stream->collected_to = issued_to;
+ }
+
+ if (stream->collected_to < collected_to)
+ collected_to = stream->collected_to;
+ }
+
+ if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
+ wreq->collected_to = collected_to;
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active)
+ trace_netfs_collect_stream(wreq, stream);
+ }
+
+ trace_netfs_collect_state(wreq, wreq->collected_to, notes);
+
+ /* Unlock any folios that we have now finished with. */
+ if (notes & NEED_UNLOCK) {
+ if (wreq->cleaned_to < wreq->collected_to)
+ netfs_writeback_unlock_folios(wreq, &notes);
+ } else {
+ wreq->cleaned_to = wreq->collected_to;
+ }
+
+ // TODO: Discard encryption buffers
+
+ if (notes & NEED_RETRY)
+ goto need_retry;
+ if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
+ clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
+ smp_mb__after_atomic(); /* Set PAUSE before task state */
+ wake_up(&wreq->waitq);
+ }
+
+ if (notes & NEED_REASSESS) {
+ //cond_resched();
+ goto reassess_streams;
+ }
+ if (notes & MADE_PROGRESS) {
+ //cond_resched();
+ goto reassess_streams;
+ }
+
+out:
+ netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+ wreq->nr_group_rel = 0;
+ _leave(" = %x", notes);
+ return;
+
+need_retry:
+ /* Okay... We're going to have to retry one or both streams. Note
+ * that any partially completed op will have had any wholly transferred
+ * folios removed from it.
+ */
+ _debug("retry");
+ netfs_retry_writes(wreq);
+ goto out;
+}
+
+/*
+ * Perform the collection of subrequests, folios and encryption buffers.
+ */
+void netfs_write_collection_worker(struct work_struct *work)
+{
+ struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ size_t transferred;
+ int s;
+
+ _enter("R=%x", wreq->debug_id);
+
+ netfs_see_request(wreq, netfs_rreq_trace_see_work);
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+
+ netfs_collect_write_results(wreq);
+
+ /* We're done when the app thread has finished posting subreqs and all
+ * the queues in all the streams are empty.
+ */
+ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+ smp_rmb(); /* Read ALL_QUEUED before lists. */
+
+ transferred = LONG_MAX;
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ struct netfs_io_stream *stream = &wreq->io_streams[s];
+ if (!stream->active)
+ continue;
+ if (!list_empty(&stream->subrequests)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+ if (stream->transferred < transferred)
+ transferred = stream->transferred;
+ }
+
+ /* Okay, declare that all I/O is complete. */
+ wreq->transferred = transferred;
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ if (wreq->io_streams[1].active &&
+ wreq->io_streams[1].failed &&
+ ictx->ops->invalidate_cache) {
+ /* Cache write failure doesn't prevent writeback completion
+ * unless we're in disconnected mode.
+ */
+ ictx->ops->invalidate_cache(wreq);
+ }
+
+ if (wreq->cleanup)
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ /* mmap may have got underfoot and we may now have folios
+ * locally covering the region we just wrote. Attempt to
+ * discard the folios, but leave in place any modified locally.
+ * ->write_iter() is prevented from interfering by the DIO
+ * counter.
+ */
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+
+ if (wreq->iocb) {
+ size_t written = min(wreq->transferred, wreq->len);
+ wreq->iocb->ki_pos += written;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : written);
+ wreq->iocb = VFS_PTR_POISON;
+ }
+
+ netfs_clear_subrequests(wreq, false);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
+}
+
+/*
+ * Wake the collection work item.
+ */
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
+{
+ if (!work_pending(&wreq->work)) {
+ netfs_get_request(wreq, netfs_rreq_trace_get_work);
+ if (!queue_work(system_unbound_wq, &wreq->work))
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
+ }
+}
+
+/**
+ * netfs_write_subrequest_terminated - Note the termination of a write operation.
+ * @_op: The I/O request that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the library that a contributory write I/O operation has
+ * terminated, one way or another, and that it should collect the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred or a
+ * negative error code. The library will look after reissuing I/O operations
+ * as appropriate and writing downloaded data to the cache.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ *
+ * When this is called, ownership of the subrequest is transferred back to the
+ * library, along with a ref.
+ *
+ * Note that %_op is a void* so that the function can be passed to
+ * kiocb::term_func without the need for a casting wrapper.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ if (subreq->error == -EAGAIN)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ else
+ set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
+
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ break;
+ default:
+ break;
+ }
+ trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
+ set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
+ } else {
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (subreq->transferred < subreq->len)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ /* If we are at the head of the queue, wake up the collector,
+ * transferring a ref to it if we were the ones to do so.
+ */
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests))
+ netfs_wake_write_collector(wreq, was_async);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
new file mode 100644
index 000000000000..77279fc5b5a7
--- /dev/null
+++ b/fs/netfs/write_issue.c
@@ -0,0 +1,927 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level (buffered) writeback.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ *
+ * To support network filesystems with local caching, we manage a situation
+ * that can be envisioned like the following:
+ *
+ * +---+---+-----+-----+---+----------+
+ * Folios: | | | | | | |
+ * +---+---+-----+-----+---+----------+
+ *
+ * +------+------+ +----+----+
+ * Upload: | | |.....| | |
+ * (Stream 0) +------+------+ +----+----+
+ *
+ * +------+------+------+------+------+
+ * Cache: | | | | | |
+ * (Stream 1) +------+------+------+------+------+
+ *
+ * Where we have a sequence of folios of varying sizes that we need to overlay
+ * with multiple parallel streams of I/O requests, where the I/O requests in a
+ * stream may also be of various sizes (in cifs, for example, the sizes are
+ * negotiated with the server; in something like ceph, they may represent the
+ * sizes of storage objects).
+ *
+ * The sequence in each stream may contain gaps and noncontiguous subrequests
+ * may be glued together into single vectored write RPCs.
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include "internal.h"
+
+/*
+ * Kill all dirty folios in the event of an unrecoverable error, starting with
+ * a locked folio we've already obtained from writeback_iter().
+ */
+static void netfs_kill_dirty_pages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct folio *folio)
+{
+ int error = 0;
+
+ do {
+ enum netfs_folio_trace why = netfs_folio_trace_kill;
+ struct netfs_group *group = NULL;
+ struct netfs_folio *finfo = NULL;
+ void *priv;
+
+ priv = folio_detach_private(folio);
+ if (priv) {
+ finfo = __netfs_folio_info(priv);
+ if (finfo) {
+ /* Kill folio from streaming write. */
+ group = finfo->netfs_group;
+ why = netfs_folio_trace_kill_s;
+ } else {
+ group = priv;
+ if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+ /* Kill copy-to-cache folio */
+ why = netfs_folio_trace_kill_cc;
+ group = NULL;
+ } else {
+ /* Kill folio with group */
+ why = netfs_folio_trace_kill_g;
+ }
+ }
+ }
+
+ trace_netfs_folio(folio, why);
+
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
+
+ netfs_put_group(group);
+ kfree(finfo);
+
+ } while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+}
+
+/*
+ * Create a write request and set it up appropriately for the origin type.
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+ struct file *file,
+ loff_t start,
+ enum netfs_io_origin origin)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_inode *ictx;
+ bool is_cacheable = (origin == NETFS_WRITEBACK ||
+ origin == NETFS_WRITEBACK_SINGLE ||
+ origin == NETFS_WRITETHROUGH ||
+ origin == NETFS_PGPRIV2_COPY_TO_CACHE);
+
+ wreq = netfs_alloc_request(mapping, file, start, 0, origin);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ _enter("R=%x", wreq->debug_id);
+
+ ictx = netfs_inode(wreq->inode);
+ if (is_cacheable && netfs_is_cache_enabled(ictx))
+ fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
+ if (rolling_buffer_init(&wreq->buffer, wreq->debug_id, ITER_SOURCE) < 0)
+ goto nomem;
+
+ wreq->cleaned_to = wreq->start;
+
+ wreq->io_streams[0].stream_nr = 0;
+ wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER;
+ wreq->io_streams[0].prepare_write = ictx->ops->prepare_write;
+ wreq->io_streams[0].issue_write = ictx->ops->issue_write;
+ wreq->io_streams[0].collected_to = start;
+ wreq->io_streams[0].transferred = LONG_MAX;
+
+ wreq->io_streams[1].stream_nr = 1;
+ wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE;
+ wreq->io_streams[1].collected_to = start;
+ wreq->io_streams[1].transferred = LONG_MAX;
+ if (fscache_resources_valid(&wreq->cache_resources)) {
+ wreq->io_streams[1].avail = true;
+ wreq->io_streams[1].active = true;
+ wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
+ wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
+ }
+
+ return wreq;
+nomem:
+ wreq->error = -ENOMEM;
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_failed);
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * netfs_prepare_write_failed - Note write preparation failed
+ * @subreq: The subrequest to mark
+ *
+ * Mark a subrequest to note that preparation for write failed.
+ */
+void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
+{
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
+}
+EXPORT_SYMBOL(netfs_prepare_write_failed);
+
+/*
+ * Prepare a write subrequest. We need to allocate a new subrequest
+ * if we don't have one.
+ */
+static void netfs_prepare_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start)
+{
+ struct netfs_io_subrequest *subreq;
+ struct iov_iter *wreq_iter = &wreq->buffer.iter;
+
+ /* Make sure we don't point the iterator at a used-up folio_queue
+ * struct being used as a placeholder to prevent the queue from
+ * collapsing. In such a case, extend the queue.
+ */
+ if (iov_iter_is_folioq(wreq_iter) &&
+ wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq))
+ rolling_buffer_make_space(&wreq->buffer);
+
+ subreq = netfs_alloc_subrequest(wreq);
+ subreq->source = stream->source;
+ subreq->start = start;
+ subreq->stream_nr = stream->stream_nr;
+ subreq->io_iter = *wreq_iter;
+
+ _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ stream->sreq_max_len = UINT_MAX;
+ stream->sreq_max_segs = INT_MAX;
+ switch (stream->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ stream->sreq_max_len = wreq->wsize;
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (stream->prepare_write)
+ stream->prepare_write(subreq);
+
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ /* We add to the end of the list whilst the collector may be walking
+ * the list. The collector only goes nextwards and uses the lock to
+ * remove entries off of the front.
+ */
+ spin_lock(&wreq->lock);
+ list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+ stream->front = subreq;
+ if (!stream->active) {
+ stream->collected_to = stream->front->start;
+ /* Write list pointers before active flag */
+ smp_store_release(&stream->active, true);
+ }
+ }
+
+ spin_unlock(&wreq->lock);
+
+ stream->construct = subreq;
+}
+
+/*
+ * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
+ * operation. The operation may be asynchronous and should call
+ * netfs_write_subrequest_terminated() when complete.
+ */
+static void netfs_do_issue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+
+ _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
+
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ return netfs_write_subrequest_terminated(subreq, subreq->error, false);
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ stream->issue_write(subreq);
+}
+
+void netfs_reissue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *source)
+{
+ size_t size = subreq->len - subreq->transferred;
+
+ // TODO: Use encrypted buffer
+ subreq->io_iter = *source;
+ iov_iter_advance(source, size);
+ iov_iter_truncate(&subreq->io_iter, size);
+
+ subreq->retry_count++;
+ __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_stat(&netfs_n_wh_retry_write_subreq);
+ netfs_do_issue_write(stream, subreq);
+}
+
+void netfs_issue_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
+{
+ struct netfs_io_subrequest *subreq = stream->construct;
+
+ if (!subreq)
+ return;
+ stream->construct = NULL;
+ subreq->io_iter.count = subreq->len;
+ netfs_do_issue_write(stream, subreq);
+}
+
+/*
+ * Add data to the write subrequest, dispatching each as we fill it up or if it
+ * is discontiguous with the previous. We only fill one part at a time so that
+ * we can avoid overrunning the credits obtained (cifs) and try to parallelise
+ * content-crypto preparation with network writes.
+ */
+size_t netfs_advance_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start, size_t len, bool to_eof)
+{
+ struct netfs_io_subrequest *subreq = stream->construct;
+ size_t part;
+
+ if (!stream->avail) {
+ _leave("no write");
+ return len;
+ }
+
+ _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
+
+ if (subreq && start != subreq->start + subreq->len) {
+ netfs_issue_write(wreq, stream);
+ subreq = NULL;
+ }
+
+ if (!stream->construct)
+ netfs_prepare_write(wreq, stream, start);
+ subreq = stream->construct;
+
+ part = umin(stream->sreq_max_len - subreq->len, len);
+ _debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len);
+ subreq->len += part;
+ subreq->nr_segs++;
+ stream->submit_extendable_to -= part;
+
+ if (subreq->len >= stream->sreq_max_len ||
+ subreq->nr_segs >= stream->sreq_max_segs ||
+ to_eof) {
+ netfs_issue_write(wreq, stream);
+ subreq = NULL;
+ }
+
+ return part;
+}
+
+/*
+ * Write some of a pending folio data back to the server.
+ */
+static int netfs_write_folio(struct netfs_io_request *wreq,
+ struct writeback_control *wbc,
+ struct folio *folio)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ struct netfs_io_stream *cache = &wreq->io_streams[1];
+ struct netfs_io_stream *stream;
+ struct netfs_group *fgroup; /* TODO: Use this with ceph */
+ struct netfs_folio *finfo;
+ size_t iter_off = 0;
+ size_t fsize = folio_size(folio), flen = fsize, foff = 0;
+ loff_t fpos = folio_pos(folio), i_size;
+ bool to_eof = false, streamw = false;
+ bool debug = false;
+
+ _enter("");
+
+ if (rolling_buffer_make_space(&wreq->buffer) < 0)
+ return -ENOMEM;
+
+ /* netfs_perform_write() may shift i_size around the page or from out
+ * of the page to beyond it, but cannot move i_size into or through the
+ * page since we have it locked.
+ */
+ i_size = i_size_read(wreq->inode);
+
+ if (fpos >= i_size) {
+ /* mmap beyond eof. */
+ _debug("beyond eof");
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ wreq->nr_group_rel += netfs_folio_written_back(folio);
+ netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+ wreq->nr_group_rel = 0;
+ return 0;
+ }
+
+ if (fpos + fsize > wreq->i_size)
+ wreq->i_size = i_size;
+
+ fgroup = netfs_folio_group(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ foff = finfo->dirty_offset;
+ flen = foff + finfo->dirty_len;
+ streamw = true;
+ }
+
+ if (wreq->origin == NETFS_WRITETHROUGH) {
+ to_eof = false;
+ if (flen > i_size - fpos)
+ flen = i_size - fpos;
+ } else if (flen > i_size - fpos) {
+ flen = i_size - fpos;
+ if (!streamw)
+ folio_zero_segment(folio, flen, fsize);
+ to_eof = true;
+ } else if (flen == i_size - fpos) {
+ to_eof = true;
+ }
+ flen -= foff;
+
+ _debug("folio %zx %zx %zx", foff, flen, fsize);
+
+ /* Deal with discontinuities in the stream of dirty pages. These can
+ * arise from a number of sources:
+ *
+ * (1) Intervening non-dirty pages from random-access writes, multiple
+ * flushers writing back different parts simultaneously and manual
+ * syncing.
+ *
+ * (2) Partially-written pages from write-streaming.
+ *
+ * (3) Pages that belong to a different write-back group (eg. Ceph
+ * snapshots).
+ *
+ * (4) Actually-clean pages that were marked for write to the cache
+ * when they were read. Note that these appear as a special
+ * write-back group.
+ */
+ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+ netfs_issue_write(wreq, upload);
+ } else if (fgroup != wreq->group) {
+ /* We can't write this page to the server yet. */
+ kdebug("wrong group");
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ netfs_issue_write(wreq, upload);
+ netfs_issue_write(wreq, cache);
+ return 0;
+ }
+
+ if (foff > 0)
+ netfs_issue_write(wreq, upload);
+ if (streamw)
+ netfs_issue_write(wreq, cache);
+
+ /* Flip the page to the writeback state and unlock. If we're called
+ * from write-through, then the page has already been put into the wb
+ * state.
+ */
+ if (wreq->origin == NETFS_WRITEBACK)
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+
+ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+ if (!cache->avail) {
+ trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
+ netfs_issue_write(wreq, upload);
+ netfs_folio_written_back(folio);
+ return 0;
+ }
+ trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+ } else if (!upload->avail && !cache->avail) {
+ trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
+ netfs_folio_written_back(folio);
+ return 0;
+ } else if (!upload->construct) {
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+ }
+
+ /* Attach the folio to the rolling buffer. */
+ rolling_buffer_append(&wreq->buffer, folio, 0);
+
+ /* Move the submission point forward to allow for write-streaming data
+ * not starting at the front of the page. We don't do write-streaming
+ * with the cache as the cache requires DIO alignment.
+ *
+ * Also skip uploading for data that's been read and just needs copying
+ * to the cache.
+ */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ stream->submit_off = foff;
+ stream->submit_len = flen;
+ if (!stream->avail ||
+ (stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
+ (stream->source == NETFS_UPLOAD_TO_SERVER &&
+ fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
+ stream->submit_off = UINT_MAX;
+ stream->submit_len = 0;
+ }
+ }
+
+ /* Attach the folio to one or more subrequests. For a big folio, we
+ * could end up with thousands of subrequests if the wsize is small -
+ * but we might need to wait during the creation of subrequests for
+ * network resources (eg. SMB credits).
+ */
+ for (;;) {
+ ssize_t part;
+ size_t lowest_off = ULONG_MAX;
+ int choose_s = -1;
+
+ /* Always add to the lowest-submitted stream first. */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->submit_len > 0 &&
+ stream->submit_off < lowest_off) {
+ lowest_off = stream->submit_off;
+ choose_s = s;
+ }
+ }
+
+ if (choose_s < 0)
+ break;
+ stream = &wreq->io_streams[choose_s];
+
+ /* Advance the iterator(s). */
+ if (stream->submit_off > iter_off) {
+ rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off);
+ iter_off = stream->submit_off;
+ }
+
+ atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
+ stream->submit_extendable_to = fsize - stream->submit_off;
+ part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
+ stream->submit_len, to_eof);
+ stream->submit_off += part;
+ if (part > stream->submit_len)
+ stream->submit_len = 0;
+ else
+ stream->submit_len -= part;
+ if (part > 0)
+ debug = true;
+ }
+
+ if (fsize > iter_off)
+ rolling_buffer_advance(&wreq->buffer, fsize - iter_off);
+ atomic64_set(&wreq->issued_to, fpos + fsize);
+
+ if (!debug)
+ kdebug("R=%x: No submit", wreq->debug_id);
+
+ if (foff + flen < fsize)
+ for (int s = 0; s < NR_IO_STREAMS; s++)
+ netfs_issue_write(wreq, &wreq->io_streams[s]);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * End the issuing of writes, letting the collector know we're done.
+ */
+static void netfs_end_issue_write(struct netfs_io_request *wreq)
+{
+ bool needs_poke = true;
+
+ smp_wmb(); /* Write subreq lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ struct netfs_io_stream *stream = &wreq->io_streams[s];
+
+ if (!stream->active)
+ continue;
+ if (!list_empty(&stream->subrequests))
+ needs_poke = false;
+ netfs_issue_write(wreq, stream);
+ }
+
+ if (needs_poke)
+ netfs_wake_write_collector(wreq, false);
+}
+
+/*
+ * Write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_inode *ictx = netfs_inode(mapping->host);
+ struct netfs_io_request *wreq = NULL;
+ struct folio *folio;
+ int error = 0;
+
+ if (!mutex_trylock(&ictx->wb_lock)) {
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ netfs_stat(&netfs_n_wb_lock_skip);
+ return 0;
+ }
+ netfs_stat(&netfs_n_wb_lock_wait);
+ mutex_lock(&ictx->wb_lock);
+ }
+
+ /* Need the first folio to be able to set up the op. */
+ folio = writeback_iter(mapping, wbc, NULL, &error);
+ if (!folio)
+ goto out;
+
+ wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ error = PTR_ERR(wreq);
+ goto couldnt_start;
+ }
+
+ trace_netfs_write(wreq, netfs_write_trace_writeback);
+ netfs_stat(&netfs_n_wh_writepages);
+
+ do {
+ _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));
+
+ /* It appears we don't have to handle cyclic writeback wrapping. */
+ WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to));
+
+ if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
+ unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
+ set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ wreq->netfs_ops->begin_writeback(wreq);
+ }
+
+ error = netfs_write_folio(wreq, wbc, folio);
+ if (error < 0)
+ break;
+ } while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+
+ netfs_end_issue_write(wreq);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = %d", error);
+ return error;
+
+couldnt_start:
+ netfs_kill_dirty_pages(mapping, wbc, folio);
+out:
+ mutex_unlock(&ictx->wb_lock);
+ _leave(" = %d", error);
+ return error;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));
+
+ mutex_lock(&ictx->wb_lock);
+
+ wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq)) {
+ mutex_unlock(&ictx->wb_lock);
+ return wreq;
+ }
+
+ wreq->io_streams[0].avail = true;
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+ return wreq;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *folio, size_t copied, bool to_page_end,
+ struct folio **writethrough_cache)
+{
+ _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
+ wreq->debug_id, wreq->buffer.iter.count, wreq->wsize, copied, to_page_end);
+
+ if (!*writethrough_cache) {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+
+ /* We can make multiple writes to the folio... */
+ folio_start_writeback(folio);
+ if (wreq->len == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ *writethrough_cache = folio;
+ }
+
+ wreq->len += copied;
+ if (!to_page_end)
+ return 0;
+
+ *writethrough_cache = NULL;
+ return netfs_write_folio(wreq, wbc, folio);
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ int ret;
+
+ _enter("R=%x", wreq->debug_id);
+
+ if (writethrough_cache)
+ netfs_write_folio(wreq, wbc, writethrough_cache);
+
+ netfs_end_issue_write(wreq);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ if (wreq->iocb) {
+ ret = -EIOCBQUEUED;
+ } else {
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
+ ret = wreq->error;
+ }
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache.
+ */
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ ssize_t part;
+ loff_t start = wreq->start;
+ int error = 0;
+
+ _enter("%zx", len);
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ while (len) {
+ // TODO: Prepare content encryption
+
+ _debug("unbuffered %zx", len);
+ part = netfs_advance_write(wreq, upload, start, len, false);
+ start += part;
+ len -= part;
+ rolling_buffer_advance(&wreq->buffer, part);
+ if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
+ wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags));
+ }
+ if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
+ break;
+ }
+
+ netfs_end_issue_write(wreq);
+ _leave(" = %d", error);
+ return error;
+}
+
+/*
+ * Write some of a pending folio data back to the server and/or the cache.
+ */
+static int netfs_write_folio_single(struct netfs_io_request *wreq,
+ struct folio *folio)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ struct netfs_io_stream *cache = &wreq->io_streams[1];
+ struct netfs_io_stream *stream;
+ size_t iter_off = 0;
+ size_t fsize = folio_size(folio), flen;
+ loff_t fpos = folio_pos(folio);
+ bool to_eof = false;
+ bool no_debug = false;
+
+ _enter("");
+
+ flen = folio_size(folio);
+ if (flen > wreq->i_size - fpos) {
+ flen = wreq->i_size - fpos;
+ folio_zero_segment(folio, flen, fsize);
+ to_eof = true;
+ } else if (flen == wreq->i_size - fpos) {
+ to_eof = true;
+ }
+
+ _debug("folio %zx/%zx", flen, fsize);
+
+ if (!upload->avail && !cache->avail) {
+ trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
+ return 0;
+ }
+
+ if (!upload->construct)
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+
+ /* Attach the folio to the rolling buffer. */
+ folio_get(folio);
+ rolling_buffer_append(&wreq->buffer, folio, NETFS_ROLLBUF_PUT_MARK);
+
+ /* Move the submission point forward to allow for write-streaming data
+ * not starting at the front of the page. We don't do write-streaming
+ * with the cache as the cache requires DIO alignment.
+ *
+ * Also skip uploading for data that's been read and just needs copying
+ * to the cache.
+ */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ stream->submit_off = 0;
+ stream->submit_len = flen;
+ if (!stream->avail) {
+ stream->submit_off = UINT_MAX;
+ stream->submit_len = 0;
+ }
+ }
+
+ /* Attach the folio to one or more subrequests. For a big folio, we
+ * could end up with thousands of subrequests if the wsize is small -
+ * but we might need to wait during the creation of subrequests for
+ * network resources (eg. SMB credits).
+ */
+ for (;;) {
+ ssize_t part;
+ size_t lowest_off = ULONG_MAX;
+ int choose_s = -1;
+
+ /* Always add to the lowest-submitted stream first. */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->submit_len > 0 &&
+ stream->submit_off < lowest_off) {
+ lowest_off = stream->submit_off;
+ choose_s = s;
+ }
+ }
+
+ if (choose_s < 0)
+ break;
+ stream = &wreq->io_streams[choose_s];
+
+ /* Advance the iterator(s). */
+ if (stream->submit_off > iter_off) {
+ rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off);
+ iter_off = stream->submit_off;
+ }
+
+ atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
+ stream->submit_extendable_to = fsize - stream->submit_off;
+ part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
+ stream->submit_len, to_eof);
+ stream->submit_off += part;
+ if (part > stream->submit_len)
+ stream->submit_len = 0;
+ else
+ stream->submit_len -= part;
+ if (part > 0)
+ no_debug = true;
+ }
+
+ wreq->buffer.iter.iov_offset = 0;
+ if (fsize > iter_off)
+ rolling_buffer_advance(&wreq->buffer, fsize - iter_off);
+ atomic64_set(&wreq->issued_to, fpos + fsize);
+
+ if (!no_debug)
+ kdebug("R=%x: No submit", wreq->debug_id);
+ _leave(" = 0");
+ return 0;
+}
+
+/**
+ * netfs_writeback_single - Write back a monolithic payload
+ * @mapping: The mapping to write from
+ * @wbc: Hints from the VM
+ * @iter: Data to write, must be ITER_FOLIOQ.
+ *
+ * Write a monolithic, non-pagecache object back to the server and/or
+ * the cache.
+ */
+int netfs_writeback_single(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct iov_iter *iter)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_inode *ictx = netfs_inode(mapping->host);
+ struct folio_queue *fq;
+ size_t size = iov_iter_count(iter);
+ int ret;
+
+ if (WARN_ON_ONCE(!iov_iter_is_folioq(iter)))
+ return -EIO;
+
+ if (!mutex_trylock(&ictx->wb_lock)) {
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ netfs_stat(&netfs_n_wb_lock_skip);
+ return 0;
+ }
+ netfs_stat(&netfs_n_wb_lock_wait);
+ mutex_lock(&ictx->wb_lock);
+ }
+
+ wreq = netfs_create_write_req(mapping, NULL, 0, NETFS_WRITEBACK_SINGLE);
+ if (IS_ERR(wreq)) {
+ ret = PTR_ERR(wreq);
+ goto couldnt_start;
+ }
+
+ trace_netfs_write(wreq, netfs_write_trace_writeback);
+ netfs_stat(&netfs_n_wh_writepages);
+
+ if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ wreq->netfs_ops->begin_writeback(wreq);
+
+ for (fq = (struct folio_queue *)iter->folioq; fq; fq = fq->next) {
+ for (int slot = 0; slot < folioq_count(fq); slot++) {
+ struct folio *folio = folioq_folio(fq, slot);
+ size_t part = umin(folioq_folio_size(fq, slot), size);
+
+ _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));
+
+ ret = netfs_write_folio_single(wreq, folio);
+ if (ret < 0)
+ goto stop;
+ size -= part;
+ if (size <= 0)
+ goto stop;
+ }
+ }
+
+stop:
+ for (int s = 0; s < NR_IO_STREAMS; s++)
+ netfs_issue_write(wreq, &wreq->io_streams[s]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = %d", ret);
+ return ret;
+
+couldnt_start:
+ mutex_unlock(&ictx->wb_lock);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_writeback_single);
diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c
new file mode 100644
index 000000000000..545d33079a77
--- /dev/null
+++ b/fs/netfs/write_retry.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem write retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Perform retries on the streams that need it.
+ */
+static void netfs_retry_write_stream(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
+{
+ struct list_head *next;
+
+ _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
+
+ if (list_empty(&stream->subrequests))
+ return;
+
+ if (stream->source == NETFS_UPLOAD_TO_SERVER &&
+ wreq->netfs_ops->retry_request)
+ wreq->netfs_ops->retry_request(wreq, stream);
+
+ if (unlikely(stream->failed))
+ return;
+
+ /* If there's no renegotiation to do, just resend each failed subreq. */
+ if (!stream->prepare_write) {
+ struct netfs_io_subrequest *subreq;
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ break;
+ if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+ struct iov_iter source = subreq->io_iter;
+
+ iov_iter_revert(&source, subreq->len - source.count);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_write(stream, subreq, &source);
+ }
+ }
+ return;
+ }
+
+ next = stream->subrequests.next;
+
+ do {
+ struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
+ struct iov_iter source;
+ unsigned long long start, len;
+ size_t part;
+ bool boundary = false;
+
+ /* Go through the stream and find the next span of contiguous
+ * data that we then rejig (cifs, for example, needs the wsize
+ * renegotiating) and reissue.
+ */
+ from = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ to = from;
+ start = from->start + from->transferred;
+ len = from->len - from->transferred;
+
+ if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
+ return;
+
+ list_for_each_continue(next, &stream->subrequests) {
+ subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ if (subreq->start + subreq->transferred != start + len ||
+ test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
+ break;
+ to = subreq;
+ len += to->len;
+ }
+
+ /* Determine the set of buffers we're going to use. Each
+ * subreq gets a subset of a single overall contiguous buffer.
+ */
+ netfs_reset_iter(from);
+ source = from->io_iter;
+ source.count = len;
+
+ /* Work through the sublist. */
+ subreq = from;
+ list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
+ if (!len)
+ break;
+
+ subreq->start = start;
+ subreq->len = len;
+ __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ subreq->retry_count++;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+
+ /* Renegotiate max_len (wsize) */
+ stream->sreq_max_len = len;
+ stream->prepare_write(subreq);
+
+ part = umin(len, stream->sreq_max_len);
+ if (unlikely(stream->sreq_max_segs))
+ part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs);
+ subreq->len = part;
+ subreq->transferred = 0;
+ len -= part;
+ start += part;
+ if (len && subreq == to &&
+ __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags))
+ boundary = true;
+
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_write(stream, subreq, &source);
+ if (subreq == to)
+ break;
+ }
+
+ /* If we managed to use fewer subreqs, we can discard the
+ * excess; if we used the same number, then we're done.
+ */
+ if (!len) {
+ if (subreq == to)
+ continue;
+ list_for_each_entry_safe_from(subreq, tmp,
+ &stream->subrequests, rreq_link) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ if (subreq == to)
+ break;
+ }
+ continue;
+ }
+
+ /* We ran out of subrequests, so we need to allocate some more
+ * and insert them after.
+ */
+ do {
+ subreq = netfs_alloc_subrequest(wreq);
+ subreq->source = to->source;
+ subreq->start = start;
+ subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
+ subreq->stream_nr = to->stream_nr;
+ subreq->retry_count = 1;
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+
+ list_add(&subreq->rreq_link, &to->rreq_link);
+ to = list_next_entry(to, rreq_link);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+
+ stream->sreq_max_len = len;
+ stream->sreq_max_segs = INT_MAX;
+ switch (stream->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ stream->sreq_max_len = umin(len, wreq->wsize);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ stream->prepare_write(subreq);
+
+ part = umin(len, stream->sreq_max_len);
+ subreq->len = subreq->transferred + part;
+ len -= part;
+ start += part;
+ if (!len && boundary) {
+ __set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
+ boundary = false;
+ }
+
+ netfs_reissue_write(stream, subreq, &source);
+ if (!len)
+ break;
+
+ } while (len);
+
+ } while (!list_is_head(next, &stream->subrequests));
+}
+
+/*
+ * Perform retries on the streams that need it. If we're doing content
+ * encryption and the server copy changed due to a third-party write, we may
+ * need to do an RMW cycle and also rewrite the data to the cache.
+ */
+void netfs_retry_writes(struct netfs_io_request *wreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream;
+ int s;
+
+ netfs_stat(&netfs_n_wh_retry_write_req);
+
+ /* Wait for all outstanding I/O to quiesce before performing retries as
+ * we may need to renegotiate the I/O sizes.
+ */
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (!stream->active)
+ continue;
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ }
+ }
+
+ // TODO: Enc: Fetch changed partial pages
+ // TODO: Enc: Reencrypt content if needed.
+ // TODO: Enc: Wind back transferred point.
+ // TODO: Enc: Mark cache pages for retry.
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->need_retry) {
+ stream->need_retry = false;
+ netfs_retry_write_stream(wreq, stream);
+ }
+ }
+}