summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/caching/cachefiles.rst178
-rw-r--r--fs/cachefiles/Kconfig12
-rw-r--r--fs/cachefiles/Makefile1
-rw-r--r--fs/cachefiles/daemon.c117
-rw-r--r--fs/cachefiles/interface.c2
-rw-r--r--fs/cachefiles/internal.h78
-rw-r--r--fs/cachefiles/io.c76
-rw-r--r--fs/cachefiles/namei.c16
-rw-r--r--fs/cachefiles/ondemand.c503
-rw-r--r--fs/erofs/Kconfig10
-rw-r--r--fs/erofs/Makefile1
-rw-r--r--fs/erofs/data.c26
-rw-r--r--fs/erofs/decompressor.c7
-rw-r--r--fs/erofs/erofs_fs.h50
-rw-r--r--fs/erofs/fscache.c521
-rw-r--r--fs/erofs/inode.c11
-rw-r--r--fs/erofs/internal.h76
-rw-r--r--fs/erofs/namei.c5
-rw-r--r--fs/erofs/super.c221
-rw-r--r--fs/erofs/sysfs.c4
-rw-r--r--include/linux/fscache.h1
-rw-r--r--include/linux/netfs.h1
-rw-r--r--include/trace/events/cachefiles.h176
-rw-r--r--include/uapi/linux/cachefiles.h68
24 files changed, 1997 insertions, 164 deletions
diff --git a/Documentation/filesystems/caching/cachefiles.rst b/Documentation/filesystems/caching/cachefiles.rst
index 8bf396b76359..fc7abf712315 100644
--- a/Documentation/filesystems/caching/cachefiles.rst
+++ b/Documentation/filesystems/caching/cachefiles.rst
@@ -28,6 +28,7 @@ Cache on Already Mounted Filesystem
(*) Debugging.
+ (*) On-demand Read.
Overview
@@ -482,3 +483,180 @@ the control file. For example::
echo $((1|4|8)) >/sys/module/cachefiles/parameters/debug
will turn on all function entry debugging.
+
+
+On-demand Read
+==============
+
+When working in its original mode, CacheFiles serves as a local cache for a
+remote networking fs - while in on-demand read mode, CacheFiles can boost the
+scenario where on-demand read semantics are needed, e.g. container image
+distribution.
+
+The essential difference between these two modes is seen when a cache miss
+occurs: In the original mode, the netfs will fetch the data from the remote
+server and then write it to the cache file; in on-demand read mode, fetching
+the data and writing it into the cache is delegated to a user daemon.
+
+``CONFIG_CACHEFILES_ONDEMAND`` should be enabled to support on-demand read mode.
+
+
+Protocol Communication
+----------------------
+
+The on-demand read mode uses a simple protocol for communication between kernel
+and user daemon. The protocol can be modeled as::
+
+ kernel --[request]--> user daemon --[reply]--> kernel
+
+CacheFiles will send requests to the user daemon when needed. The user daemon
+should poll the devnode ('/dev/cachefiles') to check if there's a pending
+request to be processed. A POLLIN event will be returned when there's a pending
+request.
+
+The user daemon then reads the devnode to fetch a request to process. It should
+be noted that each read only gets one request. When it has finished processing
+the request, the user daemon should write the reply to the devnode.
+
+Each request starts with a message header of the form::
+
+ struct cachefiles_msg {
+ __u32 msg_id;
+ __u32 opcode;
+ __u32 len;
+ __u32 object_id;
+ __u8 data[];
+ };
+
+where:
+
+ * ``msg_id`` is a unique ID identifying this request among all pending
+ requests.
+
+ * ``opcode`` indicates the type of this request.
+
+ * ``object_id`` is a unique ID identifying the cache file operated on.
+
+ * ``data`` indicates the payload of this request.
+
+ * ``len`` indicates the whole length of this request, including the
+ header and following type-specific payload.
+
+
+Turning on On-demand Mode
+-------------------------
+
+An optional parameter becomes available to the "bind" command::
+
+ bind [ondemand]
+
+When the "bind" command is given no argument, it defaults to the original mode.
+When it is given the "ondemand" argument, i.e. "bind ondemand", on-demand read
+mode will be enabled.
+
+
+The OPEN Request
+----------------
+
+When the netfs opens a cache file for the first time, a request with the
+CACHEFILES_OP_OPEN opcode, a.k.a an OPEN request will be sent to the user
+daemon. The payload format is of the form::
+
+ struct cachefiles_open {
+ __u32 volume_key_size;
+ __u32 cookie_key_size;
+ __u32 fd;
+ __u32 flags;
+ __u8 data[];
+ };
+
+where:
+
+ * ``data`` contains the volume_key followed directly by the cookie_key.
+ The volume key is a NUL-terminated string; the cookie key is binary
+ data.
+
+ * ``volume_key_size`` indicates the size of the volume key in bytes.
+
+ * ``cookie_key_size`` indicates the size of the cookie key in bytes.
+
+ * ``fd`` indicates an anonymous fd referring to the cache file, through
+ which the user daemon can perform write/llseek file operations on the
+ cache file.
+
+
+The user daemon can use the given (volume_key, cookie_key) pair to distinguish
+the requested cache file. With the given anonymous fd, the user daemon can
+fetch the data and write it to the cache file in the background, even when
+kernel has not triggered a cache miss yet.
+
+Be noted that each cache file has a unique object_id, while it may have multiple
+anonymous fds. The user daemon may duplicate anonymous fds from the initial
+anonymous fd indicated by the @fd field through dup(). Thus each object_id can
+be mapped to multiple anonymous fds, while the usr daemon itself needs to
+maintain the mapping.
+
+When implementing a user daemon, please be careful of RLIMIT_NOFILE,
+``/proc/sys/fs/nr_open`` and ``/proc/sys/fs/file-max``. Typically these needn't
+be huge since they're related to the number of open device blobs rather than
+open files of each individual filesystem.
+
+The user daemon should reply the OPEN request by issuing a "copen" (complete
+open) command on the devnode::
+
+ copen <msg_id>,<cache_size>
+
+where:
+
+ * ``msg_id`` must match the msg_id field of the OPEN request.
+
+ * When >= 0, ``cache_size`` indicates the size of the cache file;
+ when < 0, ``cache_size`` indicates any error code encountered by the
+ user daemon.
+
+
+The CLOSE Request
+-----------------
+
+When a cookie withdrawn, a CLOSE request (opcode CACHEFILES_OP_CLOSE) will be
+sent to the user daemon. This tells the user daemon to close all anonymous fds
+associated with the given object_id. The CLOSE request has no extra payload,
+and shouldn't be replied.
+
+
+The READ Request
+----------------
+
+When a cache miss is encountered in on-demand read mode, CacheFiles will send a
+READ request (opcode CACHEFILES_OP_READ) to the user daemon. This tells the user
+daemon to fetch the contents of the requested file range. The payload is of the
+form::
+
+ struct cachefiles_read {
+ __u64 off;
+ __u64 len;
+ };
+
+where:
+
+ * ``off`` indicates the starting offset of the requested file range.
+
+ * ``len`` indicates the length of the requested file range.
+
+
+When it receives a READ request, the user daemon should fetch the requested data
+and write it to the cache file identified by object_id.
+
+When it has finished processing the READ request, the user daemon should reply
+by using the CACHEFILES_IOC_READ_COMPLETE ioctl on one of the anonymous fds
+associated with the object_id given in the READ request. The ioctl is of the
+form::
+
+ ioctl(fd, CACHEFILES_IOC_READ_COMPLETE, msg_id);
+
+where:
+
+ * ``fd`` is one of the anonymous fds associated with the object_id
+ given.
+
+ * ``msg_id`` must match the msg_id field of the READ request.
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 719faeeda168..8df715640a48 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -26,3 +26,15 @@ config CACHEFILES_ERROR_INJECTION
help
This permits error injection to be enabled in cachefiles whilst a
cache is in service.
+
+config CACHEFILES_ONDEMAND
+ bool "Support for on-demand read"
+ depends on CACHEFILES
+ default n
+ help
+ This permits userspace to enable the cachefiles on-demand read mode.
+ In this mode, when a cache miss occurs, responsibility for fetching
+ the data lies with the cachefiles backend instead of with the netfs
+ and is delegated to userspace.
+
+ If unsure, say N.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 16d811f1a2fa..c37a7a9af10b 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -16,5 +16,6 @@ cachefiles-y := \
xattr.o
cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o
+cachefiles-$(CONFIG_CACHEFILES_ONDEMAND) += ondemand.o
obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 7ac04ee2c0a0..aa4efcabb5e3 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -75,6 +75,9 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
{ "inuse", cachefiles_daemon_inuse },
{ "secctx", cachefiles_daemon_secctx },
{ "tag", cachefiles_daemon_tag },
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+ { "copen", cachefiles_ondemand_copen },
+#endif
{ "", NULL }
};
@@ -108,6 +111,9 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
INIT_LIST_HEAD(&cache->volumes);
INIT_LIST_HEAD(&cache->object_list);
spin_lock_init(&cache->object_list_lock);
+ refcount_set(&cache->unbind_pincount, 1);
+ xa_init_flags(&cache->reqs, XA_FLAGS_ALLOC);
+ xa_init_flags(&cache->ondemand_ids, XA_FLAGS_ALLOC1);
/* set default caching limits
* - limit at 1% free space and/or free files
@@ -126,6 +132,53 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
return 0;
}
+static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
+{
+ struct xarray *xa = &cache->reqs;
+ struct cachefiles_req *req;
+ unsigned long index;
+
+ /*
+ * Make sure the following two operations won't be reordered.
+ * 1) set CACHEFILES_DEAD bit
+ * 2) flush requests in the xarray
+ * Otherwise the request may be enqueued after xarray has been
+ * flushed, leaving the orphan request never being completed.
+ *
+ * CPU 1 CPU 2
+ * ===== =====
+ * flush requests in the xarray
+ * test CACHEFILES_DEAD bit
+ * enqueue the request
+ * set CACHEFILES_DEAD bit
+ */
+ smp_mb();
+
+ xa_lock(xa);
+ xa_for_each(xa, index, req) {
+ req->error = -EIO;
+ complete(&req->done);
+ }
+ xa_unlock(xa);
+
+ xa_destroy(&cache->reqs);
+ xa_destroy(&cache->ondemand_ids);
+}
+
+void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache)
+{
+ if (refcount_dec_and_test(&cache->unbind_pincount)) {
+ cachefiles_daemon_unbind(cache);
+ cachefiles_open = 0;
+ kfree(cache);
+ }
+}
+
+void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache)
+{
+ refcount_inc(&cache->unbind_pincount);
+}
+
/*
* Release a cache.
*/
@@ -139,36 +192,27 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file)
set_bit(CACHEFILES_DEAD, &cache->flags);
- cachefiles_daemon_unbind(cache);
+ if (cachefiles_in_ondemand_mode(cache))
+ cachefiles_flush_reqs(cache);
/* clean up the control file interface */
cache->cachefilesd = NULL;
file->private_data = NULL;
- cachefiles_open = 0;
- kfree(cache);
+ cachefiles_put_unbind_pincount(cache);
_leave("");
return 0;
}
-/*
- * Read the cache state.
- */
-static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
- size_t buflen, loff_t *pos)
+static ssize_t cachefiles_do_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
{
- struct cachefiles_cache *cache = file->private_data;
unsigned long long b_released;
unsigned f_released;
char buffer[256];
int n;
- //_enter(",,%zu,", buflen);
-
- if (!test_bit(CACHEFILES_READY, &cache->flags))
- return 0;
-
/* check how much space the cache has */
cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check);
@@ -207,6 +251,25 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
}
/*
+ * Read the cache state.
+ */
+static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
+ size_t buflen, loff_t *pos)
+{
+ struct cachefiles_cache *cache = file->private_data;
+
+ //_enter(",,%zu,", buflen);
+
+ if (!test_bit(CACHEFILES_READY, &cache->flags))
+ return 0;
+
+ if (cachefiles_in_ondemand_mode(cache))
+ return cachefiles_ondemand_daemon_read(cache, _buffer, buflen);
+ else
+ return cachefiles_do_daemon_read(cache, _buffer, buflen);
+}
+
+/*
* Take a command from cachefilesd, parse it and act on it.
*/
static ssize_t cachefiles_daemon_write(struct file *file,
@@ -297,8 +360,13 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
poll_wait(file, &cache->daemon_pollwq, poll);
mask = 0;
- if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
- mask |= EPOLLIN;
+ if (cachefiles_in_ondemand_mode(cache)) {
+ if (!xa_empty(&cache->reqs))
+ mask |= EPOLLIN;
+ } else {
+ if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
+ mask |= EPOLLIN;
+ }
if (test_bit(CACHEFILES_CULLING, &cache->flags))
mask |= EPOLLOUT;
@@ -687,11 +755,6 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
cache->brun_percent >= 100)
return -ERANGE;
- if (*args) {
- pr_err("'bind' command doesn't take an argument\n");
- return -EINVAL;
- }
-
if (!cache->rootdirname) {
pr_err("No cache directory specified\n");
return -EINVAL;
@@ -703,6 +766,18 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
return -EBUSY;
}
+ if (IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND)) {
+ if (!strcmp(args, "ondemand")) {
+ set_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags);
+ } else if (*args) {
+ pr_err("Invalid argument to the 'bind' command\n");
+ return -EINVAL;
+ }
+ } else if (*args) {
+ pr_err("'bind' command doesn't take an argument\n");
+ return -EINVAL;
+ }
+
/* Make sure we have copies of the tag string */
if (!cache->tag) {
/*
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index ae93cee9d25d..a69073a1d3f0 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -362,6 +362,8 @@ static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie)
spin_unlock(&cache->object_list_lock);
}
+ cachefiles_ondemand_clean_object(object);
+
if (object->file) {
cachefiles_begin_secure(cache, &saved_cred);
cachefiles_clean_up_object(object, cache);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index c793d33b0224..6cba2c6de2f9 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -15,6 +15,8 @@
#include <linux/fscache-cache.h>
#include <linux/cred.h>
#include <linux/security.h>
+#include <linux/xarray.h>
+#include <linux/cachefiles.h>
#define CACHEFILES_DIO_BLOCK_SIZE 4096
@@ -58,8 +60,13 @@ struct cachefiles_object {
enum cachefiles_content content_info:8; /* Info about content presence */
unsigned long flags;
#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+ int ondemand_id;
+#endif
};
+#define CACHEFILES_ONDEMAND_ID_CLOSED -1
+
/*
* Cache files cache definition
*/
@@ -98,11 +105,31 @@ struct cachefiles_cache {
#define CACHEFILES_DEAD 1 /* T if cache dead */
#define CACHEFILES_CULLING 2 /* T if cull engaged */
#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
+#define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */
char *rootdirname; /* name of cache root directory */
char *secctx; /* LSM security context */
char *tag; /* cache binding tag */
+ refcount_t unbind_pincount;/* refcount to do daemon unbind */
+ struct xarray reqs; /* xarray of pending on-demand requests */
+ struct xarray ondemand_ids; /* xarray for ondemand_id allocation */
+ u32 ondemand_id_next;
+};
+
+static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
+{
+ return IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) &&
+ test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags);
+}
+
+struct cachefiles_req {
+ struct cachefiles_object *object;
+ struct completion done;
+ int error;
+ struct cachefiles_msg msg;
};
+#define CACHEFILES_REQ_NEW XA_MARK_1
+
#include <trace/events/cachefiles.h>
static inline
@@ -145,6 +172,8 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
* daemon.c
*/
extern const struct file_operations cachefiles_daemon_fops;
+extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache);
+extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache);
/*
* error_inject.c
@@ -201,6 +230,16 @@ extern void cachefiles_put_object(struct cachefiles_object *object,
*/
extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
enum fscache_want_state want_state);
+extern int __cachefiles_prepare_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t *_start, size_t *_len,
+ bool no_space_allocated_yet);
+extern int __cachefiles_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv);
/*
* key.c
@@ -241,6 +280,45 @@ extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
struct cachefiles_object *object);
/*
+ * ondemand.c
+ */
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen);
+
+extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache,
+ char *args);
+
+extern int cachefiles_ondemand_init_object(struct cachefiles_object *object);
+extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object);
+
+extern int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len);
+
+#else
+static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int cachefiles_ondemand_init_object(struct cachefiles_object *object)
+{
+ return 0;
+}
+
+static inline void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+{
+}
+
+static inline int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+/*
* security.c
*/
extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 9dc81e781f2b..000a28f46e59 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -277,36 +277,33 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
/*
* Initiate a write to the cache.
*/
-static int cachefiles_write(struct netfs_cache_resources *cres,
- loff_t start_pos,
- struct iov_iter *iter,
- netfs_io_terminated_t term_func,
- void *term_func_priv)
+int __cachefiles_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
{
- struct cachefiles_object *object;
struct cachefiles_cache *cache;
struct cachefiles_kiocb *ki;
struct inode *inode;
- struct file *file;
unsigned int old_nofs;
- ssize_t ret = -ENOBUFS;
+ ssize_t ret;
size_t len = iov_iter_count(iter);
- if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
- goto presubmission_error;
fscache_count_write();
- object = cachefiles_cres_object(cres);
cache = object->volume->cache;
- file = cachefiles_cres_file(cres);
_enter("%pD,%li,%llx,%zx/%llx",
file, file_inode(file)->i_ino, start_pos, len,
i_size_read(file_inode(file)));
- ret = -ENOMEM;
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
- if (!ki)
- goto presubmission_error;
+ if (!ki) {
+ if (term_func)
+ term_func(term_func_priv, -ENOMEM, false);
+ return -ENOMEM;
+ }
refcount_set(&ki->ki_refcnt, 2);
ki->iocb.ki_filp = file;
@@ -314,7 +311,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE;
ki->iocb.ki_ioprio = get_current_ioprio();
ki->object = object;
- ki->inval_counter = cres->inval_counter;
ki->start = start_pos;
ki->len = len;
ki->term_func = term_func;
@@ -369,11 +365,24 @@ in_progress:
cachefiles_put_kiocb(ki);
_leave(" = %zd", ret);
return ret;
+}
-presubmission_error:
- if (term_func)
- term_func(term_func_priv, ret, false);
- return ret;
+static int cachefiles_write(struct netfs_cache_resources *cres,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
+{
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) {
+ if (term_func)
+ term_func(term_func_priv, -ENOBUFS, false);
+ return -ENOBUFS;
+ }
+
+ return __cachefiles_write(cachefiles_cres_object(cres),
+ cachefiles_cres_file(cres),
+ start_pos, iter,
+ term_func, term_func_priv);
}
/*
@@ -394,6 +403,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
loff_t off, to;
ino_t ino = file ? file_inode(file)->i_ino : 0;
+ int rc;
_enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
@@ -406,7 +416,8 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
why = cachefiles_trace_read_no_data;
- goto out_no_object;
+ if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags))
+ goto out_no_object;
}
/* The object and the file may be being created in the background. */
@@ -423,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
object = cachefiles_cres_object(cres);
cache = object->volume->cache;
cachefiles_begin_secure(cache, &saved_cred);
-
+retry:
off = cachefiles_inject_read_error();
if (off == 0)
off = vfs_llseek(file, subreq->start, SEEK_DATA);
@@ -474,6 +485,15 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
download_and_store:
__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) {
+ rc = cachefiles_ondemand_read(object, subreq->start,
+ subreq->len);
+ if (!rc) {
+ __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags);
+ goto retry;
+ }
+ ret = NETFS_INVALID_READ;
+ }
out:
cachefiles_end_secure(cache, saved_cred);
out_no_object:
@@ -484,13 +504,12 @@ out_no_object:
/*
* Prepare for a write to occur.
*/
-static int __cachefiles_prepare_write(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size,
- bool no_space_allocated_yet)
+int __cachefiles_prepare_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t *_start, size_t *_len,
+ bool no_space_allocated_yet)
{
- struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
- struct file *file = cachefiles_cres_file(cres);
loff_t start = *_start, pos;
size_t len = *_len, down;
int ret;
@@ -577,7 +596,8 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
}
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(cres, _start, _len, i_size,
+ ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
+ _start, _len,
no_space_allocated_yet);
cachefiles_end_secure(cache, saved_cred);
return ret;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca9f3e4ec4b3..facf2ebe464b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -452,10 +452,9 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash];
struct file *file;
struct path path;
- uint64_t ni_size = object->cookie->object_size;
+ uint64_t ni_size;
long ret;
- ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
cachefiles_begin_secure(cache, &saved_cred);
@@ -481,6 +480,15 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
goto out_dput;
}
+ ret = cachefiles_ondemand_init_object(object);
+ if (ret < 0) {
+ file = ERR_PTR(ret);
+ goto out_unuse;
+ }
+
+ ni_size = object->cookie->object_size;
+ ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
+
if (ni_size > 0) {
trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size,
cachefiles_trunc_expand_tmpfile);
@@ -586,6 +594,10 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
}
_debug("file -> %pd positive", dentry);
+ ret = cachefiles_ondemand_init_object(object);
+ if (ret < 0)
+ goto error_fput;
+
ret = cachefiles_check_auxdata(object, file);
if (ret < 0)
goto check_failed;
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
new file mode 100644
index 000000000000..a41ae6efc545
--- /dev/null
+++ b/fs/cachefiles/ondemand.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/fdtable.h>
+#include <linux/anon_inodes.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static int cachefiles_ondemand_fd_release(struct inode *inode,
+ struct file *file)
+{
+ struct cachefiles_object *object = file->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ int object_id = object->ondemand_id;
+ struct cachefiles_req *req;
+ XA_STATE(xas, &cache->reqs, 0);
+
+ xa_lock(&cache->reqs);
+ object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
+
+ /*
+ * Flush all pending READ requests since their completion depends on
+ * anon_fd.
+ */
+ xas_for_each(&xas, req, ULONG_MAX) {
+ if (req->msg.opcode == CACHEFILES_OP_READ) {
+ req->error = -EIO;
+ complete(&req->done);
+ xas_store(&xas, NULL);
+ }
+ }
+ xa_unlock(&cache->reqs);
+
+ xa_erase(&cache->ondemand_ids, object_id);
+ trace_cachefiles_ondemand_fd_release(object, object_id);
+ cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
+ cachefiles_put_unbind_pincount(cache);
+ return 0;
+}
+
+static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
+ struct iov_iter *iter)
+{
+ struct cachefiles_object *object = kiocb->ki_filp->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct file *file = object->file;
+ size_t len = iter->count;
+ loff_t pos = kiocb->ki_pos;
+ const struct cred *saved_cred;
+ int ret;
+
+ if (!file)
+ return -ENOBUFS;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = __cachefiles_prepare_write(object, file, &pos, &len, true);
+ cachefiles_end_secure(cache, saved_cred);
+ if (ret < 0)
+ return ret;
+
+ trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
+ ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
+ if (!ret)
+ ret = len;
+
+ return ret;
+}
+
+static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
+ int whence)
+{
+ struct cachefiles_object *object = filp->private_data;
+ struct file *file = object->file;
+
+ if (!file)
+ return -ENOBUFS;
+
+ return vfs_llseek(file, pos, whence);
+}
+
+static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct cachefiles_object *object = filp->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct cachefiles_req *req;
+ unsigned long id;
+
+ if (ioctl != CACHEFILES_IOC_READ_COMPLETE)
+ return -EINVAL;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ id = arg;
+ req = xa_erase(&cache->reqs, id);
+ if (!req)
+ return -EINVAL;
+
+ trace_cachefiles_ondemand_cread(object, id);
+ complete(&req->done);
+ return 0;
+}
+
+static const struct file_operations cachefiles_ondemand_fd_fops = {
+ .owner = THIS_MODULE,
+ .release = cachefiles_ondemand_fd_release,
+ .write_iter = cachefiles_ondemand_fd_write_iter,
+ .llseek = cachefiles_ondemand_fd_llseek,
+ .unlocked_ioctl = cachefiles_ondemand_fd_ioctl,
+};
+
+/*
+ * OPEN request Completion (copen)
+ * - command: "copen <id>,<cache_size>"
+ * <cache_size> indicates the object size if >=0, error code if negative
+ */
+int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
+{
+ struct cachefiles_req *req;
+ struct fscache_cookie *cookie;
+ char *pid, *psize;
+ unsigned long id;
+ long size;
+ int ret;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ if (!*args) {
+ pr_err("Empty id specified\n");
+ return -EINVAL;
+ }
+
+ pid = args;
+ psize = strchr(args, ',');
+ if (!psize) {
+ pr_err("Cache size is not specified\n");
+ return -EINVAL;
+ }
+
+ *psize = 0;
+ psize++;
+
+ ret = kstrtoul(pid, 0, &id);
+ if (ret)
+ return ret;
+
+ req = xa_erase(&cache->reqs, id);
+ if (!req)
+ return -EINVAL;
+
+ /* fail OPEN request if copen format is invalid */
+ ret = kstrtol(psize, 0, &size);
+ if (ret) {
+ req->error = ret;
+ goto out;
+ }
+
+ /* fail OPEN request if daemon reports an error */
+ if (size < 0) {
+ if (!IS_ERR_VALUE(size))
+ size = -EINVAL;
+ req->error = size;
+ goto out;
+ }
+
+ cookie = req->object->cookie;
+ cookie->object_size = size;
+ if (size)
+ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ else
+ set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ trace_cachefiles_ondemand_copen(req->object, id, size);
+
+out:
+ complete(&req->done);
+ return ret;
+}
+
+static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct cachefiles_open *load;
+ struct file *file;
+ u32 object_id;
+ int ret, fd;
+
+ object = cachefiles_grab_object(req->object,
+ cachefiles_obj_get_ondemand_fd);
+ cache = object->volume->cache;
+
+ ret = xa_alloc_cyclic(&cache->ondemand_ids, &object_id, NULL,
+ XA_LIMIT(1, INT_MAX),
+ &cache->ondemand_id_next, GFP_KERNEL);
+ if (ret < 0)
+ goto err;
+
+ fd = get_unused_fd_flags(O_WRONLY);
+ if (fd < 0) {
+ ret = fd;
+ goto err_free_id;
+ }
+
+ file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops,
+ object, O_WRONLY);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto err_put_fd;
+ }
+
+ file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
+ fd_install(fd, file);
+
+ load = (void *)req->msg.data;
+ load->fd = fd;
+ req->msg.object_id = object_id;
+ object->ondemand_id = object_id;
+
+ cachefiles_get_unbind_pincount(cache);
+ trace_cachefiles_ondemand_open(object, &req->msg, load);
+ return 0;
+
+err_put_fd:
+ put_unused_fd(fd);
+err_free_id:
+ xa_erase(&cache->ondemand_ids, object_id);
+err:
+ cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
+ return ret;
+}
+
+ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
+{
+ struct cachefiles_req *req;
+ struct cachefiles_msg *msg;
+ unsigned long id = 0;
+ size_t n;
+ int ret = 0;
+ XA_STATE(xas, &cache->reqs, 0);
+
+ /*
+ * Search for a request that has not ever been processed, to prevent
+ * requests from being processed repeatedly.
+ */
+ xa_lock(&cache->reqs);
+ req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW);
+ if (!req) {
+ xa_unlock(&cache->reqs);
+ return 0;
+ }
+
+ msg = &req->msg;
+ n = msg->len;
+
+ if (n > buflen) {
+ xa_unlock(&cache->reqs);
+ return -EMSGSIZE;
+ }
+
+ xas_clear_mark(&xas, CACHEFILES_REQ_NEW);
+ xa_unlock(&cache->reqs);
+
+ id = xas.xa_index;
+ msg->msg_id = id;
+
+ if (msg->opcode == CACHEFILES_OP_OPEN) {
+ ret = cachefiles_ondemand_get_fd(req);
+ if (ret)
+ goto error;
+ }
+
+ if (copy_to_user(_buffer, msg, n) != 0) {
+ ret = -EFAULT;
+ goto err_put_fd;
+ }
+
+ /* CLOSE request has no reply */
+ if (msg->opcode == CACHEFILES_OP_CLOSE) {
+ xa_erase(&cache->reqs, id);
+ complete(&req->done);
+ }
+
+ return n;
+
+err_put_fd:
+ if (msg->opcode == CACHEFILES_OP_OPEN)
+ close_fd(((struct cachefiles_open *)msg->data)->fd);
+error:
+ xa_erase(&cache->reqs, id);
+ req->error = ret;
+ complete(&req->done);
+ return ret;
+}
+
+typedef int (*init_req_fn)(struct cachefiles_req *req, void *private);
+
+static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
+ enum cachefiles_opcode opcode,
+ size_t data_len,
+ init_req_fn init_req,
+ void *private)
+{
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct cachefiles_req *req;
+ XA_STATE(xas, &cache->reqs, 0);
+ int ret;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return 0;
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags))
+ return -EIO;
+
+ req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->object = object;
+ init_completion(&req->done);
+ req->msg.opcode = opcode;
+ req->msg.len = sizeof(struct cachefiles_msg) + data_len;
+
+ ret = init_req(req, private);
+ if (ret)
+ goto out;
+
+ do {
+ /*
+ * Stop enqueuing the request when daemon is dying. The
+ * following two operations need to be atomic as a whole.
+ * 1) check cache state, and
+ * 2) enqueue request if cache is alive.
+ * Otherwise the request may be enqueued after xarray has been
+ * flushed, leaving the orphan request never being completed.
+ *
+ * CPU 1 CPU 2
+ * ===== =====
+ * test CACHEFILES_DEAD bit
+ * set CACHEFILES_DEAD bit
+ * flush requests in the xarray
+ * enqueue the request
+ */
+ xas_lock(&xas);
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ xas_unlock(&xas);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* coupled with the barrier in cachefiles_flush_reqs() */
+ smp_mb();
+
+ if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) {
+ WARN_ON_ONCE(object->ondemand_id == 0);
+ xas_unlock(&xas);
+ ret = -EIO;
+ goto out;
+ }
+
+ xas.xa_index = 0;
+ xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART)
+ xas_set_err(&xas, -EBUSY);
+ xas_store(&xas, req);
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ ret = xas_error(&xas);
+ if (ret)
+ goto out;
+
+ wake_up_all(&cache->daemon_pollwq);
+ wait_for_completion(&req->done);
+ ret = req->error;
+out:
+ kfree(req);
+ return ret;
+}
+
+static int cachefiles_ondemand_init_open_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_volume *volume = object->volume->vcookie;
+ struct cachefiles_open *load = (void *)req->msg.data;
+ size_t volume_key_size, cookie_key_size;
+ void *volume_key, *cookie_key;
+
+ /*
+ * Volume key is a NUL-terminated string. key[0] stores strlen() of the
+ * string, followed by the content of the string (excluding '\0').
+ */
+ volume_key_size = volume->key[0] + 1;
+ volume_key = volume->key + 1;
+
+ /* Cookie key is binary data, which is netfs specific. */
+ cookie_key_size = cookie->key_len;
+ cookie_key = fscache_get_key(cookie);
+
+ if (!(object->cookie->advice & FSCACHE_ADV_WANT_CACHE_SIZE)) {
+ pr_err("WANT_CACHE_SIZE is needed for on-demand mode\n");
+ return -EINVAL;
+ }
+
+ load->volume_key_size = volume_key_size;
+ load->cookie_key_size = cookie_key_size;
+ memcpy(load->data, volume_key, volume_key_size);
+ memcpy(load->data + volume_key_size, cookie_key, cookie_key_size);
+
+ return 0;
+}
+
+static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ int object_id = object->ondemand_id;
+
+ /*
+ * It's possible that object id is still 0 if the cookie looking up
+ * phase failed before OPEN request has ever been sent. Also avoid
+ * sending CLOSE request for CACHEFILES_ONDEMAND_ID_CLOSED, which means
+ * anon_fd has already been closed.
+ */
+ if (object_id <= 0)
+ return -ENOENT;
+
+ req->msg.object_id = object_id;
+ trace_cachefiles_ondemand_close(object, &req->msg);
+ return 0;
+}
+
+struct cachefiles_read_ctx {
+ loff_t off;
+ size_t len;
+};
+
+static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ struct cachefiles_read *load = (void *)req->msg.data;
+ struct cachefiles_read_ctx *read_ctx = private;
+ int object_id = object->ondemand_id;
+
+ /* Stop enqueuing requests when daemon has closed anon_fd. */
+ if (object_id <= 0) {
+ WARN_ON_ONCE(object_id == 0);
+ pr_info_once("READ: anonymous fd closed prematurely.\n");
+ return -EIO;
+ }
+
+ req->msg.object_id = object_id;
+ load->off = read_ctx->off;
+ load->len = read_ctx->len;
+ trace_cachefiles_ondemand_read(object, &req->msg, load);
+ return 0;
+}
+
+int cachefiles_ondemand_init_object(struct cachefiles_object *object)
+{
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_volume *volume = object->volume->vcookie;
+ size_t volume_key_size, cookie_key_size, data_len;
+
+ /*
+ * CacheFiles will firstly check the cache file under the root cache
+ * directory. If the coherency check failed, it will fallback to
+ * creating a new tmpfile as the cache file. Reuse the previously
+ * allocated object ID if any.
+ */
+ if (object->ondemand_id > 0)
+ return 0;
+
+ volume_key_size = volume->key[0] + 1;
+ cookie_key_size = cookie->key_len;
+ data_len = sizeof(struct cachefiles_open) +
+ volume_key_size + cookie_key_size;
+
+ return cachefiles_ondemand_send_req(object, CACHEFILES_OP_OPEN,
+ data_len, cachefiles_ondemand_init_open_req, NULL);
+}
+
+void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+{
+ cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0,
+ cachefiles_ondemand_init_close_req, NULL);
+}
+
+int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len)
+{
+ struct cachefiles_read_ctx read_ctx = {pos, len};
+
+ return cachefiles_ondemand_send_req(object, CACHEFILES_OP_READ,
+ sizeof(struct cachefiles_read),
+ cachefiles_ondemand_init_read_req, &read_ctx);
+}
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index f57255ab88ed..85490370e0ca 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -98,3 +98,13 @@ config EROFS_FS_ZIP_LZMA
systems will be readable without selecting this option.
If unsure, say N.
+
+config EROFS_FS_ONDEMAND
+ bool "EROFS fscache-based on-demand read support"
+ depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
+ default n
+ help
+ This permits EROFS to use fscache-backed data blobs with on-demand
+ read support.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 8a3317e38e5a..99bbc597a3e9 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,3 +5,4 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 780db1e5f4b7..bb9c1fd48c19 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -6,6 +6,7 @@
*/
#include "internal.h"
#include <linux/prefetch.h>
+#include <linux/sched/mm.h>
#include <linux/dax.h>
#include <trace/events/erofs.h>
@@ -35,14 +36,20 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
erofs_off_t offset = blknr_to_addr(blkaddr);
pgoff_t index = offset >> PAGE_SHIFT;
struct page *page = buf->page;
+ struct folio *folio;
+ unsigned int nofs_flag;
if (!page || page->index != index) {
erofs_put_metabuf(buf);
- page = read_cache_page_gfp(mapping, index,
- mapping_gfp_constraint(mapping, ~__GFP_FS));
- if (IS_ERR(page))
- return page;
+
+ nofs_flag = memalloc_nofs_save();
+ folio = read_cache_folio(mapping, index, NULL, NULL);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(folio))
+ return folio;
+
/* should already be PageUptodate, no need to lock page */
+ page = folio_file_page(folio, index);
buf->page = page;
}
if (buf->kmap_type == EROFS_NO_KMAP) {
@@ -63,6 +70,10 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
+ if (erofs_is_fscache_mode(sb))
+ return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode,
+ blkaddr, type);
+
return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
}
@@ -110,8 +121,8 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
return 0;
}
-static int erofs_map_blocks(struct inode *inode,
- struct erofs_map_blocks *map, int flags)
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
{
struct super_block *sb = inode->i_sb;
struct erofs_inode *vi = EROFS_I(inode);
@@ -199,6 +210,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = sb->s_bdev;
map->m_daxdev = EROFS_SB(sb)->dax_dev;
map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
+ map->m_fscache = EROFS_SB(sb)->s_fscache;
if (map->m_deviceid) {
down_read(&devs->rwsem);
@@ -210,6 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
up_read(&devs->rwsem);
} else if (devs->extra_devices) {
down_read(&devs->rwsem);
@@ -227,6 +240,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
break;
}
}
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 3efa686c7644..6dca1900c733 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -46,8 +46,6 @@ int z_erofs_load_lz4_config(struct super_block *sb,
erofs_err(sb, "too large lz4 pclusterblks %u",
sbi->lz4.max_pclusterblks);
return -EINVAL;
- } else if (sbi->lz4.max_pclusterblks >= 2) {
- erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!");
}
} else {
distance = le16_to_cpu(dsb->u1.lz4_max_distance);
@@ -322,6 +320,7 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
PAGE_SIZE - rq->pageofs_out);
+ const unsigned int lefthalf = rq->outputsize - righthalf;
unsigned char *src, *dst;
if (nrpages_out > 2) {
@@ -344,10 +343,10 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
if (nrpages_out == 2) {
DBG_BUGON(!rq->out[1]);
if (rq->out[1] == *rq->in) {
- memmove(src, src + righthalf, rq->pageofs_out);
+ memmove(src, src + righthalf, lefthalf);
} else {
dst = kmap_atomic(rq->out[1]);
- memcpy(dst, src + righthalf, rq->pageofs_out);
+ memcpy(dst, src + righthalf, lefthalf);
kunmap_atomic(dst);
}
}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 1238ca104f09..2b48373f690b 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -37,12 +37,9 @@
#define EROFS_SB_EXTSLOT_SIZE 16
struct erofs_deviceslot {
- union {
- u8 uuid[16]; /* used for device manager later */
- u8 userdata[64]; /* digest(sha256), etc. */
- } u;
- __le32 blocks; /* total fs blocks of this device */
- __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 tag[64]; /* digest(sha256), etc. */
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
u8 reserved[56];
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
@@ -58,8 +55,8 @@ struct erofs_super_block {
__le16 root_nid; /* nid of root directory */
__le64 inos; /* total valid ino # (== f_files - f_favail) */
- __le64 build_time; /* inode v1 time derivation */
- __le32 build_time_nsec; /* inode v1 time derivation in nano scale */
+ __le64 build_time; /* compact inode time derivation */
+ __le32 build_time_nsec; /* compact inode time derivation in ns scale */
__le32 blocks; /* used for statfs */
__le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */
@@ -79,15 +76,15 @@ struct erofs_super_block {
/*
* erofs inode datalayout (i_format in on-disk inode):
- * 0 - inode plain without inline data A:
+ * 0 - uncompressed flat inode without tail-packing inline data:
* inode, [xattrs], ... | ... | no-holed data
- * 1 - inode VLE compression B (legacy):
- * inode, [xattrs], extents ... | ...
- * 2 - inode plain with inline data C:
- * inode, [xattrs], last_inline_data, ... | ... | no-holed data
- * 3 - inode compression D:
+ * 1 - compressed inode with non-compact indexes:
+ * inode, [xattrs], [map_header], extents ... | ...
+ * 2 - uncompressed flat inode with tail-packing inline data:
+ * inode, [xattrs], tailpacking data, ... | ... | no-holed data
+ * 3 - compressed inode with compact indexes:
* inode, [xattrs], map_header, extents ... | ...
- * 4 - inode chunk-based E:
+ * 4 - chunk-based inode with (optional) multi-device support:
* inode, [xattrs], chunk indexes ... | ...
* 5~7 - reserved
*/
@@ -106,7 +103,7 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
}
-/* bit definitions of inode i_advise */
+/* bit definitions of inode i_format */
#define EROFS_I_VERSION_BITS 1
#define EROFS_I_DATALAYOUT_BITS 3
@@ -140,8 +137,9 @@ struct erofs_inode_compact {
__le32 i_size;
__le32 i_reserved;
union {
- /* file total compressed blocks for data mapping 1 */
+ /* total compressed blocks for compressed inodes */
__le32 compressed_blocks;
+ /* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
@@ -156,9 +154,9 @@ struct erofs_inode_compact {
__le32 i_reserved2;
};
-/* 32 bytes on-disk inode */
+/* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT 0
-/* 64 bytes on-disk inode */
+/* 64-byte on-disk inode */
#define EROFS_INODE_LAYOUT_EXTENDED 1
/* 64-byte complete form of an ondisk inode */
@@ -171,8 +169,9 @@ struct erofs_inode_extended {
__le16 i_reserved;
__le64 i_size;
union {
- /* file total compressed blocks for data mapping 1 */
+ /* total compressed blocks for compressed inodes */
__le32 compressed_blocks;
+ /* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
@@ -365,17 +364,16 @@ enum {
struct z_erofs_vle_decompressed_index {
__le16 di_advise;
- /* where to decompress in the head cluster */
+ /* where to decompress in the head lcluster */
__le16 di_clusterofs;
union {
- /* for the head cluster */
+ /* for the HEAD lclusters */
__le32 blkaddr;
/*
- * for the rest clusters
- * eg. for 4k page-sized cluster, maximum 4K*64k = 256M)
- * [0] - pointing to the head cluster
- * [1] - pointing to the tail cluster
+ * for the NONHEAD lclusters
+ * [0] - distance to its HEAD lcluster
+ * [1] - distance to the next HEAD lcluster
*/
__le16 delta[2];
} di_u;
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
new file mode 100644
index 000000000000..7e4417167d0b
--- /dev/null
+++ b/fs/erofs/fscache.c
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022, Alibaba Cloud
+ */
+#include <linux/fscache.h>
+#include "internal.h"
+
+static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ struct netfs_io_request *rreq;
+
+ rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ if (!rreq)
+ return ERR_PTR(-ENOMEM);
+
+ rreq->start = start;
+ rreq->len = len;
+ rreq->mapping = mapping;
+ INIT_LIST_HEAD(&rreq->subrequests);
+ refcount_set(&rreq->ref, 1);
+ return rreq;
+}
+
+static void erofs_fscache_put_request(struct netfs_io_request *rreq)
+{
+ if (!refcount_dec_and_test(&rreq->ref))
+ return;
+ if (rreq->cache_resources.ops)
+ rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
+ kfree(rreq);
+}
+
+static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
+{
+ if (!refcount_dec_and_test(&subreq->ref))
+ return;
+ erofs_fscache_put_request(subreq->rreq);
+ kfree(subreq);
+}
+
+static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+
+ while (!list_empty(&rreq->subrequests)) {
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ erofs_fscache_put_subrequest(subreq);
+ }
+}
+
+static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct folio *folio;
+ unsigned int iopos = 0;
+ pgoff_t start_page = rreq->start / PAGE_SIZE;
+ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+ bool subreq_failed = false;
+
+ XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ subreq_failed = (subreq->error < 0);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_page) {
+ unsigned int pgpos =
+ (folio_index(folio) - start_page) * PAGE_SIZE;
+ unsigned int pgend = pgpos + folio_size(folio);
+ bool pg_failed = false;
+
+ for (;;) {
+ if (!subreq) {
+ pg_failed = true;
+ break;
+ }
+
+ pg_failed |= subreq_failed;
+ if (pgend < iopos + subreq->len)
+ break;
+
+ iopos += subreq->len;
+ if (!list_is_last(&subreq->rreq_link,
+ &rreq->subrequests)) {
+ subreq = list_next_entry(subreq, rreq_link);
+ subreq_failed = (subreq->error < 0);
+ } else {
+ subreq = NULL;
+ subreq_failed = false;
+ }
+ if (pgend == iopos)
+ break;
+ }
+
+ if (!pg_failed)
+ folio_mark_uptodate(folio);
+
+ folio_unlock(folio);
+ }
+ rcu_read_unlock();
+}
+
+static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
+{
+ erofs_fscache_rreq_unlock_folios(rreq);
+ erofs_fscache_clear_subrequests(rreq);
+ erofs_fscache_put_request(rreq);
+}
+
+static void erofc_fscache_subreq_complete(void *priv,
+ ssize_t transferred_or_error, bool was_async)
+{
+ struct netfs_io_subrequest *subreq = priv;
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ subreq->error = transferred_or_error;
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ erofs_fscache_rreq_complete(rreq);
+
+ erofs_fscache_put_subrequest(subreq);
+}
+
+/*
+ * Read data from fscache and fill the read data into page cache described by
+ * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes
+ * the start physical address in the cache file.
+ */
+static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
+ struct netfs_io_request *rreq, loff_t pstart)
+{
+ enum netfs_io_source source;
+ struct super_block *sb = rreq->mapping->host->i_sb;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct iov_iter iter;
+ loff_t start = rreq->start;
+ size_t len = rreq->len;
+ size_t done = 0;
+ int ret;
+
+ atomic_set(&rreq->nr_outstanding, 1);
+
+ ret = fscache_begin_read_operation(cres, cookie);
+ if (ret)
+ goto out;
+
+ while (done < len) {
+ subreq = kzalloc(sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
+ if (subreq) {
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ refcount_inc(&rreq->ref);
+ } else {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ subreq->start = pstart + done;
+ subreq->len = len - done;
+ subreq->flags = 1 << NETFS_SREQ_ONDEMAND;
+
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+
+ source = cres->ops->prepare_read(subreq, LLONG_MAX);
+ if (WARN_ON(subreq->len == 0))
+ source = NETFS_INVALID_READ;
+ if (source != NETFS_READ_FROM_CACHE) {
+ erofs_err(sb, "failed to fscache prepare_read (source %d)",
+ source);
+ ret = -EIO;
+ subreq->error = ret;
+ erofs_fscache_put_subrequest(subreq);
+ goto out;
+ }
+
+ atomic_inc(&rreq->nr_outstanding);
+
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ start + done, subreq->len);
+
+ ret = fscache_read(cres, subreq->start, &iter,
+ NETFS_READ_HOLE_FAIL,
+ erofc_fscache_subreq_complete, subreq);
+ if (ret == -EIOCBQUEUED)
+ ret = 0;
+ if (ret) {
+ erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ goto out;
+ }
+
+ done += subreq->len;
+ }
+out:
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ erofs_fscache_rreq_complete(rreq);
+
+ return ret;
+}
+
+static int erofs_fscache_meta_readpage(struct file *data, struct page *page)
+{
+ int ret;
+ struct folio *folio = page_folio(page);
+ struct super_block *sb = folio_mapping(folio)->host->i_sb;
+ struct netfs_io_request *rreq;
+ struct erofs_map_dev mdev = {
+ .m_deviceid = 0,
+ .m_pa = folio_pos(folio),
+ };
+
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ goto out;
+
+ rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(rreq))
+ goto out;
+
+ return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, mdev.m_pa);
+out:
+ folio_unlock(folio);
+ return ret;
+}
+
+static int erofs_fscache_readpage_inline(struct folio *folio,
+ struct erofs_map_blocks *map)
+{
+ struct super_block *sb = folio_mapping(folio)->host->i_sb;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ erofs_blk_t blknr;
+ size_t offset, len;
+ void *src, *dst;
+
+ /* For tail packing layout, the offset may be non-zero. */
+ offset = erofs_blkoff(map->m_pa);
+ blknr = erofs_blknr(map->m_pa);
+ len = map->m_llen;
+
+ src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+ dst = kmap_local_folio(folio, 0);
+ memcpy(dst, src + offset, len);
+ memset(dst + len, 0, PAGE_SIZE - len);
+ kunmap_local(dst);
+
+ erofs_put_metabuf(&buf);
+ return 0;
+}
+
+static int erofs_fscache_readpage(struct file *file, struct page *page)
+{
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio_mapping(folio)->host;
+ struct super_block *sb = inode->i_sb;
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
+ struct netfs_io_request *rreq;
+ erofs_off_t pos;
+ loff_t pstart;
+ int ret;
+
+ DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+
+ pos = folio_pos(folio);
+ map.m_la = pos;
+
+ ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (ret)
+ goto out_unlock;
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_range(folio, 0, folio_size(folio));
+ goto out_uptodate;
+ }
+
+ if (map.m_flags & EROFS_MAP_META) {
+ ret = erofs_fscache_readpage_inline(folio, &map);
+ goto out_uptodate;
+ }
+
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ goto out_unlock;
+
+
+ rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(rreq))
+ goto out_unlock;
+
+ pstart = mdev.m_pa + (pos - map.m_la);
+ return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, pstart);
+
+out_uptodate:
+ if (!ret)
+ folio_mark_uptodate(folio);
+out_unlock:
+ folio_unlock(folio);
+ return ret;
+}
+
+static void erofs_fscache_advance_folios(struct readahead_control *rac,
+ size_t len, bool unlock)
+{
+ while (len) {
+ struct folio *folio = readahead_folio(rac);
+ len -= folio_size(folio);
+ if (unlock) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ }
+}
+
+static void erofs_fscache_readahead(struct readahead_control *rac)
+{
+ struct inode *inode = rac->mapping->host;
+ struct super_block *sb = inode->i_sb;
+ size_t len, count, done = 0;
+ erofs_off_t pos;
+ loff_t start, offset;
+ int ret;
+
+ if (!readahead_count(rac))
+ return;
+
+ start = readahead_pos(rac);
+ len = readahead_length(rac);
+
+ do {
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
+ struct netfs_io_request *rreq;
+
+ pos = start + done;
+ map.m_la = pos;
+
+ ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (ret)
+ return;
+
+ offset = start + done;
+ count = min_t(size_t, map.m_llen - (pos - map.m_la),
+ len - done);
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ struct iov_iter iter;
+
+ iov_iter_xarray(&iter, READ, &rac->mapping->i_pages,
+ offset, count);
+ iov_iter_zero(count, &iter);
+
+ erofs_fscache_advance_folios(rac, count, true);
+ ret = count;
+ continue;
+ }
+
+ if (map.m_flags & EROFS_MAP_META) {
+ struct folio *folio = readahead_folio(rac);
+
+ ret = erofs_fscache_readpage_inline(folio, &map);
+ if (!ret) {
+ folio_mark_uptodate(folio);
+ ret = folio_size(folio);
+ }
+
+ folio_unlock(folio);
+ continue;
+ }
+
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return;
+
+ rreq = erofs_fscache_alloc_request(rac->mapping, offset, count);
+ if (IS_ERR(rreq))
+ return;
+ /*
+ * Drop the ref of folios here. Unlock them in
+ * rreq_unlock_folios() when rreq complete.
+ */
+ erofs_fscache_advance_folios(rac, count, false);
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, mdev.m_pa + (pos - map.m_la));
+ if (!ret)
+ ret = count;
+ } while (ret > 0 && ((done += ret) < len));
+}
+
+static const struct address_space_operations erofs_fscache_meta_aops = {
+ .readpage = erofs_fscache_meta_readpage,
+};
+
+const struct address_space_operations erofs_fscache_access_aops = {
+ .readpage = erofs_fscache_readpage,
+ .readahead = erofs_fscache_readahead,
+};
+
+int erofs_fscache_register_cookie(struct super_block *sb,
+ struct erofs_fscache **fscache,
+ char *name, bool need_inode)
+{
+ struct fscache_volume *volume = EROFS_SB(sb)->volume;
+ struct erofs_fscache *ctx;
+ struct fscache_cookie *cookie;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
+ name, strlen(name), NULL, 0, 0);
+ if (!cookie) {
+ erofs_err(sb, "failed to get cookie for %s", name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ fscache_use_cookie(cookie, false);
+ ctx->cookie = cookie;
+
+ if (need_inode) {
+ struct inode *const inode = new_inode(sb);
+
+ if (!inode) {
+ erofs_err(sb, "failed to get anon inode for %s", name);
+ ret = -ENOMEM;
+ goto err_cookie;
+ }
+
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+
+ ctx->inode = inode;
+ }
+
+ *fscache = ctx;
+ return 0;
+
+err_cookie:
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+ ctx->cookie = NULL;
+err:
+ kfree(ctx);
+ return ret;
+}
+
+void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache)
+{
+ struct erofs_fscache *ctx = *fscache;
+
+ if (!ctx)
+ return;
+
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+ ctx->cookie = NULL;
+
+ iput(ctx->inode);
+ ctx->inode = NULL;
+
+ kfree(ctx);
+ *fscache = NULL;
+}
+
+int erofs_fscache_register_fs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct fscache_volume *volume;
+ char *name;
+ int ret = 0;
+
+ name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid);
+ if (!name)
+ return -ENOMEM;
+
+ volume = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(volume)) {
+ erofs_err(sb, "failed to register volume for %s", name);
+ ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
+ volume = NULL;
+ }
+
+ sbi->volume = volume;
+ kfree(name);
+ return ret;
+}
+
+void erofs_fscache_unregister_fs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ fscache_relinquish_volume(sbi->volume, NULL, false);
+ sbi->volume = NULL;
+}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index e8b37ba5e9ad..bcc8335b46b3 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -8,11 +8,6 @@
#include <trace/events/erofs.h>
-/*
- * if inode is successfully read, return its inode page (or sometimes
- * the inode payload page if it's an extended inode) in order to fill
- * inline data if possible.
- */
static void *erofs_read_inode(struct erofs_buf *buf,
struct inode *inode, unsigned int *ofs)
{
@@ -297,6 +292,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
out_unlock:
erofs_put_metabuf(&buf);
@@ -370,7 +369,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
return 0;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 5298c4ee277d..cfee49d33b95 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -49,6 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
+ struct erofs_fscache *fscache;
struct block_device *bdev;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -74,6 +75,7 @@ struct erofs_mount_opts {
unsigned int max_sync_decompress_pages;
#endif
unsigned int mount_opt;
+ char *fsid;
};
struct erofs_dev_context {
@@ -96,6 +98,11 @@ struct erofs_sb_lz4_info {
u16 max_pclusterblks;
};
+struct erofs_fscache {
+ struct fscache_cookie *cookie;
+ struct inode *inode;
+};
+
struct erofs_sb_info {
struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
@@ -146,6 +153,10 @@ struct erofs_sb_info {
/* sysfs support */
struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
struct completion s_kobj_unregister;
+
+ /* fscache support */
+ struct fscache_volume *volume;
+ struct erofs_fscache *s_fscache;
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -161,6 +172,11 @@ struct erofs_sb_info {
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+static inline bool erofs_is_fscache_mode(struct super_block *sb)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+}
+
enum {
EROFS_ZIP_CACHE_DISABLED,
EROFS_ZIP_CACHE_READAHEAD,
@@ -381,31 +397,6 @@ extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
extern const struct address_space_operations z_erofs_aops;
-/*
- * Logical to physical block mapping
- *
- * Different with other file systems, it is used for 2 access modes:
- *
- * 1) RAW access mode:
- *
- * Users pass a valid (m_lblk, m_lofs -- usually 0) pair,
- * and get the valid m_pblk, m_pofs and the longest m_len(in bytes).
- *
- * Note that m_lblk in the RAW access mode refers to the number of
- * the compressed ondisk block rather than the uncompressed
- * in-memory block for the compressed file.
- *
- * m_pofs equals to m_lofs except for the inline data page.
- *
- * 2) Normal access mode:
- *
- * If the inode is not compressed, it has no difference with
- * the RAW access mode. However, if the inode is compressed,
- * users should pass a valid (m_lblk, m_lofs) pair, and get
- * the needed m_pblk, m_pofs, m_len to get the compressed data
- * and the updated m_lblk, m_lofs which indicates the start
- * of the corresponding uncompressed data in the file.
- */
enum {
BH_Encoded = BH_PrivateStart,
BH_FullMapped,
@@ -467,6 +458,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
#endif /* !CONFIG_EROFS_FS_ZIP */
struct erofs_map_dev {
+ struct erofs_fscache *m_fscache;
struct block_device *m_bdev;
struct dax_device *m_daxdev;
u64 m_dax_part_off;
@@ -486,6 +478,8 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags);
/* inode.c */
static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
@@ -509,7 +503,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
/* namei.c */
extern const struct inode_operations erofs_dir_iops;
-int erofs_namei(struct inode *dir, struct qstr *name,
+int erofs_namei(struct inode *dir, const struct qstr *name,
erofs_nid_t *nid, unsigned int *d_type);
/* dir.c */
@@ -611,6 +605,36 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+/* fscache.c */
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+int erofs_fscache_register_fs(struct super_block *sb);
+void erofs_fscache_unregister_fs(struct super_block *sb);
+
+int erofs_fscache_register_cookie(struct super_block *sb,
+ struct erofs_fscache **fscache,
+ char *name, bool need_inode);
+void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache);
+
+extern const struct address_space_operations erofs_fscache_access_aops;
+#else
+static inline int erofs_fscache_register_fs(struct super_block *sb)
+{
+ return 0;
+}
+static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
+
+static inline int erofs_fscache_register_cookie(struct super_block *sb,
+ struct erofs_fscache **fscache,
+ char *name, bool need_inode)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache)
+{
+}
+#endif
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 554efa363317..fd75506799c4 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -165,9 +165,8 @@ out: /* free if the candidate is valid */
return candidate;
}
-int erofs_namei(struct inode *dir,
- struct qstr *name,
- erofs_nid_t *nid, unsigned int *d_type)
+int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
+ unsigned int *d_type)
{
int ndirents;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0c4b41130c2f..c6f5fa4ab244 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -13,6 +13,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/dax.h>
+#include <linux/exportfs.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -219,7 +220,52 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
#endif
-static int erofs_init_devices(struct super_block *sb,
+static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
+ struct erofs_device_info *dif, erofs_off_t *pos)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_deviceslot *dis;
+ struct block_device *bdev;
+ void *ptr;
+ int ret;
+
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ dis = ptr + erofs_blkoff(*pos);
+
+ if (!dif->path) {
+ if (!dis->tag[0]) {
+ erofs_err(sb, "empty device tag @ pos %llu", *pos);
+ return -EINVAL;
+ }
+ dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL);
+ if (!dif->path)
+ return -ENOMEM;
+ }
+
+ if (erofs_is_fscache_mode(sb)) {
+ ret = erofs_fscache_register_cookie(sb, &dif->fscache,
+ dif->path, false);
+ if (ret)
+ return ret;
+ } else {
+ bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
+ sb->s_type);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
+ }
+
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ *pos += EROFS_DEVT_SLOT_SIZE;
+ return 0;
+}
+
+static int erofs_scan_devices(struct super_block *sb,
struct erofs_super_block *dsb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -227,8 +273,6 @@ static int erofs_init_devices(struct super_block *sb,
erofs_off_t pos;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_device_info *dif;
- struct erofs_deviceslot *dis;
- void *ptr;
int id, err = 0;
sbi->total_blocks = sbi->primarydevice_blocks;
@@ -237,7 +281,8 @@ static int erofs_init_devices(struct super_block *sb,
else
ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
- if (ondisk_extradevs != sbi->devs->extra_devices) {
+ if (sbi->devs->extra_devices &&
+ ondisk_extradevs != sbi->devs->extra_devices) {
erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
ondisk_extradevs, sbi->devs->extra_devices);
return -EINVAL;
@@ -248,30 +293,31 @@ static int erofs_init_devices(struct super_block *sb,
sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
down_read(&sbi->devs->rwsem);
- idr_for_each_entry(&sbi->devs->tree, dif, id) {
- struct block_device *bdev;
-
- ptr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
- EROFS_KMAP);
- if (IS_ERR(ptr)) {
- err = PTR_ERR(ptr);
- break;
+ if (sbi->devs->extra_devices) {
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
}
- dis = ptr + erofs_blkoff(pos);
-
- bdev = blkdev_get_by_path(dif->path,
- FMODE_READ | FMODE_EXCL,
- sb->s_type);
- if (IS_ERR(bdev)) {
- err = PTR_ERR(bdev);
- break;
+ } else {
+ for (id = 0; id < ondisk_extradevs; id++) {
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif) {
+ err = -ENOMEM;
+ break;
+ }
+
+ err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL);
+ if (err < 0) {
+ kfree(dif);
+ break;
+ }
+ ++sbi->devs->extra_devices;
+
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
}
- dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
- dif->blocks = le32_to_cpu(dis->blocks);
- dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
- sbi->total_blocks += dif->blocks;
- pos += EROFS_DEVT_SLOT_SIZE;
}
up_read(&sbi->devs->rwsem);
erofs_put_metabuf(&buf);
@@ -358,10 +404,12 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
/* handle multiple devices */
- ret = erofs_init_devices(sb, dsb);
+ ret = erofs_scan_devices(sb, dsb);
if (erofs_sb_has_ztailpacking(sbi))
erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
+ if (erofs_is_fscache_mode(sb))
+ erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
@@ -390,6 +438,7 @@ enum {
Opt_dax,
Opt_dax_enum,
Opt_device,
+ Opt_fsid,
Opt_err
};
@@ -414,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
fsparam_string("device", Opt_device),
+ fsparam_string("fsid", Opt_fsid),
{}
};
@@ -509,6 +559,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
}
++ctx->devs->extra_devices;
break;
+ case Opt_fsid:
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ kfree(ctx->opt.fsid);
+ ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->opt.fsid)
+ return -ENOMEM;
+#else
+ errorfc(fc, "fsid option not supported");
+#endif
+ break;
default:
return -ENOPARAM;
}
@@ -577,6 +637,44 @@ static int erofs_init_managed_cache(struct super_block *sb)
static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif
+static struct inode *erofs_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ return erofs_iget(sb, ino, false);
+}
+
+static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_get_parent(struct dentry *child)
+{
+ erofs_nid_t nid;
+ unsigned int d_type;
+ int err;
+
+ err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type);
+ if (err)
+ return ERR_PTR(err);
+ return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR));
+}
+
+static const struct export_operations erofs_export_ops = {
+ .fh_to_dentry = erofs_fh_to_dentry,
+ .fh_to_parent = erofs_fh_to_parent,
+ .get_parent = erofs_get_parent,
+};
+
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
@@ -585,11 +683,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
int err;
sb->s_magic = EROFS_SUPER_MAGIC;
-
- if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
- erofs_err(sb, "failed to set erofs blksize");
- return -EINVAL;
- }
+ sb->s_flags |= SB_RDONLY | SB_NOATIME;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &erofs_sops;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -597,10 +693,36 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_fs_info = sbi;
sbi->opt = ctx->opt;
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off);
+ ctx->opt.fsid = NULL;
sbi->devs = ctx->devs;
ctx->devs = NULL;
+ if (erofs_is_fscache_mode(sb)) {
+ sb->s_blocksize = EROFS_BLKSIZ;
+ sb->s_blocksize_bits = LOG_BLOCK_SIZE;
+
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+
+ err = erofs_fscache_register_cookie(sb, &sbi->s_fscache,
+ sbi->opt.fsid, true);
+ if (err)
+ return err;
+
+ err = super_setup_bdi(sb);
+ if (err)
+ return err;
+ } else {
+ if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
+ erofs_err(sb, "failed to set erofs blksize");
+ return -EINVAL;
+ }
+
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dax_part_off);
+ }
+
err = erofs_read_superblock(sb);
if (err)
return err;
@@ -613,12 +735,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
clear_opt(&sbi->opt, DAX_ALWAYS);
}
}
- sb->s_flags |= SB_RDONLY | SB_NOATIME;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
- sb->s_op = &erofs_sops;
+ sb->s_time_gran = 1;
sb->s_xattr = erofs_xattr_handlers;
+ sb->s_export_op = &erofs_export_ops;
if (test_opt(&sbi->opt, POSIX_ACL))
sb->s_flags |= SB_POSIXACL;
@@ -661,6 +781,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
static int erofs_fc_get_tree(struct fs_context *fc)
{
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+
return get_tree_bdev(fc, erofs_fc_fill_super);
}
@@ -690,6 +815,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
fs_put_dax(dif->dax_dev);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ erofs_fscache_unregister_cookie(&dif->fscache);
kfree(dif->path);
kfree(dif);
return 0;
@@ -709,6 +835,7 @@ static void erofs_fc_free(struct fs_context *fc)
struct erofs_fs_context *ctx = fc->fs_private;
erofs_free_dev_context(ctx->devs);
+ kfree(ctx->opt.fsid);
kfree(ctx);
}
@@ -749,7 +876,10 @@ static void erofs_kill_sb(struct super_block *sb)
WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
- kill_block_super(sb);
+ if (erofs_is_fscache_mode(sb))
+ generic_shutdown_super(sb);
+ else
+ kill_block_super(sb);
sbi = EROFS_SB(sb);
if (!sbi)
@@ -757,6 +887,9 @@ static void erofs_kill_sb(struct super_block *sb)
erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev);
+ erofs_fscache_unregister_cookie(&sbi->s_fscache);
+ erofs_fscache_unregister_fs(sb);
+ kfree(sbi->opt.fsid);
kfree(sbi);
sb->s_fs_info = NULL;
}
@@ -774,6 +907,7 @@ static void erofs_put_super(struct super_block *sb)
iput(sbi->managed_cache);
sbi->managed_cache = NULL;
#endif
+ erofs_fscache_unregister_cookie(&sbi->s_fscache);
}
static struct file_system_type erofs_fs_type = {
@@ -781,7 +915,7 @@ static struct file_system_type erofs_fs_type = {
.name = "erofs",
.init_fs_context = erofs_init_fs_context,
.kill_sb = erofs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("erofs");
@@ -857,7 +991,10 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ u64 id = 0;
+
+ if (!erofs_is_fscache_mode(sb))
+ id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
buf->f_bsize = EROFS_BLKSIZ;
@@ -902,6 +1039,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",dax=always");
if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (opt->fsid)
+ seq_printf(seq, ",fsid=%s", opt->fsid);
+#endif
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index f3babf1e6608..c1383e508bbe 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -205,8 +205,8 @@ int erofs_register_sysfs(struct super_block *sb)
sbi->s_kobj.kset = &erofs_root;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL,
- "%s", sb->s_id);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
+ erofs_is_fscache_mode(sb) ? sbi->opt.fsid : sb->s_id);
if (err)
goto put_sb_kobj;
return 0;
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index e25539072463..72585c9729a2 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -39,6 +39,7 @@ struct fscache_cookie;
#define FSCACHE_ADV_SINGLE_CHUNK 0x01 /* The object is a single chunk of data */
#define FSCACHE_ADV_WRITE_CACHE 0x00 /* Do cache if written to locally */
#define FSCACHE_ADV_WRITE_NOCACHE 0x02 /* Don't cache if written to locally */
+#define FSCACHE_ADV_WANT_CACHE_SIZE 0x04 /* Retrieve cache size at runtime */
#define FSCACHE_INVAL_DIO_WRITE 0x01 /* Invalidate due to DIO write */
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 0c33b715cbfd..80b7728277b1 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -159,6 +159,7 @@ struct netfs_io_subrequest {
#define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */
#define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */
#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */
+#define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */
};
enum netfs_io_origin {
diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h
index 311c14a20e70..d8d4d73fe7b6 100644
--- a/include/trace/events/cachefiles.h
+++ b/include/trace/events/cachefiles.h
@@ -31,6 +31,8 @@ enum cachefiles_obj_ref_trace {
cachefiles_obj_see_lookup_failed,
cachefiles_obj_see_withdraw_cookie,
cachefiles_obj_see_withdrawal,
+ cachefiles_obj_get_ondemand_fd,
+ cachefiles_obj_put_ondemand_fd,
};
enum fscache_why_object_killed {
@@ -671,6 +673,180 @@ TRACE_EVENT(cachefiles_io_error,
__entry->error)
);
+TRACE_EVENT(cachefiles_ondemand_open,
+ TP_PROTO(struct cachefiles_object *obj, struct cachefiles_msg *msg,
+ struct cachefiles_open *load),
+
+ TP_ARGS(obj, msg, load),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, obj )
+ __field(unsigned int, msg_id )
+ __field(unsigned int, object_id )
+ __field(unsigned int, fd )
+ __field(unsigned int, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->obj = obj ? obj->debug_id : 0;
+ __entry->msg_id = msg->msg_id;
+ __entry->object_id = msg->object_id;
+ __entry->fd = load->fd;
+ __entry->flags = load->flags;
+ ),
+
+ TP_printk("o=%08x mid=%x oid=%x fd=%d f=%x",
+ __entry->obj,
+ __entry->msg_id,
+ __entry->object_id,
+ __entry->fd,
+ __entry->flags)
+ );
+
+TRACE_EVENT(cachefiles_ondemand_copen,
+ TP_PROTO(struct cachefiles_object *obj, unsigned int msg_id,
+ long len),
+
+ TP_ARGS(obj, msg_id, len),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, obj )
+ __field(unsigned int, msg_id )
+ __field(long, len )
+ ),
+
+ TP_fast_assign(
+ __entry->obj = obj ? obj->debug_id : 0;
+ __entry->msg_id = msg_id;
+ __entry->len = len;
+ ),
+
+ TP_printk("o=%08x mid=%x l=%lx",
+ __entry->obj,
+ __entry->msg_id,
+ __entry->len)
+ );
+
+TRACE_EVENT(cachefiles_ondemand_close,
+ TP_PROTO(struct cachefiles_object *obj, struct cachefiles_msg *msg),
+
+ TP_ARGS(obj, msg),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, obj )
+ __field(unsigned int, msg_id )
+ __field(unsigned int, object_id )
+ ),
+
+ TP_fast_assign(
+ __entry->obj = obj ? obj->debug_id : 0;
+ __entry->msg_id = msg->msg_id;
+ __entry->object_id = msg->object_id;
+ ),
+
+ TP_printk("o=%08x mid=%x oid=%x",
+ __entry->obj,
+ __entry->msg_id,
+ __entry->object_id)
+ );
+
+TRACE_EVENT(cachefiles_ondemand_read,
+ TP_PROTO(struct cachefiles_object *obj, struct cachefiles_msg *msg,
+ struct cachefiles_read *load),
+
+ TP_ARGS(obj, msg, load),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, obj )
+ __field(unsigned int, msg_id )
+ __field(unsigned int, object_id )
+ __field(loff_t, start )
+ __field(size_t, len )
+ ),
+
+ TP_fast_assign(
+ __entry->obj = obj ? obj->debug_id : 0;
+ __entry->msg_id = msg->msg_id;
+ __entry->object_id = msg->object_id;
+ __entry->start = load->off;
+ __entry->len = load->len;
+ ),
+
+ TP_printk("o=%08x mid=%x oid=%x s=%llx l=%zx",
+ __entry->obj,
+ __entry->msg_id,
+ __entry->object_id,
+ __entry->start,
+ __entry->len)
+ );
+
+TRACE_EVENT(cachefiles_ondemand_cread,
+ TP_PROTO(struct cachefiles_object *obj, unsigned int msg_id),
+
+ TP_ARGS(obj, msg_id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, obj )
+ __field(unsigned int, msg_id )
+ ),
+
+ TP_fast_assign(
+ __entry->obj = obj ? obj->debug_id : 0;
+ __entry->msg_id = msg_id;
+ ),
+
+ TP_printk("o=%08x mid=%x",
+ __entry->obj,
+ __entry->msg_id)
+ );
+
+TRACE_EVENT(cachefiles_ondemand_fd_write,
+ TP_PROTO(struct cachefiles_object *obj, struct inode *backer,
+ loff_t start, size_t len),
+
+ TP_ARGS(obj, backer, start, len),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, obj )
+ __field(unsigned int, backer )
+ __field(loff_t, start )
+ __field(size_t, len )
+ ),
+
+ TP_fast_assign(
+ __entry->obj = obj ? obj->debug_id : 0;
+ __entry->backer = backer->i_ino;
+ __entry->start = start;
+ __entry->len = len;
+ ),
+
+ TP_printk("o=%08x iB=%x s=%llx l=%zx",
+ __entry->obj,
+ __entry->backer,
+ __entry->start,
+ __entry->len)
+ );
+
+TRACE_EVENT(cachefiles_ondemand_fd_release,
+ TP_PROTO(struct cachefiles_object *obj, int object_id),
+
+ TP_ARGS(obj, object_id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, obj )
+ __field(unsigned int, object_id )
+ ),
+
+ TP_fast_assign(
+ __entry->obj = obj ? obj->debug_id : 0;
+ __entry->object_id = object_id;
+ ),
+
+ TP_printk("o=%08x oid=%x",
+ __entry->obj,
+ __entry->object_id)
+ );
+
#endif /* _TRACE_CACHEFILES_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/cachefiles.h b/include/uapi/linux/cachefiles.h
new file mode 100644
index 000000000000..78caa73e5343
--- /dev/null
+++ b/include/uapi/linux/cachefiles.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_CACHEFILES_H
+#define _LINUX_CACHEFILES_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Fscache ensures that the maximum length of cookie key is 255. The volume key
+ * is controlled by netfs, and generally no bigger than 255.
+ */
+#define CACHEFILES_MSG_MAX_SIZE 1024
+
+enum cachefiles_opcode {
+ CACHEFILES_OP_OPEN,
+ CACHEFILES_OP_CLOSE,
+ CACHEFILES_OP_READ,
+};
+
+/*
+ * Message Header
+ *
+ * @msg_id a unique ID identifying this message
+ * @opcode message type, CACHEFILE_OP_*
+ * @len message length, including message header and following data
+ * @object_id a unique ID identifying a cache file
+ * @data message type specific payload
+ */
+struct cachefiles_msg {
+ __u32 msg_id;
+ __u32 opcode;
+ __u32 len;
+ __u32 object_id;
+ __u8 data[];
+};
+
+/*
+ * @data contains the volume_key followed directly by the cookie_key. volume_key
+ * is a NUL-terminated string; @volume_key_size indicates the size of the volume
+ * key in bytes. cookie_key is binary data, which is netfs specific;
+ * @cookie_key_size indicates the size of the cookie key in bytes.
+ *
+ * @fd identifies an anon_fd referring to the cache file.
+ */
+struct cachefiles_open {
+ __u32 volume_key_size;
+ __u32 cookie_key_size;
+ __u32 fd;
+ __u32 flags;
+ __u8 data[];
+};
+
+/*
+ * @off indicates the starting offset of the requested file range
+ * @len indicates the length of the requested file range
+ */
+struct cachefiles_read {
+ __u64 off;
+ __u64 len;
+};
+
+/*
+ * Reply for READ request
+ * @arg for this ioctl is the @id field of READ request.
+ */
+#define CACHEFILES_IOC_READ_COMPLETE _IOW(0x98, 1, int)
+
+#endif