Diffstat (limited to 'fs/cachefiles')
 -rw-r--r--  fs/cachefiles/Kconfig        |   35
 -rw-r--r--  fs/cachefiles/Makefile       |    9
 -rw-r--r--  fs/cachefiles/bind.c         |  281
 -rw-r--r--  fs/cachefiles/cache.c        |  428
 -rw-r--r--  fs/cachefiles/daemon.c       |  393
 -rw-r--r--  fs/cachefiles/error_inject.c |   36
 -rw-r--r--  fs/cachefiles/interface.c    |  732
 -rw-r--r--  fs/cachefiles/internal.h     |  448
 -rw-r--r--  fs/cachefiles/io.c           |  762
 -rw-r--r--  fs/cachefiles/key.c          |  208
 -rw-r--r--  fs/cachefiles/main.c         |   45
 -rw-r--r--  fs/cachefiles/namei.c        | 1264
 -rw-r--r--  fs/cachefiles/ondemand.c     |  762
 -rw-r--r--  fs/cachefiles/proc.c         |  134
 -rw-r--r--  fs/cachefiles/rdwr.c         |  990
 -rw-r--r--  fs/cachefiles/security.c     |   30
 -rw-r--r--  fs/cachefiles/volume.c       |  141
 -rw-r--r--  fs/cachefiles/xattr.c        |  436
18 files changed, 3949 insertions, 3185 deletions
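
The hunks below delete the old bind.c binding logic, add cache.c in its place, and rework the daemon interface in daemon.c. As a reading aid, here is a minimal, hypothetical userspace sketch (not part of the patch) of how a cachefilesd-style daemon would drive the rewritten interface. The "dir"/"tag"/"bind" commands follow the daemon command table in daemon.c below; "bind ondemand" is only accepted when CONFIG_CACHEFILES_ONDEMAND is enabled, and the /dev/cachefiles device path, cache directory, and tag name are assumptions for illustration:

/*
 * Hypothetical userspace sketch: bring a cache online through the
 * rewritten daemon interface. Error handling is minimal and the
 * cache directory path is made up.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int daemon_cmd(int fd, const char *cmd)
{
	/* One command per write(); the kernel side copies it with
	 * memdup_user_nul() and dispatches via cachefiles_daemon_cmds[]. */
	if (write(fd, cmd, strlen(cmd)) < 0) {
		perror(cmd);
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = open("/dev/cachefiles", O_RDWR);

	if (fd < 0) {
		perror("/dev/cachefiles");
		return 1;
	}
	if (daemon_cmd(fd, "dir /var/cache/fscache") ||
	    daemon_cmd(fd, "tag mycache") ||
	    daemon_cmd(fd, "bind ondemand"))	/* plain "bind" selects the classic mode */
		return 1;
	/* The cache stays bound only while this fd is open: releasing it
	 * runs cachefiles_daemon_release(), which drops the unbind
	 * pincount and tears the cache down. */
	pause();
	return 0;
}

Note also that the new cachefiles_daemon_bind() returns -ERANGE when the limits are misordered (stop < cull < run < 100 must hold for both the file and block percentages), where the deleted bind.c merely asserted that ordering.
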
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 80e9c6167f0b..c5a070550ee3 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -1,13 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only config CACHEFILES tristate "Filesystem caching on files" - depends on FSCACHE && BLOCK + depends on NETFS_SUPPORT && FSCACHE && BLOCK help This permits use of a mounted filesystem as a cache for other filesystems - primarily networking filesystems - thus allowing fast local disk to enhance the speed of slower devices. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. config CACHEFILES_DEBUG @@ -19,21 +20,21 @@ config CACHEFILES_DEBUG enabled by setting bits in /sys/modules/cachefiles/parameter/debug or by including a debugging specifier in /etc/cachefilesd.conf. -config CACHEFILES_HISTOGRAM - bool "Gather latency information on CacheFiles" - depends on CACHEFILES && PROC_FS +config CACHEFILES_ERROR_INJECTION + bool "Provide error injection for cachefiles" + depends on CACHEFILES && SYSCTL help + This permits error injection to be enabled in cachefiles whilst a + cache is in service. - This option causes latency information to be gathered on CacheFiles - operation and exported through file: - - /proc/fs/cachefiles/histogram - - The generation of this histogram adds a certain amount of overhead to - execution as there are a number of points at which data is gathered, - and on a multi-CPU system these may be on cachelines that keep - bouncing between CPUs. On the other hand, the histogram may be - useful for debugging purposes. Saying 'N' here is recommended. +config CACHEFILES_ONDEMAND + bool "Support for on-demand read" + depends on CACHEFILES + default n + help + This permits userspace to enable the cachefiles on-demand read mode. + In this mode, when a cache miss occurs, responsibility for fetching + the data lies with the cachefiles backend instead of with the netfs + and is delegated to userspace. - See Documentation/filesystems/caching/cachefiles.txt for more - information. + If unsure, say N. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 32cbab0ffce3..c37a7a9af10b 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -1,18 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for caching in a mounted filesystem # cachefiles-y := \ - bind.o \ + cache.o \ daemon.o \ interface.o \ + io.o \ key.o \ main.o \ namei.o \ - rdwr.o \ security.o \ + volume.o \ xattr.o -cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o +cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o +cachefiles-$(CONFIG_CACHEFILES_ONDEMAND) += ondemand.o obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c deleted file mode 100644 index 622f4696e484..000000000000 --- a/fs/cachefiles/bind.c +++ /dev/null @@ -1,281 +0,0 @@ -/* Bind and unbind a cache from the filesystem backing it - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
- */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/completion.h> -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/namei.h> -#include <linux/mount.h> -#include <linux/statfs.h> -#include <linux/ctype.h> -#include "internal.h" - -static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); - -/* - * bind a directory as a cache - */ -int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) -{ - _enter("{%u,%u,%u,%u,%u,%u},%s", - cache->frun_percent, - cache->fcull_percent, - cache->fstop_percent, - cache->brun_percent, - cache->bcull_percent, - cache->bstop_percent, - args); - - /* start by checking things over */ - ASSERT(cache->fstop_percent >= 0 && - cache->fstop_percent < cache->fcull_percent && - cache->fcull_percent < cache->frun_percent && - cache->frun_percent < 100); - - ASSERT(cache->bstop_percent >= 0 && - cache->bstop_percent < cache->bcull_percent && - cache->bcull_percent < cache->brun_percent && - cache->brun_percent < 100); - - if (*args) { - kerror("'bind' command doesn't take an argument"); - return -EINVAL; - } - - if (!cache->rootdirname) { - kerror("No cache directory specified"); - return -EINVAL; - } - - /* don't permit already bound caches to be re-bound */ - if (test_bit(CACHEFILES_READY, &cache->flags)) { - kerror("Cache already bound"); - return -EBUSY; - } - - /* make sure we have copies of the tag and dirname strings */ - if (!cache->tag) { - /* the tag string is released by the fops->release() - * function, so we don't release it on error here */ - cache->tag = kstrdup("CacheFiles", GFP_KERNEL); - if (!cache->tag) - return -ENOMEM; - } - - /* add the cache */ - return cachefiles_daemon_add_cache(cache); -} - -/* - * add a cache - */ -static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) -{ - struct cachefiles_object *fsdef; - struct path path; - struct kstatfs stats; - struct dentry *graveyard, *cachedir, *root; - const struct cred *saved_cred; - int ret; - - _enter(""); - - /* we want to work under the module's security ID */ - ret = cachefiles_get_security_ID(cache); - if (ret < 0) - return ret; - - cachefiles_begin_secure(cache, &saved_cred); - - /* allocate the root index object */ - ret = -ENOMEM; - - fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); - if (!fsdef) - goto error_root_object; - - ASSERTCMP(fsdef->backer, ==, NULL); - - atomic_set(&fsdef->usage, 1); - fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; - - _debug("- fsdef %p", fsdef); - - /* look up the directory at the root of the cache */ - ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); - if (ret < 0) - goto error_open_root; - - cache->mnt = path.mnt; - root = path.dentry; - - /* check parameters */ - ret = -EOPNOTSUPP; - if (!root->d_inode || - !root->d_inode->i_op || - !root->d_inode->i_op->lookup || - !root->d_inode->i_op->mkdir || - !root->d_inode->i_op->setxattr || - !root->d_inode->i_op->getxattr || - !root->d_sb->s_op->statfs || - !root->d_sb->s_op->sync_fs) - goto error_unsupported; - - ret = -EROFS; - if (root->d_sb->s_flags & MS_RDONLY) - goto error_unsupported; - - /* determine the security of the on-disk cache as this governs - * security ID of files we create */ - ret = cachefiles_determine_cache_security(cache, root, &saved_cred); - if (ret < 0) - goto error_unsupported; - - /* get the cache size and blocksize */ - ret = vfs_statfs(&path, &stats); - if (ret < 0) - goto error_unsupported; - - ret = -ERANGE; - if (stats.f_bsize <= 0) 
- goto error_unsupported; - - ret = -EOPNOTSUPP; - if (stats.f_bsize > PAGE_SIZE) - goto error_unsupported; - - cache->bsize = stats.f_bsize; - cache->bshift = 0; - if (stats.f_bsize < PAGE_SIZE) - cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); - - _debug("blksize %u (shift %u)", - cache->bsize, cache->bshift); - - _debug("size %llu, avail %llu", - (unsigned long long) stats.f_blocks, - (unsigned long long) stats.f_bavail); - - /* set up caching limits */ - do_div(stats.f_files, 100); - cache->fstop = stats.f_files * cache->fstop_percent; - cache->fcull = stats.f_files * cache->fcull_percent; - cache->frun = stats.f_files * cache->frun_percent; - - _debug("limits {%llu,%llu,%llu} files", - (unsigned long long) cache->frun, - (unsigned long long) cache->fcull, - (unsigned long long) cache->fstop); - - stats.f_blocks >>= cache->bshift; - do_div(stats.f_blocks, 100); - cache->bstop = stats.f_blocks * cache->bstop_percent; - cache->bcull = stats.f_blocks * cache->bcull_percent; - cache->brun = stats.f_blocks * cache->brun_percent; - - _debug("limits {%llu,%llu,%llu} blocks", - (unsigned long long) cache->brun, - (unsigned long long) cache->bcull, - (unsigned long long) cache->bstop); - - /* get the cache directory and check its type */ - cachedir = cachefiles_get_directory(cache, root, "cache"); - if (IS_ERR(cachedir)) { - ret = PTR_ERR(cachedir); - goto error_unsupported; - } - - fsdef->dentry = cachedir; - fsdef->fscache.cookie = NULL; - - ret = cachefiles_check_object_type(fsdef); - if (ret < 0) - goto error_unsupported; - - /* get the graveyard directory */ - graveyard = cachefiles_get_directory(cache, root, "graveyard"); - if (IS_ERR(graveyard)) { - ret = PTR_ERR(graveyard); - goto error_unsupported; - } - - cache->graveyard = graveyard; - - /* publish the cache */ - fscache_init_cache(&cache->cache, - &cachefiles_cache_ops, - "%s", - fsdef->dentry->d_sb->s_id); - - fscache_object_init(&fsdef->fscache, NULL, &cache->cache); - - ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); - if (ret < 0) - goto error_add_cache; - - /* done */ - set_bit(CACHEFILES_READY, &cache->flags); - dput(root); - - printk(KERN_INFO "CacheFiles:" - " File cache on %s registered\n", - cache->cache.identifier); - - /* check how much space the cache has */ - cachefiles_has_space(cache, 0, 0); - cachefiles_end_secure(cache, saved_cred); - return 0; - -error_add_cache: - dput(cache->graveyard); - cache->graveyard = NULL; -error_unsupported: - mntput(cache->mnt); - cache->mnt = NULL; - dput(fsdef->dentry); - fsdef->dentry = NULL; - dput(root); -error_open_root: - kmem_cache_free(cachefiles_object_jar, fsdef); -error_root_object: - cachefiles_end_secure(cache, saved_cred); - kerror("Failed to register: %d", ret); - return ret; -} - -/* - * unbind a cache on fd release - */ -void cachefiles_daemon_unbind(struct cachefiles_cache *cache) -{ - _enter(""); - - if (test_bit(CACHEFILES_READY, &cache->flags)) { - printk(KERN_INFO "CacheFiles:" - " File cache on %s unregistering\n", - cache->cache.identifier); - - fscache_withdraw_cache(&cache->cache); - } - - dput(cache->graveyard); - mntput(cache->mnt); - - kfree(cache->rootdirname); - kfree(cache->secctx); - kfree(cache->tag); - - _leave(""); -} diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c new file mode 100644 index 000000000000..9fb06dc16520 --- /dev/null +++ b/fs/cachefiles/cache.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Manage high-level VFS aspects of a cache. + * + * Copyright (C) 2007, 2021 Red Hat, Inc. 
All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <linux/statfs.h> +#include <linux/namei.h> +#include <trace/events/fscache.h> +#include "internal.h" + +/* + * Bring a cache online. + */ +int cachefiles_add_cache(struct cachefiles_cache *cache) +{ + struct fscache_cache *cache_cookie; + struct path path; + struct kstatfs stats; + struct dentry *graveyard, *cachedir, *root; + const struct cred *saved_cred; + int ret; + + _enter(""); + + cache_cookie = fscache_acquire_cache(cache->tag); + if (IS_ERR(cache_cookie)) + return PTR_ERR(cache_cookie); + + /* we want to work under the module's security ID */ + ret = cachefiles_get_security_ID(cache); + if (ret < 0) + goto error_getsec; + + cachefiles_begin_secure(cache, &saved_cred); + + /* look up the directory at the root of the cache */ + ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); + if (ret < 0) + goto error_open_root; + + cache->mnt = path.mnt; + root = path.dentry; + + ret = -EINVAL; + if (is_idmapped_mnt(path.mnt)) { + pr_warn("File cache on idmapped mounts not supported"); + goto error_unsupported; + } + + /* Check features of the backing filesystem: + * - Directories must support looking up and directory creation + * - We create tmpfiles to handle invalidation + * - We use xattrs to store metadata + * - We need to be able to query the amount of space available + * - We want to be able to sync the filesystem when stopping the cache + * - We use DIO to/from pages, so the blocksize mustn't be too big. + */ + ret = -EOPNOTSUPP; + if (d_is_negative(root) || + !d_backing_inode(root)->i_op->lookup || + !d_backing_inode(root)->i_op->mkdir || + !d_backing_inode(root)->i_op->tmpfile || + !(d_backing_inode(root)->i_opflags & IOP_XATTR) || + !root->d_sb->s_op->statfs || + !root->d_sb->s_op->sync_fs || + root->d_sb->s_blocksize > PAGE_SIZE) + goto error_unsupported; + + ret = -EROFS; + if (sb_rdonly(root->d_sb)) + goto error_unsupported; + + /* determine the security of the on-disk cache as this governs + * security ID of files we create */ + ret = cachefiles_determine_cache_security(cache, root, &saved_cred); + if (ret < 0) + goto error_unsupported; + + /* get the cache size and blocksize */ + ret = vfs_statfs(&path, &stats); + if (ret < 0) + goto error_unsupported; + + ret = -ERANGE; + if (stats.f_bsize <= 0) + goto error_unsupported; + + ret = -EOPNOTSUPP; + if (stats.f_bsize > PAGE_SIZE) + goto error_unsupported; + + cache->bsize = stats.f_bsize; + cache->bshift = ilog2(stats.f_bsize); + + _debug("blksize %u (shift %u)", + cache->bsize, cache->bshift); + + _debug("size %llu, avail %llu", + (unsigned long long) stats.f_blocks, + (unsigned long long) stats.f_bavail); + + /* set up caching limits */ + do_div(stats.f_files, 100); + cache->fstop = stats.f_files * cache->fstop_percent; + cache->fcull = stats.f_files * cache->fcull_percent; + cache->frun = stats.f_files * cache->frun_percent; + + _debug("limits {%llu,%llu,%llu} files", + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop); + + do_div(stats.f_blocks, 100); + cache->bstop = stats.f_blocks * cache->bstop_percent; + cache->bcull = stats.f_blocks * cache->bcull_percent; + cache->brun = stats.f_blocks * cache->brun_percent; + + _debug("limits {%llu,%llu,%llu} blocks", + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop); + + /* get the cache directory and check its type */ + cachedir = 
cachefiles_get_directory(cache, root, "cache", NULL); + if (IS_ERR(cachedir)) { + ret = PTR_ERR(cachedir); + goto error_unsupported; + } + + cache->store = cachedir; + + /* get the graveyard directory */ + graveyard = cachefiles_get_directory(cache, root, "graveyard", NULL); + if (IS_ERR(graveyard)) { + ret = PTR_ERR(graveyard); + goto error_unsupported; + } + + cache->graveyard = graveyard; + cache->cache = cache_cookie; + + ret = fscache_add_cache(cache_cookie, &cachefiles_cache_ops, cache); + if (ret < 0) + goto error_add_cache; + + /* done */ + set_bit(CACHEFILES_READY, &cache->flags); + dput(root); + + pr_info("File cache on %s registered\n", cache_cookie->name); + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check); + cachefiles_end_secure(cache, saved_cred); + _leave(" = 0 [%px]", cache->cache); + return 0; + +error_add_cache: + cachefiles_put_directory(cache->graveyard); + cache->graveyard = NULL; +error_unsupported: + cachefiles_put_directory(cache->store); + cache->store = NULL; + mntput(cache->mnt); + cache->mnt = NULL; + dput(root); +error_open_root: + cachefiles_end_secure(cache, saved_cred); + put_cred(cache->cache_cred); + cache->cache_cred = NULL; +error_getsec: + fscache_relinquish_cache(cache_cookie); + cache->cache = NULL; + pr_err("Failed to register: %d\n", ret); + return ret; +} + +/* + * See if we have space for a number of pages and/or a number of files in the + * cache + */ +int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr, + enum cachefiles_has_space_for reason) +{ + struct kstatfs stats; + u64 b_avail, b_writing; + int ret; + + struct path path = { + .mnt = cache->mnt, + .dentry = cache->mnt->mnt_root, + }; + + //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", + // (unsigned long long) cache->frun, + // (unsigned long long) cache->fcull, + // (unsigned long long) cache->fstop, + // (unsigned long long) cache->brun, + // (unsigned long long) cache->bcull, + // (unsigned long long) cache->bstop, + // fnr, bnr); + + /* find out how many pages of blockdev are available */ + memset(&stats, 0, sizeof(stats)); + + ret = vfs_statfs(&path, &stats); + if (ret < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(path.dentry), ret, + cachefiles_trace_statfs_error); + if (ret == -EIO) + cachefiles_io_error(cache, "statfs failed"); + _leave(" = %d", ret); + return ret; + } + + b_avail = stats.f_bavail; + b_writing = atomic_long_read(&cache->b_writing); + if (b_avail > b_writing) + b_avail -= b_writing; + else + b_avail = 0; + + //_debug("avail %llu,%llu", + // (unsigned long long)stats.f_ffree, + // (unsigned long long)b_avail); + + /* see if there is sufficient space */ + if (stats.f_ffree > fnr) + stats.f_ffree -= fnr; + else + stats.f_ffree = 0; + + if (b_avail > bnr) + b_avail -= bnr; + else + b_avail = 0; + + ret = -ENOBUFS; + if (stats.f_ffree < cache->fstop || + b_avail < cache->bstop) + goto stop_and_begin_cull; + + ret = 0; + if (stats.f_ffree < cache->fcull || + b_avail < cache->bcull) + goto begin_cull; + + if (test_bit(CACHEFILES_CULLING, &cache->flags) && + stats.f_ffree >= cache->frun && + b_avail >= cache->brun && + test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) + ) { + _debug("cease culling"); + cachefiles_state_changed(cache); + } + + //_leave(" = 0"); + return 0; + +stop_and_begin_cull: + switch (reason) { + case cachefiles_has_space_for_write: + fscache_count_no_write_space(); + break; + case cachefiles_has_space_for_create: + fscache_count_no_create_space(); + 
break; + default: + break; + } +begin_cull: + if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { + _debug("### CULL CACHE ###"); + cachefiles_state_changed(cache); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * Mark all the objects as being out of service and queue them all for cleanup. + */ +static void cachefiles_withdraw_objects(struct cachefiles_cache *cache) +{ + struct cachefiles_object *object; + unsigned int count = 0; + + _enter(""); + + spin_lock(&cache->object_list_lock); + + while (!list_empty(&cache->object_list)) { + object = list_first_entry(&cache->object_list, + struct cachefiles_object, cache_link); + cachefiles_see_object(object, cachefiles_obj_see_withdrawal); + list_del_init(&object->cache_link); + fscache_withdraw_cookie(object->cookie); + count++; + if ((count & 63) == 0) { + spin_unlock(&cache->object_list_lock); + cond_resched(); + spin_lock(&cache->object_list_lock); + } + } + + spin_unlock(&cache->object_list_lock); + _leave(" [%u objs]", count); +} + +/* + * Withdraw fscache volumes. + */ +static void cachefiles_withdraw_fscache_volumes(struct cachefiles_cache *cache) +{ + struct list_head *cur; + struct cachefiles_volume *volume; + struct fscache_volume *vcookie; + + _enter(""); +retry: + spin_lock(&cache->object_list_lock); + list_for_each(cur, &cache->volumes) { + volume = list_entry(cur, struct cachefiles_volume, cache_link); + + if (atomic_read(&volume->vcookie->n_accesses) == 0) + continue; + + vcookie = fscache_try_get_volume(volume->vcookie, + fscache_volume_get_withdraw); + if (vcookie) { + spin_unlock(&cache->object_list_lock); + fscache_withdraw_volume(vcookie); + fscache_put_volume(vcookie, fscache_volume_put_withdraw); + goto retry; + } + } + spin_unlock(&cache->object_list_lock); + + _leave(""); +} + +/* + * Withdraw cachefiles volumes. + */ +static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache) +{ + _enter(""); + + for (;;) { + struct fscache_volume *vcookie = NULL; + struct cachefiles_volume *volume = NULL; + + spin_lock(&cache->object_list_lock); + if (!list_empty(&cache->volumes)) { + volume = list_first_entry(&cache->volumes, + struct cachefiles_volume, cache_link); + vcookie = fscache_try_get_volume(volume->vcookie, + fscache_volume_get_withdraw); + if (!vcookie) { + spin_unlock(&cache->object_list_lock); + cpu_relax(); + continue; + } + list_del_init(&volume->cache_link); + } + spin_unlock(&cache->object_list_lock); + if (!volume) + break; + + cachefiles_withdraw_volume(volume); + fscache_put_volume(vcookie, fscache_volume_put_withdraw); + } + + _leave(""); +} + +/* + * Sync a cache to backing disk. + */ +static void cachefiles_sync_cache(struct cachefiles_cache *cache) +{ + const struct cred *saved_cred; + int ret; + + _enter("%s", cache->cache->name); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefiles_begin_secure(cache, &saved_cred); + down_read(&cache->mnt->mnt_sb->s_umount); + ret = sync_filesystem(cache->mnt->mnt_sb); + up_read(&cache->mnt->mnt_sb->s_umount); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) + cachefiles_io_error(cache, + "Attempt to sync backing fs superblock returned error %d", + ret); +} + +/* + * Withdraw cache objects. 
+ */ +void cachefiles_withdraw_cache(struct cachefiles_cache *cache) +{ + struct fscache_cache *fscache = cache->cache; + + pr_info("File cache on %s unregistering\n", fscache->name); + + fscache_withdraw_cache(fscache); + cachefiles_withdraw_fscache_volumes(cache); + + /* we now have to destroy all the active objects pertaining to this + * cache - which we do by passing them off to thread pool to be + * disposed of */ + cachefiles_withdraw_objects(cache); + fscache_wait_for_objects(fscache); + + cachefiles_withdraw_volumes(cache); + cachefiles_sync_cache(cache); + cache->cache = NULL; + fscache_relinquish_cache(fscache); +} diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 0a1467b15516..1806bff8e59b 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Daemon interface * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include <linux/module.h> @@ -19,6 +15,7 @@ #include <linux/namei.h> #include <linux/poll.h> #include <linux/mount.h> +#include <linux/security.h> #include <linux/statfs.h> #include <linux/ctype.h> #include <linux/string.h> @@ -31,7 +28,7 @@ static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t, loff_t *); static ssize_t cachefiles_daemon_write(struct file *, const char __user *, size_t, loff_t *); -static unsigned int cachefiles_daemon_poll(struct file *, +static __poll_t cachefiles_daemon_poll(struct file *, struct poll_table_struct *); static int cachefiles_daemon_frun(struct cachefiles_cache *, char *); static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *); @@ -45,6 +42,8 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *, char *); static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *); static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *); static int cachefiles_daemon_tag(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bind(struct cachefiles_cache *, char *); +static void cachefiles_daemon_unbind(struct cachefiles_cache *); static unsigned long cachefiles_open; @@ -77,12 +76,16 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { { "inuse", cachefiles_daemon_inuse }, { "secctx", cachefiles_daemon_secctx }, { "tag", cachefiles_daemon_tag }, +#ifdef CONFIG_CACHEFILES_ONDEMAND + { "copen", cachefiles_ondemand_copen }, + { "restore", cachefiles_ondemand_restore }, +#endif { "", NULL } }; /* - * do various checks + * Prepare a cache for caching. 
*/ static int cachefiles_daemon_open(struct inode *inode, struct file *file) { @@ -106,9 +109,13 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) } mutex_init(&cache->daemon_mutex); - cache->active_nodes = RB_ROOT; - rwlock_init(&cache->active_lock); init_waitqueue_head(&cache->daemon_pollwq); + INIT_LIST_HEAD(&cache->volumes); + INIT_LIST_HEAD(&cache->object_list); + spin_lock_init(&cache->object_list_lock); + refcount_set(&cache->unbind_pincount, 1); + xa_init_flags(&cache->reqs, XA_FLAGS_ALLOC); + xa_init_flags(&cache->ondemand_ids, XA_FLAGS_ALLOC1); /* set default caching limits * - limit at 1% free space and/or free files @@ -127,8 +134,56 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) return 0; } +void cachefiles_flush_reqs(struct cachefiles_cache *cache) +{ + struct xarray *xa = &cache->reqs; + struct cachefiles_req *req; + unsigned long index; + + /* + * Make sure the following two operations won't be reordered. + * 1) set CACHEFILES_DEAD bit + * 2) flush requests in the xarray + * Otherwise the request may be enqueued after xarray has been + * flushed, leaving the orphan request never being completed. + * + * CPU 1 CPU 2 + * ===== ===== + * flush requests in the xarray + * test CACHEFILES_DEAD bit + * enqueue the request + * set CACHEFILES_DEAD bit + */ + smp_mb(); + + xa_lock(xa); + xa_for_each(xa, index, req) { + req->error = -EIO; + complete(&req->done); + __xa_erase(xa, index); + } + xa_unlock(xa); + + xa_destroy(&cache->reqs); + xa_destroy(&cache->ondemand_ids); +} + +void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache) +{ + if (refcount_dec_and_test(&cache->unbind_pincount)) { + cachefiles_daemon_unbind(cache); + cachefiles_open = 0; + kfree(cache); + } +} + +void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache) +{ + refcount_inc(&cache->unbind_pincount); +} + /* - * release a cache + * Release a cache. 
*/ static int cachefiles_daemon_release(struct inode *inode, struct file *file) { @@ -140,40 +195,33 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) set_bit(CACHEFILES_DEAD, &cache->flags); - cachefiles_daemon_unbind(cache); - - ASSERT(!cache->active_nodes.rb_node); + if (cachefiles_in_ondemand_mode(cache)) + cachefiles_flush_reqs(cache); /* clean up the control file interface */ cache->cachefilesd = NULL; file->private_data = NULL; - cachefiles_open = 0; - kfree(cache); + cachefiles_put_unbind_pincount(cache); _leave(""); return 0; } -/* - * read the cache state - */ -static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, - size_t buflen, loff_t *pos) +static ssize_t cachefiles_do_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) { - struct cachefiles_cache *cache = file->private_data; + unsigned long long b_released; + unsigned f_released; char buffer[256]; int n; - //_enter(",,%zu,", buflen); - - if (!test_bit(CACHEFILES_READY, &cache->flags)) - return 0; - /* check how much space the cache has */ - cachefiles_has_space(cache, 0, 0); + cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check); /* summarise */ + f_released = atomic_xchg(&cache->f_released, 0); + b_released = atomic_long_xchg(&cache->b_released, 0); clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); n = snprintf(buffer, sizeof(buffer), @@ -183,15 +231,18 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, " fstop=%llx" " brun=%llx" " bcull=%llx" - " bstop=%llx", + " bstop=%llx" + " freleased=%x" + " breleased=%llx", test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', (unsigned long long) cache->frun, (unsigned long long) cache->fcull, (unsigned long long) cache->fstop, (unsigned long long) cache->brun, (unsigned long long) cache->bcull, - (unsigned long long) cache->bstop - ); + (unsigned long long) cache->bstop, + f_released, + b_released); if (n > buflen) return -EMSGSIZE; @@ -203,7 +254,26 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, } /* - * command the cache + * Read the cache state. + */ +static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, + size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + if (cachefiles_in_ondemand_mode(cache)) + return cachefiles_ondemand_daemon_read(cache, _buffer, buflen); + else + return cachefiles_do_daemon_read(cache, _buffer, buflen); +} + +/* + * Take a command from cachefilesd, parse it and act on it. 
*/ static ssize_t cachefiles_daemon_write(struct file *file, const char __user *_data, @@ -222,19 +292,13 @@ static ssize_t cachefiles_daemon_write(struct file *file, if (test_bit(CACHEFILES_DEAD, &cache->flags)) return -EIO; - if (datalen < 0 || datalen > PAGE_SIZE - 1) + if (datalen > PAGE_SIZE - 1) return -EOPNOTSUPP; /* drag the command string into the kernel so we can parse it */ - data = kmalloc(datalen + 1, GFP_KERNEL); - if (!data) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(data, _data, datalen) != 0) - goto error; - - data[datalen] = '\0'; + data = memdup_user_nul(_data, datalen); + if (IS_ERR(data)) + return PTR_ERR(data); ret = -EINVAL; if (memchr(data, '\0', datalen)) @@ -287,42 +351,56 @@ found_command: } /* - * poll for culling state - * - use POLLOUT to indicate culling state + * Poll for culling state + * - use EPOLLOUT to indicate culling state */ -static unsigned int cachefiles_daemon_poll(struct file *file, +static __poll_t cachefiles_daemon_poll(struct file *file, struct poll_table_struct *poll) { struct cachefiles_cache *cache = file->private_data; - unsigned int mask; + XA_STATE(xas, &cache->reqs, 0); + struct cachefiles_req *req; + __poll_t mask; poll_wait(file, &cache->daemon_pollwq, poll); mask = 0; - if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) - mask |= POLLIN; + if (cachefiles_in_ondemand_mode(cache)) { + if (!xa_empty(&cache->reqs)) { + xas_lock(&xas); + xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { + if (!cachefiles_ondemand_is_reopening_read(req)) { + mask |= EPOLLIN; + break; + } + } + xas_unlock(&xas); + } + } else { + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= EPOLLIN; + } if (test_bit(CACHEFILES_CULLING, &cache->flags)) - mask |= POLLOUT; + mask |= EPOLLOUT; return mask; } /* - * give a range error for cache space constraints + * Give a range error for cache space constraints * - can be tail-called */ static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, char *args) { - kerror("Free space limits must be in range" - " 0%%<=stop<cull<run<100%%"); + pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n"); return -EINVAL; } /* - * set the percentage of files at which to stop culling + * Set the percentage of files at which to stop culling * - command: "frun <N>%" */ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) @@ -346,7 +424,7 @@ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of files at which to start culling + * Set the percentage of files at which to start culling * - command: "fcull <N>%" */ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) @@ -370,7 +448,7 @@ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of files at which to stop allocating + * Set the percentage of files at which to stop allocating * - command: "fstop <N>%" */ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) @@ -386,7 +464,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) if (args[0] != '%' || args[1] != '\0') return -EINVAL; - if (fstop < 0 || fstop >= cache->fcull_percent) + if (fstop >= cache->fcull_percent) return cachefiles_daemon_range_error(cache, args); cache->fstop_percent = fstop; @@ -394,7 +472,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to stop 
culling + * Set the percentage of blocks at which to stop culling * - command: "brun <N>%" */ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) @@ -418,7 +496,7 @@ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to start culling + * Set the percentage of blocks at which to start culling * - command: "bcull <N>%" */ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) @@ -442,7 +520,7 @@ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to stop allocating + * Set the percentage of blocks at which to stop allocating * - command: "bstop <N>%" */ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) @@ -458,7 +536,7 @@ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) if (args[0] != '%' || args[1] != '\0') return -EINVAL; - if (bstop < 0 || bstop >= cache->bcull_percent) + if (bstop >= cache->bcull_percent) return cachefiles_daemon_range_error(cache, args); cache->bstop_percent = bstop; @@ -466,7 +544,7 @@ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) } /* - * set the cache directory + * Set the cache directory * - command: "dir <name>" */ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) @@ -476,12 +554,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - kerror("Empty directory specified"); + pr_err("Empty directory specified\n"); return -EINVAL; } if (cache->rootdirname) { - kerror("Second cache directory specified"); + pr_err("Second cache directory specified\n"); return -EEXIST; } @@ -494,35 +572,35 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) } /* - * set the cache security context + * Set the cache security context * - command: "secctx <ctx>" */ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) { - char *secctx; + int err; _enter(",%s", args); if (!*args) { - kerror("Empty security context specified"); + pr_err("Empty security context specified\n"); return -EINVAL; } - if (cache->secctx) { - kerror("Second security context specified"); + if (cache->have_secid) { + pr_err("Second security context specified\n"); return -EINVAL; } - secctx = kstrdup(args, GFP_KERNEL); - if (!secctx) - return -ENOMEM; + err = security_secctx_to_secid(args, strlen(args), &cache->secid); + if (err) + return err; - cache->secctx = secctx; + cache->have_secid = true; return 0; } /* - * set the cache tag + * Set the cache tag * - command: "tag <name>" */ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) @@ -532,7 +610,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - kerror("Empty tag specified"); + pr_err("Empty tag specified\n"); return -EINVAL; } @@ -548,7 +626,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) } /* - * request a node in the cache be culled from the current working directory + * Request a node in the cache be culled from the current working directory * - command: "cull <name>" */ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) @@ -563,19 +641,18 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) goto inval; if (!test_bit(CACHEFILES_READY, &cache->flags)) { - 
kerror("cull applied to unready cache"); + pr_err("cull applied to unready cache\n"); return -EIO; } if (test_bit(CACHEFILES_DEAD, &cache->flags)) { - kerror("cull applied to dead cache"); + pr_err("cull applied to dead cache\n"); return -EIO; } - /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); - if (!S_ISDIR(path.dentry->d_inode->i_mode)) + if (!d_can_lookup(path.dentry)) goto notdir; cachefiles_begin_secure(cache, &saved_cred); @@ -588,16 +665,16 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) notdir: path_put(&path); - kerror("cull command requires dirfd to be a directory"); + pr_err("cull command requires dirfd to be a directory\n"); return -ENOTDIR; inval: - kerror("cull command requires dirfd and filename"); + pr_err("cull command requires dirfd and filename\n"); return -EINVAL; } /* - * set debugging mode + * Set debugging mode * - command: "debug <mask>" */ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) @@ -615,12 +692,12 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) return 0; inval: - kerror("debug command requires mask"); + pr_err("debug command requires mask\n"); return -EINVAL; } /* - * find out whether an object in the current working directory is in use or not + * Find out whether an object in the current working directory is in use or not * - command: "inuse <name>" */ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) @@ -635,19 +712,18 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) goto inval; if (!test_bit(CACHEFILES_READY, &cache->flags)) { - kerror("inuse applied to unready cache"); + pr_err("inuse applied to unready cache\n"); return -EIO; } if (test_bit(CACHEFILES_DEAD, &cache->flags)) { - kerror("inuse applied to dead cache"); + pr_err("inuse applied to dead cache\n"); return -EIO; } - /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); - if (!S_ISDIR(path.dentry->d_inode->i_mode)) + if (!d_can_lookup(path.dentry)) goto notdir; cachefiles_begin_secure(cache, &saved_cred); @@ -660,93 +736,92 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) notdir: path_put(&path); - kerror("inuse command requires dirfd to be a directory"); + pr_err("inuse command requires dirfd to be a directory\n"); return -ENOTDIR; inval: - kerror("inuse command requires dirfd and filename"); + pr_err("inuse command requires dirfd and filename\n"); return -EINVAL; } /* - * see if we have space for a number of pages and/or a number of files in the - * cache + * Bind a directory as a cache */ -int cachefiles_has_space(struct cachefiles_cache *cache, - unsigned fnr, unsigned bnr) +static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) { - struct kstatfs stats; - struct path path = { - .mnt = cache->mnt, - .dentry = cache->mnt->mnt_root, - }; - int ret; + _enter("{%u,%u,%u,%u,%u,%u},%s", + cache->frun_percent, + cache->fcull_percent, + cache->fstop_percent, + cache->brun_percent, + cache->bcull_percent, + cache->bstop_percent, + args); + + if (cache->fstop_percent >= cache->fcull_percent || + cache->fcull_percent >= cache->frun_percent || + cache->frun_percent >= 100) + return -ERANGE; + + if (cache->bstop_percent >= cache->bcull_percent || + cache->bcull_percent >= cache->brun_percent || + cache->brun_percent >= 100) + return -ERANGE; + + if (!cache->rootdirname) { + pr_err("No cache directory specified\n"); + return -EINVAL; + 
} + + /* Don't permit already bound caches to be re-bound */ + if (test_bit(CACHEFILES_READY, &cache->flags)) { + pr_err("Cache already bound\n"); + return -EBUSY; + } - //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", - // (unsigned long long) cache->frun, - // (unsigned long long) cache->fcull, - // (unsigned long long) cache->fstop, - // (unsigned long long) cache->brun, - // (unsigned long long) cache->bcull, - // (unsigned long long) cache->bstop, - // fnr, bnr); - - /* find out how many pages of blockdev are available */ - memset(&stats, 0, sizeof(stats)); - - ret = vfs_statfs(&path, &stats); - if (ret < 0) { - if (ret == -EIO) - cachefiles_io_error(cache, "statfs failed"); - _leave(" = %d", ret); - return ret; + if (IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND)) { + if (!strcmp(args, "ondemand")) { + set_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags); + } else if (*args) { + pr_err("Invalid argument to the 'bind' command\n"); + return -EINVAL; + } + } else if (*args) { + pr_err("'bind' command doesn't take an argument\n"); + return -EINVAL; } - stats.f_bavail >>= cache->bshift; + /* Make sure we have copies of the tag string */ + if (!cache->tag) { + /* + * The tag string is released by the fops->release() + * function, so we don't release it on error here + */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; + } - //_debug("avail %llu,%llu", - // (unsigned long long) stats.f_ffree, - // (unsigned long long) stats.f_bavail); + return cachefiles_add_cache(cache); +} - /* see if there is sufficient space */ - if (stats.f_ffree > fnr) - stats.f_ffree -= fnr; - else - stats.f_ffree = 0; +/* + * Unbind a cache. + */ +static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) +{ + _enter(""); - if (stats.f_bavail > bnr) - stats.f_bavail -= bnr; - else - stats.f_bavail = 0; - - ret = -ENOBUFS; - if (stats.f_ffree < cache->fstop || - stats.f_bavail < cache->bstop) - goto begin_cull; - - ret = 0; - if (stats.f_ffree < cache->fcull || - stats.f_bavail < cache->bcull) - goto begin_cull; - - if (test_bit(CACHEFILES_CULLING, &cache->flags) && - stats.f_ffree >= cache->frun && - stats.f_bavail >= cache->brun && - test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) - ) { - _debug("cease culling"); - cachefiles_state_changed(cache); - } + if (test_bit(CACHEFILES_READY, &cache->flags)) + cachefiles_withdraw_cache(cache); - //_leave(" = 0"); - return 0; + cachefiles_put_directory(cache->graveyard); + cachefiles_put_directory(cache->store); + mntput(cache->mnt); + put_cred(cache->cache_cred); -begin_cull: - if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { - _debug("### CULL CACHE ###"); - cachefiles_state_changed(cache); - } + kfree(cache->rootdirname); + kfree(cache->tag); - _leave(" = %d", ret); - return ret; + _leave(""); } diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c new file mode 100644 index 000000000000..e341ade47dd8 --- /dev/null +++ b/fs/cachefiles/error_inject.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Error injection handling. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/sysctl.h> +#include "internal.h" + +unsigned int cachefiles_error_injection_state; + +static struct ctl_table_header *cachefiles_sysctl; +static const struct ctl_table cachefiles_sysctls[] = { + { + .procname = "error_injection", + .data = &cachefiles_error_injection_state, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +int __init cachefiles_register_error_injection(void) +{ + cachefiles_sysctl = register_sysctl("cachefiles", cachefiles_sysctls); + if (!cachefiles_sysctl) + return -ENOMEM; + return 0; + +} + +void cachefiles_unregister_error_injection(void) +{ + unregister_sysctl_table(cachefiles_sysctl); +} diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index d4c1206af9fc..a08250d244ea 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -1,525 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache interface to CacheFiles * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include <linux/slab.h> #include <linux/mount.h> +#include <linux/xattr.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/falloc.h> +#include <trace/events/fscache.h> #include "internal.h" -struct cachefiles_lookup_data { - struct cachefiles_xattr *auxdata; /* auxiliary data */ - char *key; /* key path */ -}; - -static int cachefiles_attr_changed(struct fscache_object *_object); +static atomic_t cachefiles_object_debug_id; /* - * allocate an object record for a cookie lookup and prepare the lookup data + * Allocate a cache object record. 
*/ -static struct fscache_object *cachefiles_alloc_object( - struct fscache_cache *_cache, - struct fscache_cookie *cookie) +static +struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie) { - struct cachefiles_lookup_data *lookup_data; + struct fscache_volume *vcookie = cookie->volume; + struct cachefiles_volume *volume = vcookie->cache_priv; struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct cachefiles_xattr *auxdata; - unsigned keylen, auxlen; - void *buffer; - char *key; - - cache = container_of(_cache, struct cachefiles_cache, cache); - - _enter("{%s},%p,", cache->cache.identifier, cookie); - lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp); - if (!lookup_data) - goto nomem_lookup_data; + _enter("{%s},%x,", vcookie->key, cookie->debug_id); - /* create a new object record and a temporary leaf image */ - object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp); + object = kmem_cache_zalloc(cachefiles_object_jar, GFP_KERNEL); if (!object) - goto nomem_object; + return NULL; - ASSERTCMP(object->backer, ==, NULL); - - BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - atomic_set(&object->usage, 1); - - fscache_object_init(&object->fscache, cookie, &cache->cache); + if (cachefiles_ondemand_init_obj_info(object, volume)) { + kmem_cache_free(cachefiles_object_jar, object); + return NULL; + } - object->type = cookie->def->type; + refcount_set(&object->ref, 1); - /* get hold of the raw key - * - stick the length on the front and leave space on the back for the - * encoder - */ - buffer = kmalloc((2 + 512) + 3, cachefiles_gfp); - if (!buffer) - goto nomem_buffer; - - keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512); - ASSERTCMP(keylen, <, 512); - - *(uint16_t *)buffer = keylen; - ((char *)buffer)[keylen + 2] = 0; - ((char *)buffer)[keylen + 3] = 0; - ((char *)buffer)[keylen + 4] = 0; - - /* turn the raw key into something that can work with as a filename */ - key = cachefiles_cook_key(buffer, keylen + 2, object->type); - if (!key) - goto nomem_key; - - /* get hold of the auxiliary data and prepend the object type */ - auxdata = buffer; - auxlen = 0; - if (cookie->def->get_aux) { - auxlen = cookie->def->get_aux(cookie->netfs_data, - auxdata->data, 511); - ASSERTCMP(auxlen, <, 511); - } + spin_lock_init(&object->lock); + INIT_LIST_HEAD(&object->cache_link); + object->volume = volume; + object->debug_id = atomic_inc_return(&cachefiles_object_debug_id); + object->cookie = fscache_get_cookie(cookie, fscache_cookie_get_attach_object); - auxdata->len = auxlen + 1; - auxdata->type = cookie->def->type; - - lookup_data->auxdata = auxdata; - lookup_data->key = key; - object->lookup_data = lookup_data; - - _leave(" = %p [%p]", &object->fscache, lookup_data); - return &object->fscache; - -nomem_key: - kfree(buffer); -nomem_buffer: - BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - kmem_cache_free(cachefiles_object_jar, object); - fscache_object_destroyed(&cache->cache); -nomem_object: - kfree(lookup_data); -nomem_lookup_data: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + fscache_count_object(vcookie->cache); + trace_cachefiles_ref(object->debug_id, cookie->debug_id, 1, + cachefiles_obj_new); + return object; } /* - * attempt to look up the nominated node in this cache - * - return -ETIMEDOUT to be scheduled again + * Note that an object has been seen. 
*/ -static int cachefiles_lookup_object(struct fscache_object *_object) +void cachefiles_see_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_lookup_data *lookup_data; - struct cachefiles_object *parent, *object; - struct cachefiles_cache *cache; - const struct cred *saved_cred; - int ret; - - _enter("{OBJ%x}", _object->debug_id); - - cache = container_of(_object->cache, struct cachefiles_cache, cache); - parent = container_of(_object->parent, - struct cachefiles_object, fscache); - object = container_of(_object, struct cachefiles_object, fscache); - lookup_data = object->lookup_data; - - ASSERTCMP(lookup_data, !=, NULL); - - /* look up the key, creating any missing bits */ - cachefiles_begin_secure(cache, &saved_cred); - ret = cachefiles_walk_to_object(parent, object, - lookup_data->key, - lookup_data->auxdata); - cachefiles_end_secure(cache, saved_cred); - - /* polish off by setting the attributes of non-index files */ - if (ret == 0 && - object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) - cachefiles_attr_changed(&object->fscache); - - if (ret < 0 && ret != -ETIMEDOUT) { - if (ret != -ENOBUFS) - printk(KERN_WARNING - "CacheFiles: Lookup failed error %d\n", ret); - fscache_object_lookup_error(&object->fscache); - } - - _leave(" [%d]", ret); - return ret; + trace_cachefiles_ref(object->debug_id, object->cookie->debug_id, + refcount_read(&object->ref), why); } /* - * indication of lookup completion + * Increment the usage count on an object; */ -static void cachefiles_lookup_complete(struct fscache_object *_object) +struct cachefiles_object *cachefiles_grab_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_object *object; - - object = container_of(_object, struct cachefiles_object, fscache); - - _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data); + int r; - if (object->lookup_data) { - kfree(object->lookup_data->key); - kfree(object->lookup_data->auxdata); - kfree(object->lookup_data); - object->lookup_data = NULL; - } + __refcount_inc(&object->ref, &r); + trace_cachefiles_ref(object->debug_id, object->cookie->debug_id, r, why); + return object; } /* - * increment the usage count on an inode object (may fail if unmounting) + * dispose of a reference to an object */ -static -struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) +void cachefiles_put_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_object *object = - container_of(_object, struct cachefiles_object, fscache); + unsigned int object_debug_id = object->debug_id; + unsigned int cookie_debug_id = object->cookie->debug_id; + struct fscache_cache *cache; + bool done; + int r; + + done = __refcount_dec_and_test(&object->ref, &r); + trace_cachefiles_ref(object_debug_id, cookie_debug_id, r, why); + if (done) { + _debug("- kill object OBJ%x", object_debug_id); - _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); + ASSERTCMP(object->file, ==, NULL); -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif + kfree(object->d_name); + cachefiles_ondemand_deinit_obj_info(object); + cache = object->volume->cache->cache; + fscache_put_cookie(object->cookie, fscache_cookie_put_object); + object->cookie = NULL; + kmem_cache_free(cachefiles_object_jar, object); + fscache_uncount_object(cache); + } - atomic_inc(&object->usage); - return &object->fscache; + _leave(""); } /* - * 
update the auxiliary data for an object object on disk + * Adjust the size of a cache file if necessary to match the DIO size. We keep + * the EOF marker a multiple of DIO blocks so that we don't fall back to doing + * non-DIO for a partial block straddling the EOF, but we also have to be + * careful of someone expanding the file and accidentally accreting the + * padding. */ -static void cachefiles_update_object(struct fscache_object *_object) +static int cachefiles_adjust_size(struct cachefiles_object *object) { - struct cachefiles_object *object; - struct cachefiles_xattr *auxdata; - struct cachefiles_cache *cache; - struct fscache_cookie *cookie; - const struct cred *saved_cred; - unsigned auxlen; + struct iattr newattrs; + struct file *file = object->file; + uint64_t ni_size; + loff_t oi_size; + int ret; - _enter("{OBJ%x}", _object->debug_id); + ni_size = object->cookie->object_size; + ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); - object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, struct cachefiles_cache, - cache); + _enter("{OBJ%x},[%llu]", + object->debug_id, (unsigned long long) ni_size); - if (!fscache_use_cookie(_object)) { - _leave(" [relinq]"); - return; - } + if (!file) + return -ENOBUFS; - cookie = object->fscache.cookie; + oi_size = i_size_read(file_inode(file)); + if (oi_size == ni_size) + return 0; - if (!cookie->def->get_aux) { - fscache_unuse_cookie(_object); - _leave(" [no aux]"); - return; - } + inode_lock(file_inode(file)); - auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); - if (!auxdata) { - fscache_unuse_cookie(_object); - _leave(" [nomem]"); - return; + /* if there's an extension to a partial page at the end of the backing + * file, we need to discard the partial page so that we pick up new + * data after it */ + if (oi_size & ~PAGE_MASK && ni_size > oi_size) { + _debug("discard tail %llx", oi_size); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = oi_size & PAGE_MASK; + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = notify_change(&nop_mnt_idmap, file->f_path.dentry, + &newattrs, NULL); + if (ret < 0) + goto truncate_failed; } - auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); - fscache_unuse_cookie(_object); - ASSERTCMP(auxlen, <, 511); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = ni_size; + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = notify_change(&nop_mnt_idmap, file->f_path.dentry, + &newattrs, NULL); - auxdata->len = auxlen + 1; - auxdata->type = cookie->def->type; +truncate_failed: + inode_unlock(file_inode(file)); - cachefiles_begin_secure(cache, &saved_cred); - cachefiles_update_object_xattr(object, auxdata); - cachefiles_end_secure(cache, saved_cred); - kfree(auxdata); - _leave(""); + if (ret < 0) + trace_cachefiles_io_error(NULL, file_inode(file), ret, + cachefiles_trace_notify_change_error); + if (ret == -EIO) { + cachefiles_io_error_obj(object, "Size set failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; } /* - * discard the resources pinned by an object and effect retirement if - * requested + * Attempt to look up the nominated node in this cache */ -static void cachefiles_drop_object(struct fscache_object *_object) +static bool cachefiles_lookup_cookie(struct fscache_cookie *cookie) { struct cachefiles_object *object; - struct cachefiles_cache *cache; + struct cachefiles_cache *cache = cookie->volume->cache->cache_priv; const struct cred *saved_cred; + bool success; - ASSERT(_object); + 
object = cachefiles_alloc_object(cookie); + if (!object) + goto fail; - object = container_of(_object, struct cachefiles_object, fscache); + _enter("{OBJ%x}", object->debug_id); - _enter("{OBJ%x,%d}", - object->fscache.debug_id, atomic_read(&object->usage)); + if (!cachefiles_cook_key(object)) + goto fail_put; - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); + cookie->cache_priv = object; -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif + cachefiles_begin_secure(cache, &saved_cred); - /* delete retired objects */ - if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) && - _object != cache->cache.fsdef - ) { - _debug("- retire object OBJ%x", object->fscache.debug_id); - cachefiles_begin_secure(cache, &saved_cred); - cachefiles_delete_object(cache, object); - cachefiles_end_secure(cache, saved_cred); - } + success = cachefiles_look_up_object(object); + if (!success) + goto fail_withdraw; - /* close the filesystem stuff attached to the object */ - if (object->backer != object->dentry) - dput(object->backer); - object->backer = NULL; - - /* note that the object is now inactive */ - if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { - write_lock(&cache->active_lock); - if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE, - &object->flags)) - BUG(); - rb_erase(&object->active_node, &cache->active_nodes); - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); - write_unlock(&cache->active_lock); - } + cachefiles_see_object(object, cachefiles_obj_see_lookup_cookie); - dput(object->dentry); - object->dentry = NULL; + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + cachefiles_adjust_size(object); - _leave(""); + cachefiles_end_secure(cache, saved_cred); + _leave(" = t"); + return true; + +fail_withdraw: + cachefiles_end_secure(cache, saved_cred); + cachefiles_see_object(object, cachefiles_obj_see_lookup_failed); + fscache_caching_failed(cookie); + _debug("failed c=%08x o=%08x", cookie->debug_id, object->debug_id); + /* The caller holds an access count on the cookie, so we need them to + * drop it before we can withdraw the object. + */ + return false; + +fail_put: + cachefiles_put_object(object, cachefiles_obj_put_alloc_fail); +fail: + return false; } /* - * dispose of a reference to an object + * Shorten the backing object to discard any dirty data and free up + * any unused granules. 
*/ -static void cachefiles_put_object(struct fscache_object *_object) +static bool cachefiles_shorten_object(struct cachefiles_object *object, + struct file *file, loff_t new_size) { - struct cachefiles_object *object; - struct fscache_cache *cache; - - ASSERT(_object); - - object = container_of(_object, struct cachefiles_object, fscache); - - _enter("{OBJ%x,%d}", - object->fscache.debug_id, atomic_read(&object->usage)); - -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif - - ASSERTIFCMP(object->fscache.parent, - object->fscache.parent->n_children, >, 0); - - if (atomic_dec_and_test(&object->usage)) { - _debug("- kill object OBJ%x", object->fscache.debug_id); + struct cachefiles_cache *cache = object->volume->cache; + struct inode *inode = file_inode(file); + loff_t i_size, dio_size; + int ret; - ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - ASSERTCMP(object->fscache.parent, ==, NULL); - ASSERTCMP(object->backer, ==, NULL); - ASSERTCMP(object->dentry, ==, NULL); - ASSERTCMP(object->fscache.n_ops, ==, 0); - ASSERTCMP(object->fscache.n_children, ==, 0); + dio_size = round_up(new_size, CACHEFILES_DIO_BLOCK_SIZE); + i_size = i_size_read(inode); + + trace_cachefiles_trunc(object, inode, i_size, dio_size, + cachefiles_trunc_shrink); + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = vfs_truncate(&file->f_path, dio_size); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_trunc_error); + cachefiles_io_error_obj(object, "Trunc-to-size failed %d", ret); + cachefiles_remove_object_xattr(cache, object, file->f_path.dentry); + return false; + } - if (object->lookup_data) { - kfree(object->lookup_data->key); - kfree(object->lookup_data->auxdata); - kfree(object->lookup_data); - object->lookup_data = NULL; + if (new_size < dio_size) { + trace_cachefiles_trunc(object, inode, dio_size, new_size, + cachefiles_trunc_dio_adjust); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE, + new_size, dio_size - new_size); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_fallocate_error); + cachefiles_io_error_obj(object, "Trunc-to-dio-size failed %d", ret); + cachefiles_remove_object_xattr(cache, object, file->f_path.dentry); + return false; } - - cache = object->fscache.cache; - fscache_object_destroy(&object->fscache); - kmem_cache_free(cachefiles_object_jar, object); - fscache_object_destroyed(cache); } - _leave(""); + return true; } /* - * sync a cache + * Resize the backing object. 
 */
-static void cachefiles_sync_cache(struct fscache_cache *_cache)
+static void cachefiles_resize_cookie(struct netfs_cache_resources *cres,
+				     loff_t new_size)
 {
-	struct cachefiles_cache *cache;
+	struct cachefiles_object *object = cachefiles_cres_object(cres);
+	struct cachefiles_cache *cache = object->volume->cache;
+	struct fscache_cookie *cookie = object->cookie;
 	const struct cred *saved_cred;
-	int ret;
-
-	_enter("%p", _cache);
+	struct file *file = cachefiles_cres_file(cres);
+	loff_t old_size = cookie->object_size;
 
-	cache = container_of(_cache, struct cachefiles_cache, cache);
+	_enter("%llu->%llu", old_size, new_size);
 
-	/* make sure all pages pinned by operations on behalf of the netfs are
-	 * written to disc */
-	cachefiles_begin_secure(cache, &saved_cred);
-	down_read(&cache->mnt->mnt_sb->s_umount);
-	ret = sync_filesystem(cache->mnt->mnt_sb);
-	up_read(&cache->mnt->mnt_sb->s_umount);
-	cachefiles_end_secure(cache, saved_cred);
+	if (new_size < old_size) {
+		cachefiles_begin_secure(cache, &saved_cred);
+		cachefiles_shorten_object(object, file, new_size);
+		cachefiles_end_secure(cache, saved_cred);
+		object->cookie->object_size = new_size;
+		return;
+	}
 
-	if (ret == -EIO)
-		cachefiles_io_error(cache,
-				    "Attempt to sync backing fs superblock"
-				    " returned error %d",
-				    ret);
+	/* The file is being expanded.  We don't need to do anything in
+	 * particular.  cookie->initial_size doesn't change and so the point
+	 * before which we have to download doesn't change.
+	 */
+	cookie->object_size = new_size;
 }
 
 /*
- * notification the attributes on an object have changed
- * - called with reads/writes excluded by FS-Cache
+ * Commit changes to the object as we drop it.
  */
-static int cachefiles_attr_changed(struct fscache_object *_object)
+static void cachefiles_commit_object(struct cachefiles_object *object,
+				     struct cachefiles_cache *cache)
 {
-	struct cachefiles_object *object;
-	struct cachefiles_cache *cache;
-	const struct cred *saved_cred;
-	struct iattr newattrs;
-	uint64_t ni_size;
-	loff_t oi_size;
-	int ret;
-
-	_object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
+	bool update = false;
 
-	_enter("{OBJ%x},[%llu]",
-	       _object->debug_id, (unsigned long long) ni_size);
+	if (test_and_clear_bit(FSCACHE_COOKIE_LOCAL_WRITE, &object->cookie->flags))
+		update = true;
+	if (test_and_clear_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags))
+		update = true;
+	if (update)
+		cachefiles_set_object_xattr(object);
 
-	object = container_of(_object, struct cachefiles_object, fscache);
-	cache = container_of(object->fscache.cache,
-			     struct cachefiles_cache, cache);
+	if (test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags))
+		cachefiles_commit_tmpfile(cache, object);
+}
 
-	if (ni_size == object->i_size)
-		return 0;
+/*
+ * Finalise an object and close the VFS structs that we have.
+ */ +static void cachefiles_clean_up_object(struct cachefiles_object *object, + struct cachefiles_cache *cache) +{ + struct file *file; + + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) { + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + cachefiles_see_object(object, cachefiles_obj_see_clean_delete); + _debug("- inval object OBJ%x", object->debug_id); + cachefiles_delete_object(object, FSCACHE_OBJECT_WAS_RETIRED); + } else { + cachefiles_see_object(object, cachefiles_obj_see_clean_drop_tmp); + _debug("- inval object OBJ%x tmpfile", object->debug_id); + } + } else { + cachefiles_see_object(object, cachefiles_obj_see_clean_commit); + cachefiles_commit_object(object, cache); + } - if (!object->backer) - return -ENOBUFS; + cachefiles_unmark_inode_in_use(object, object->file); - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + spin_lock(&object->lock); + file = object->file; + object->file = NULL; + spin_unlock(&object->lock); - fscache_set_store_limit(&object->fscache, ni_size); + if (file) + fput(file); +} - oi_size = i_size_read(object->backer->d_inode); - if (oi_size == ni_size) - return 0; +/* + * Withdraw caching for a cookie. + */ +static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie) +{ + struct cachefiles_object *object = cookie->cache_priv; + struct cachefiles_cache *cache = object->volume->cache; + const struct cred *saved_cred; - cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&object->backer->d_inode->i_mutex); + _enter("o=%x", object->debug_id); + cachefiles_see_object(object, cachefiles_obj_see_withdraw_cookie); - /* if there's an extension to a partial page at the end of the backing - * file, we need to discard the partial page so that we pick up new - * data after it */ - if (oi_size & ~PAGE_MASK && ni_size > oi_size) { - _debug("discard tail %llx", oi_size); - newattrs.ia_valid = ATTR_SIZE; - newattrs.ia_size = oi_size & PAGE_MASK; - ret = notify_change(object->backer, &newattrs); - if (ret < 0) - goto truncate_failed; + if (!list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + cachefiles_see_object(object, cachefiles_obj_see_withdrawal); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); } - newattrs.ia_valid = ATTR_SIZE; - newattrs.ia_size = ni_size; - ret = notify_change(object->backer, &newattrs); + cachefiles_ondemand_clean_object(object); -truncate_failed: - mutex_unlock(&object->backer->d_inode->i_mutex); - cachefiles_end_secure(cache, saved_cred); - - if (ret == -EIO) { - fscache_set_store_limit(&object->fscache, 0); - cachefiles_io_error_obj(object, "Size set failed"); - ret = -ENOBUFS; + if (object->file) { + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_clean_up_object(object, cache); + cachefiles_end_secure(cache, saved_cred); } - _leave(" = %d", ret); - return ret; + cookie->cache_priv = NULL; + cachefiles_put_object(object, cachefiles_obj_put_detach); } /* - * Invalidate an object + * Invalidate the storage associated with a cookie. 
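
	[Editor's note — illustrative aside, not part of the patch.
	Invalidation below swaps a freshly created, unlinked tmpfile in
	under object->lock, so in-flight I/O immediately sees an empty
	backing file; the old file is only buried/released afterwards.
	The userspace analogue of the tmpfile trick, with an example path
	assumed to be on an O_TMPFILE-capable filesystem:

		#include <fcntl.h>

		/* An anonymous, unlinked file: it starts empty and has
		 * no name, so nothing else can observe or reuse it. */
		int fd = open("/var/cache/fscache", O_TMPFILE | O_RDWR, 0600);

	cachefiles_create_tmpfile() (defined in namei.c) performs the
	kernel-internal equivalent within the cache directory.]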
*/ -static void cachefiles_invalidate_object(struct fscache_operation *op) +static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie) { - struct cachefiles_object *object; - struct cachefiles_cache *cache; - const struct cred *saved_cred; - struct path path; - uint64_t ni_size; - int ret; - - object = container_of(op->object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - op->object->cookie->def->get_attr(op->object->cookie->netfs_data, - &ni_size); - - _enter("{OBJ%x},[%llu]", - op->object->debug_id, (unsigned long long)ni_size); + struct cachefiles_object *object = cookie->cache_priv; + struct file *new_file, *old_file; + bool old_tmpfile; - if (object->backer) { - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + _enter("o=%x,[%llu]", object->debug_id, object->cookie->object_size); - fscache_set_store_limit(&object->fscache, ni_size); + old_tmpfile = test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); - path.dentry = object->backer; - path.mnt = cache->mnt; - - cachefiles_begin_secure(cache, &saved_cred); - ret = vfs_truncate(&path, 0); - if (ret == 0) - ret = vfs_truncate(&path, ni_size); - cachefiles_end_secure(cache, saved_cred); + if (!object->file) { + fscache_resume_after_invalidation(cookie); + _leave(" = t [light]"); + return true; + } - if (ret != 0) { - fscache_set_store_limit(&object->fscache, 0); - if (ret == -EIO) - cachefiles_io_error_obj(object, - "Invalidate failed"); + new_file = cachefiles_create_tmpfile(object); + if (IS_ERR(new_file)) + goto failed; + + /* Substitute the VFS target */ + _debug("sub"); + spin_lock(&object->lock); + + old_file = object->file; + object->file = new_file; + object->content_info = CACHEFILES_CONTENT_NO_DATA; + set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); + + spin_unlock(&object->lock); + _debug("subbed"); + + /* Allow I/O to take place again */ + fscache_resume_after_invalidation(cookie); + + if (old_file) { + if (!old_tmpfile) { + struct cachefiles_volume *volume = object->volume; + struct dentry *fan = volume->fanout[(u8)cookie->key_hash]; + struct dentry *obj; + + obj = start_removing_dentry(fan, old_file->f_path.dentry); + if (!IS_ERR(obj)) + cachefiles_bury_object(volume->cache, object, + fan, obj, + FSCACHE_OBJECT_INVALIDATED); } + fput(old_file); } - fscache_op_complete(op, true); - _leave(""); -} + _leave(" = t"); + return true; -/* - * dissociate a cache from all the pages it was backing - */ -static void cachefiles_dissociate_pages(struct fscache_cache *cache) -{ - _enter(""); +failed: + _leave(" = f"); + return false; } const struct fscache_cache_ops cachefiles_cache_ops = { .name = "cachefiles", - .alloc_object = cachefiles_alloc_object, - .lookup_object = cachefiles_lookup_object, - .lookup_complete = cachefiles_lookup_complete, - .grab_object = cachefiles_grab_object, - .update_object = cachefiles_update_object, - .invalidate_object = cachefiles_invalidate_object, - .drop_object = cachefiles_drop_object, - .put_object = cachefiles_put_object, - .sync_cache = cachefiles_sync_cache, - .attr_changed = cachefiles_attr_changed, - .read_or_alloc_page = cachefiles_read_or_alloc_page, - .read_or_alloc_pages = cachefiles_read_or_alloc_pages, - .allocate_page = cachefiles_allocate_page, - .allocate_pages = cachefiles_allocate_pages, - .write_page = cachefiles_write_page, - .uncache_page = cachefiles_uncache_page, - .dissociate_pages = cachefiles_dissociate_pages, + 
.acquire_volume = cachefiles_acquire_volume, + .free_volume = cachefiles_free_volume, + .lookup_cookie = cachefiles_lookup_cookie, + .withdraw_cookie = cachefiles_withdraw_cookie, + .invalidate_cookie = cachefiles_invalidate_cookie, + .begin_operation = cachefiles_begin_operation, + .resize_cookie = cachefiles_resize_cookie, + .prepare_to_write = cachefiles_prepare_to_write, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 49382519907a..b62cd3e9a18e 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -1,65 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* General netfs cache on cache files internal defs * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "CacheFiles: " fmt + + #include <linux/fscache-cache.h> -#include <linux/timer.h> -#include <linux/wait.h> -#include <linux/workqueue.h> +#include <linux/cred.h> #include <linux/security.h> +#include <linux/xarray.h> +#include <linux/cachefiles.h> + +#define CACHEFILES_DIO_BLOCK_SIZE 4096 struct cachefiles_cache; struct cachefiles_object; -extern unsigned cachefiles_debug; -#define CACHEFILES_DEBUG_KENTER 1 -#define CACHEFILES_DEBUG_KLEAVE 2 -#define CACHEFILES_DEBUG_KDEBUG 4 +enum cachefiles_content { + /* These values are saved on disk */ + CACHEFILES_CONTENT_NO_DATA = 0, /* No content stored */ + CACHEFILES_CONTENT_SINGLE = 1, /* Content is monolithic, all is present */ + CACHEFILES_CONTENT_ALL = 2, /* Content is all present, no map */ + CACHEFILES_CONTENT_BACKFS_MAP = 3, /* Content is piecemeal, mapped through backing fs */ + CACHEFILES_CONTENT_DIRTY = 4, /* Content is dirty (only seen on disk) */ + nr__cachefiles_content +}; -#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) +/* + * Cached volume representation. + */ +struct cachefiles_volume { + struct cachefiles_cache *cache; + struct list_head cache_link; /* Link in cache->volumes */ + struct fscache_volume *vcookie; /* The netfs's representation */ + struct dentry *dentry; /* The volume dentry */ + struct dentry *fanout[256]; /* Fanout subdirs */ +}; + +enum cachefiles_object_state { + CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */ + CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */ + CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */ + CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, /* Object is being dropped. */ +}; + +struct cachefiles_ondemand_info { + struct work_struct ondemand_work; + int ondemand_id; + enum cachefiles_object_state state; + struct cachefiles_object *object; + spinlock_t lock; +}; /* - * node records + * Backing file state. 
*/ struct cachefiles_object { - struct fscache_object fscache; /* fscache handle */ - struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ - struct dentry *dentry; /* the file/dir representing this object */ - struct dentry *backer; /* backing file */ - loff_t i_size; /* object size */ + struct fscache_cookie *cookie; /* Netfs data storage object cookie */ + struct cachefiles_volume *volume; /* Cache volume that holds this object */ + struct list_head cache_link; /* Link in cache->*_list */ + struct file *file; /* The file representing this object */ + char *d_name; /* Backing file name */ + int debug_id; + spinlock_t lock; + refcount_t ref; + enum cachefiles_content content_info:8; /* Info about content presence */ unsigned long flags; -#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ -#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ - atomic_t usage; /* object usage count */ - uint8_t type; /* object type */ - uint8_t new; /* T if object new */ - spinlock_t work_lock; - struct rb_node active_node; /* link in active tree (dentry is key) */ +#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ +#ifdef CONFIG_CACHEFILES_ONDEMAND + struct cachefiles_ondemand_info *ondemand; +#endif }; -extern struct kmem_cache *cachefiles_object_jar; +#define CACHEFILES_ONDEMAND_ID_CLOSED -1 /* * Cache files cache definition */ struct cachefiles_cache { - struct fscache_cache cache; /* FS-Cache record */ + struct fscache_cache *cache; /* Cache cookie */ struct vfsmount *mnt; /* mountpoint holding the cache */ + struct dentry *store; /* Directory into which live objects go */ struct dentry *graveyard; /* directory into which dead objects go */ struct file *cachefilesd; /* manager daemon handle */ + struct list_head volumes; /* List of volume objects */ + struct list_head object_list; /* List of active objects */ + spinlock_t object_list_lock; /* Lock for volumes and object_list */ const struct cred *cache_cred; /* security override for accessing cache */ struct mutex daemon_mutex; /* command serialisation mutex */ wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ - struct rb_root active_nodes; /* active nodes (can't be culled) */ - rwlock_t active_lock; /* lock for active_nodes */ atomic_t gravecounter; /* graveyard uniquifier */ + atomic_t f_released; /* number of objects released lately */ + atomic_long_t b_released; /* number of blocks released lately */ + atomic_long_t b_writing; /* Number of blocks being written */ unsigned frun_percent; /* when to stop culling (% files) */ unsigned fcull_percent; /* when to start culling (% files) */ unsigned fstop_percent; /* when to stop allocating (% files) */ @@ -67,7 +107,7 @@ struct cachefiles_cache { unsigned bcull_percent; /* when to start culling (% blocks) */ unsigned bstop_percent; /* when to stop allocating (% blocks) */ unsigned bsize; /* cache's block size */ - unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + unsigned bshift; /* ilog2(bsize) */ uint64_t frun; /* when to stop culling */ uint64_t fcull; /* when to start culling */ uint64_t fstop; /* when to stop allocating */ @@ -79,41 +119,48 @@ struct cachefiles_cache { #define CACHEFILES_DEAD 1 /* T if cache dead */ #define CACHEFILES_CULLING 2 /* T if cull engaged */ #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ +#define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ - char *secctx; /* LSM security context */ char *tag; /* 
cache binding tag */ + refcount_t unbind_pincount;/* refcount to do daemon unbind */ + struct xarray reqs; /* xarray of pending on-demand requests */ + unsigned long req_id_next; + struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ + u32 ondemand_id_next; + u32 msg_id_next; + u32 secid; /* LSM security id */ + bool have_secid; /* whether "secid" was set */ }; -/* - * backing file read tracking - */ -struct cachefiles_one_read { - wait_queue_t monitor; /* link into monitored waitqueue */ - struct page *back_page; /* backing file page we're waiting for */ - struct page *netfs_page; /* netfs page we're going to fill */ - struct fscache_retrieval *op; /* retrieval op covering this */ - struct list_head op_link; /* link in op's todo list */ -}; +static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) +{ + return IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) && + test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags); +} -/* - * backing file write tracking - */ -struct cachefiles_one_write { - struct page *netfs_page; /* netfs page to copy */ - struct cachefiles_object *object; - struct list_head obj_link; /* link in object's lists */ - fscache_rw_complete_t end_io_func; - void *context; +struct cachefiles_req { + struct cachefiles_object *object; + struct completion done; + refcount_t ref; + int error; + struct cachefiles_msg msg; }; -/* - * auxiliary data xattr buffer - */ -struct cachefiles_xattr { - uint16_t len; - uint8_t type; - uint8_t data[]; -}; +#define CACHEFILES_REQ_NEW XA_MARK_1 + +#include <trace/events/cachefiles.h> + +static inline +struct file *cachefiles_cres_file(struct netfs_cache_resources *cres) +{ + return cres->cache_priv2; +} + +static inline +struct cachefiles_object *cachefiles_cres_object(struct netfs_cache_resources *cres) +{ + return fscache_cres_cookie(cres)->cache_priv; +} /* * note change of state for daemon @@ -125,87 +172,215 @@ static inline void cachefiles_state_changed(struct cachefiles_cache *cache) } /* - * bind.c + * cache.c */ -extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); -extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); +extern int cachefiles_add_cache(struct cachefiles_cache *cache); +extern void cachefiles_withdraw_cache(struct cachefiles_cache *cache); + +enum cachefiles_has_space_for { + cachefiles_has_space_check, + cachefiles_has_space_for_write, + cachefiles_has_space_for_create, +}; +extern int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr, + enum cachefiles_has_space_for reason); /* * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; +extern void cachefiles_flush_reqs(struct cachefiles_cache *cache); +extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache); +extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache); -extern int cachefiles_has_space(struct cachefiles_cache *cache, - unsigned fnr, unsigned bnr); +/* + * error_inject.c + */ +#ifdef CONFIG_CACHEFILES_ERROR_INJECTION +extern unsigned int cachefiles_error_injection_state; +extern int cachefiles_register_error_injection(void); +extern void cachefiles_unregister_error_injection(void); + +#else +#define cachefiles_error_injection_state 0 + +static inline int cachefiles_register_error_injection(void) +{ + return 0; +} + +static inline void cachefiles_unregister_error_injection(void) +{ +} +#endif + + +static inline int cachefiles_inject_read_error(void) +{ + return cachefiles_error_injection_state & 2 ? 
-EIO : 0; +} + +static inline int cachefiles_inject_write_error(void) +{ + return cachefiles_error_injection_state & 2 ? -EIO : + cachefiles_error_injection_state & 1 ? -ENOSPC : + 0; +} + +static inline int cachefiles_inject_remove_error(void) +{ + return cachefiles_error_injection_state & 2 ? -EIO : 0; +} /* * interface.c */ extern const struct fscache_cache_ops cachefiles_cache_ops; +extern void cachefiles_see_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); +extern struct cachefiles_object *cachefiles_grab_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); +extern void cachefiles_put_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); + +/* + * io.c + */ +extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres, + enum fscache_want_state want_state); +extern int __cachefiles_prepare_write(struct cachefiles_object *object, + struct file *file, + loff_t *_start, size_t *_len, size_t upper_len, + bool no_space_allocated_yet); +extern int __cachefiles_write(struct cachefiles_object *object, + struct file *file, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv); /* * key.c */ -extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); +extern bool cachefiles_cook_key(struct cachefiles_object *object); + +/* + * main.c + */ +extern struct kmem_cache *cachefiles_object_jar; /* * namei.c */ -extern int cachefiles_delete_object(struct cachefiles_cache *cache, - struct cachefiles_object *object); -extern int cachefiles_walk_to_object(struct cachefiles_object *parent, - struct cachefiles_object *object, - const char *key, - struct cachefiles_xattr *auxdata); +extern void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct file *file); +extern int cachefiles_bury_object(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, + struct dentry *rep, + enum fscache_why_object_killed why); +extern int cachefiles_delete_object(struct cachefiles_object *object, + enum fscache_why_object_killed why); +extern bool cachefiles_look_up_object(struct cachefiles_object *object); extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, struct dentry *dir, - const char *name); + const char *name, + bool *_is_new); +extern void cachefiles_put_directory(struct dentry *dir); extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, char *filename); extern int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename); +extern struct file *cachefiles_create_tmpfile(struct cachefiles_object *object); +extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, + struct cachefiles_object *object); /* - * proc.c + * ondemand.c */ -#ifdef CONFIG_CACHEFILES_HISTOGRAM -extern atomic_t cachefiles_lookup_histogram[HZ]; -extern atomic_t cachefiles_mkdir_histogram[HZ]; -extern atomic_t cachefiles_create_histogram[HZ]; +#ifdef CONFIG_CACHEFILES_ONDEMAND +extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen); + +extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache, + char *args); + +extern int cachefiles_ondemand_restore(struct cachefiles_cache *cache, + char *args); + +extern int cachefiles_ondemand_init_object(struct cachefiles_object *object); +extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); + +extern int 
cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len); + +extern int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj, + struct cachefiles_volume *volume); +extern void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj); + +#define CACHEFILES_OBJECT_STATE_FUNCS(_state, _STATE) \ +static inline bool \ +cachefiles_ondemand_object_is_##_state(const struct cachefiles_object *object) \ +{ \ + return object->ondemand->state == CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ +} \ + \ +static inline void \ +cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ +{ \ + object->ondemand->state = CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ +} -extern int __init cachefiles_proc_init(void); -extern void cachefiles_proc_cleanup(void); -static inline -void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) +CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); +CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); +CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING); +CACHEFILES_OBJECT_STATE_FUNCS(dropping, DROPPING); + +static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) { - unsigned long jif = jiffies - start_jif; - if (jif >= HZ) - jif = HZ - 1; - atomic_inc(&histogram[jif]); + return cachefiles_ondemand_object_is_reopening(req->object) && + req->msg.opcode == CACHEFILES_OP_READ; } #else -#define cachefiles_proc_init() (0) -#define cachefiles_proc_cleanup() do {} while (0) -#define cachefiles_hist(hist, start_jif) do {} while (0) -#endif +static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) +{ + return -EOPNOTSUPP; +} -/* - * rdwr.c - */ -extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, - struct page *, gfp_t); -extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, - struct list_head *, unsigned *, - gfp_t); -extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, - gfp_t); -extern int cachefiles_allocate_pages(struct fscache_retrieval *, - struct list_head *, unsigned *, gfp_t); -extern int cachefiles_write_page(struct fscache_storage *, struct page *); -extern void cachefiles_uncache_page(struct fscache_object *, struct page *); +static inline int cachefiles_ondemand_init_object(struct cachefiles_object *object) +{ + return 0; +} + +static inline void cachefiles_ondemand_clean_object(struct cachefiles_object *object) +{ +} + +static inline int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len) +{ + return -EOPNOTSUPP; +} + +static inline int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj, + struct cachefiles_volume *volume) +{ + return 0; +} +static inline void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj) +{ +} + +static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) +{ + return false; +} +#endif /* * security.c @@ -228,44 +403,55 @@ static inline void cachefiles_end_secure(struct cachefiles_cache *cache, } /* + * volume.c + */ +void cachefiles_acquire_volume(struct fscache_volume *volume); +void cachefiles_free_volume(struct fscache_volume *volume); +void cachefiles_withdraw_volume(struct cachefiles_volume *volume); + +/* * xattr.c */ -extern int cachefiles_check_object_type(struct cachefiles_object *object); -extern int cachefiles_set_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata); -extern int cachefiles_update_object_xattr(struct 
cachefiles_object *object, - struct cachefiles_xattr *auxdata); -extern int cachefiles_check_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata); +extern int cachefiles_set_object_xattr(struct cachefiles_object *object); +extern int cachefiles_check_auxdata(struct cachefiles_object *object, + struct file *file); extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct cachefiles_object *object, struct dentry *dentry); - +extern void cachefiles_prepare_to_write(struct fscache_cookie *cookie); +extern bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume); +extern int cachefiles_check_volume_xattr(struct cachefiles_volume *volume); /* - * error handling + * Error handling */ -#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__) - #define cachefiles_io_error(___cache, FMT, ...) \ do { \ - kerror("I/O Error: " FMT, ##__VA_ARGS__); \ - fscache_io_error(&(___cache)->cache); \ + pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ + fscache_io_error((___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ + if (cachefiles_in_ondemand_mode(___cache)) \ + cachefiles_flush_reqs(___cache); \ } while (0) #define cachefiles_io_error_obj(object, FMT, ...) \ do { \ struct cachefiles_cache *___cache; \ \ - ___cache = container_of((object)->fscache.cache, \ - struct cachefiles_cache, cache); \ - cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ + ___cache = (object)->volume->cache; \ + cachefiles_io_error(___cache, FMT " [o=%08x]", ##__VA_ARGS__, \ + (object)->debug_id); \ } while (0) /* - * debug tracing + * Debug tracing */ +extern unsigned cachefiles_debug; +#define CACHEFILES_DEBUG_KENTER 1 +#define CACHEFILES_DEBUG_KLEAVE 2 +#define CACHEFILES_DEBUG_KDEBUG 4 + #define dbgprintk(FMT, ...) \ printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) @@ -309,8 +495,8 @@ do { \ #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -318,9 +504,9 @@ do { \ #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ @@ -329,8 +515,8 @@ do { \ #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -338,9 +524,9 @@ do { \ #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c new file mode 100644 index 000000000000..3e0576d9db1d --- /dev/null +++ b/fs/cachefiles/io.c @@ -0,0 +1,762 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* kiocb-using read/write + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/bio.h> +#include <linux/falloc.h> +#include <linux/sched/mm.h> +#include <trace/events/fscache.h> +#include <trace/events/netfs.h> +#include "internal.h" + +struct cachefiles_kiocb { + struct kiocb iocb; + refcount_t ki_refcnt; + loff_t start; + union { + size_t skipped; + size_t len; + }; + struct cachefiles_object *object; + netfs_io_terminated_t term_func; + void *term_func_priv; + bool was_async; + unsigned int inval_counter; /* Copy of cookie->inval_counter */ + u64 b_writing; +}; + +static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) +{ + if (refcount_dec_and_test(&ki->ki_refcnt)) { + cachefiles_put_object(ki->object, cachefiles_obj_put_ioreq); + fput(ki->iocb.ki_filp); + kfree(ki); + } +} + +/* + * Handle completion of a read from the cache. + */ +static void cachefiles_read_complete(struct kiocb *iocb, long ret) +{ + struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct inode *inode = file_inode(ki->iocb.ki_filp); + + _enter("%ld", ret); + + if (ret < 0) + trace_cachefiles_io_error(ki->object, inode, ret, + cachefiles_trace_read_error); + + if (ki->term_func) { + if (ret >= 0) { + if (ki->object->cookie->inval_counter == ki->inval_counter) + ki->skipped += ret; + else + ret = -ESTALE; + } + + ki->term_func(ki->term_func_priv, ret); + } + + cachefiles_put_kiocb(ki); +} + +/* + * Initiate a read from the cache. + */ +static int cachefiles_read(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + enum netfs_read_from_hole read_hole, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + struct cachefiles_object *object; + struct cachefiles_kiocb *ki; + struct file *file; + unsigned int old_nofs; + ssize_t ret = -ENOBUFS; + size_t len = iov_iter_count(iter), skipped = 0; + + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + goto presubmission_error; + + fscache_count_read(); + object = cachefiles_cres_object(cres); + file = cachefiles_cres_file(cres); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start_pos, len, + i_size_read(file_inode(file))); + + /* If the caller asked us to seek for data before doing the read, then + * we should do that now. If we find a gap, we fill it with zeros. + */ + if (read_hole != NETFS_READ_HOLE_IGNORE) { + loff_t off = start_pos, off2; + + off2 = cachefiles_inject_read_error(); + if (off2 == 0) + off2 = vfs_llseek(file, off, SEEK_DATA); + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) { + skipped = 0; + ret = off2; + goto presubmission_error; + } + + if (off2 == -ENXIO || off2 >= start_pos + len) { + /* The region is beyond the EOF or there's no more data + * in the region, so clear the rest of the buffer and + * return success. 
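
	[Editor's note — worked example, not part of the patch.  With
	start_pos = 0, len = 16384 and the first extent of data at file
	offset 8192, SEEK_DATA returns off2 = 8192: the first 8192 bytes
	of the iterator are zero-filled (skipped = 8192) and the DIO read
	is issued at ki_pos = 8192 for the remainder.  If SEEK_DATA
	reports -ENXIO, or finds data only at or beyond start_pos + len,
	the whole buffer is zeroed and the read completes without touching
	the backing file — unless NETFS_READ_HOLE_FAIL was requested, in
	which case the operation fails with -ENODATA.]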
+ */ + ret = -ENODATA; + if (read_hole == NETFS_READ_HOLE_FAIL) + goto presubmission_error; + + iov_iter_zero(len, iter); + skipped = len; + ret = 0; + goto presubmission_error; + } + + skipped = off2 - off; + iov_iter_zero(skipped, iter); + } + + ret = -ENOMEM; + ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); + if (!ki) + goto presubmission_error; + + refcount_set(&ki->ki_refcnt, 2); + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos + skipped; + ki->iocb.ki_flags = IOCB_DIRECT; + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->skipped = skipped; + ki->object = object; + ki->inval_counter = cres->inval_counter; + ki->term_func = term_func; + ki->term_func_priv = term_func_priv; + ki->was_async = true; + + if (ki->term_func) + ki->iocb.ki_complete = cachefiles_read_complete; + + get_file(ki->iocb.ki_filp); + cachefiles_grab_object(object, cachefiles_obj_get_ioreq); + + trace_cachefiles_read(object, file_inode(file), ki->iocb.ki_pos, len - skipped); + old_nofs = memalloc_nofs_save(); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_iocb_iter_read(file, &ki->iocb, iter); + memalloc_nofs_restore(old_nofs); + switch (ret) { + case -EIOCBQUEUED: + goto in_progress; + + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + ki->was_async = false; + cachefiles_read_complete(&ki->iocb, ret); + if (ret > 0) + ret = 0; + break; + } + +in_progress: + cachefiles_put_kiocb(ki); + _leave(" = %zd", ret); + return ret; + +presubmission_error: + if (term_func) + term_func(term_func_priv, ret < 0 ? ret : skipped); + return ret; +} + +/* + * Query the occupancy of the cache in a region, returning where the next chunk + * of data starts and how long it is. + */ +static int cachefiles_query_occupancy(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len) +{ + struct cachefiles_object *object; + struct file *file; + loff_t off, off2; + + *_data_start = -1; + *_data_len = 0; + + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + return -ENOBUFS; + + object = cachefiles_cres_object(cres); + file = cachefiles_cres_file(cres); + granularity = max_t(size_t, object->volume->cache->bsize, granularity); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start, len, + i_size_read(file_inode(file))); + + off = cachefiles_inject_read_error(); + if (off == 0) + off = vfs_llseek(file, start, SEEK_DATA); + if (off == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off < 0 && off >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + if (round_up(off, granularity) >= start + len) + return -ENODATA; /* No data in range */ + + off2 = cachefiles_inject_read_error(); + if (off2 == 0) + off2 = vfs_llseek(file, off, SEEK_HOLE); + if (off2 == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + + /* Round away partial blocks */ + off = round_up(off, granularity); + off2 = round_down(off2, granularity); + if (off2 <= off) + return -ENODATA; + + *_data_start = off; + if (off2 > start + len) + *_data_len = len; + else + *_data_len = off2 - off; + return 0; +} + +/* + * Handle completion of a write to the cache. 
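
	[Editor's note — illustrative aside, not part of the patch.  The
	completion handler below drops the ki->b_writing blocks that
	__cachefiles_write() charged against cache->b_writing for the
	culling heuristics.  The charge is the write length rounded up to
	whole cache blocks via bshift, mirroring the expression
	"(len + (1 << cache->bshift) - 1) >> cache->bshift":

		#include <stddef.h>

		/* e.g. bshift = 10 (1 KiB blocks): blocks_for(3000) == 3 */
		static inline unsigned long blocks_for(size_t len,
						       unsigned int bshift)
		{
			return (len + (1UL << bshift) - 1) >> bshift;
		}
	]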
+ */ +static void cachefiles_write_complete(struct kiocb *iocb, long ret) +{ + struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct cachefiles_object *object = ki->object; + struct inode *inode = file_inode(ki->iocb.ki_filp); + + _enter("%ld", ret); + + if (ki->was_async) + kiocb_end_write(iocb); + + if (ret < 0) + trace_cachefiles_io_error(object, inode, ret, + cachefiles_trace_write_error); + + atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing); + set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags); + if (ki->term_func) + ki->term_func(ki->term_func_priv, ret); + cachefiles_put_kiocb(ki); +} + +/* + * Initiate a write to the cache. + */ +int __cachefiles_write(struct cachefiles_object *object, + struct file *file, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + struct cachefiles_cache *cache; + struct cachefiles_kiocb *ki; + unsigned int old_nofs; + ssize_t ret; + size_t len = iov_iter_count(iter); + + fscache_count_write(); + cache = object->volume->cache; + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start_pos, len, + i_size_read(file_inode(file))); + + ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); + if (!ki) { + if (term_func) + term_func(term_func_priv, -ENOMEM); + return -ENOMEM; + } + + refcount_set(&ki->ki_refcnt, 2); + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos; + ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->object = object; + ki->start = start_pos; + ki->len = len; + ki->term_func = term_func; + ki->term_func_priv = term_func_priv; + ki->was_async = true; + ki->b_writing = (len + (1 << cache->bshift) - 1) >> cache->bshift; + + if (ki->term_func) + ki->iocb.ki_complete = cachefiles_write_complete; + atomic_long_add(ki->b_writing, &cache->b_writing); + + get_file(ki->iocb.ki_filp); + cachefiles_grab_object(object, cachefiles_obj_get_ioreq); + + trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len); + old_nofs = memalloc_nofs_save(); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_iocb_iter_write(file, &ki->iocb, iter); + memalloc_nofs_restore(old_nofs); + switch (ret) { + case -EIOCBQUEUED: + goto in_progress; + + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. 
+ */ + ret = -EINTR; + fallthrough; + default: + ki->was_async = false; + cachefiles_write_complete(&ki->iocb, ret); + break; + } + +in_progress: + cachefiles_put_kiocb(ki); + _leave(" = %zd", ret); + return ret; +} + +static int cachefiles_write(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) { + if (term_func) + term_func(term_func_priv, -ENOBUFS); + trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite); + return -ENOBUFS; + } + + return __cachefiles_write(cachefiles_cres_object(cres), + cachefiles_cres_file(cres), + start_pos, iter, + term_func, term_func_priv); +} + +static inline enum netfs_io_source +cachefiles_do_prepare_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t netfs_ino) +{ + enum cachefiles_prepare_read_trace why; + struct cachefiles_object *object = NULL; + struct cachefiles_cache *cache; + struct fscache_cookie *cookie = fscache_cres_cookie(cres); + const struct cred *saved_cred; + struct file *file = cachefiles_cres_file(cres); + enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; + size_t len = *_len; + loff_t off, to; + ino_t ino = file ? file_inode(file)->i_ino : 0; + int rc; + + _enter("%zx @%llx/%llx", len, start, i_size); + + if (start >= i_size) { + ret = NETFS_FILL_WITH_ZEROES; + why = cachefiles_trace_read_after_eof; + goto out_no_object; + } + + if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); + why = cachefiles_trace_read_no_data; + if (!test_bit(NETFS_SREQ_ONDEMAND, _flags)) + goto out_no_object; + } + + /* The object and the file may be being created in the background. 
*/ + if (!file) { + why = cachefiles_trace_read_no_file; + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + goto out_no_object; + file = cachefiles_cres_file(cres); + if (!file) + goto out_no_object; + ino = file_inode(file)->i_ino; + } + + object = cachefiles_cres_object(cres); + cache = object->volume->cache; + cachefiles_begin_secure(cache, &saved_cred); +retry: + off = cachefiles_inject_read_error(); + if (off == 0) + off = vfs_llseek(file, start, SEEK_DATA); + if (off < 0 && off >= (loff_t)-MAX_ERRNO) { + if (off == (loff_t)-ENXIO) { + why = cachefiles_trace_read_seek_nxio; + goto download_and_store; + } + trace_cachefiles_io_error(object, file_inode(file), off, + cachefiles_trace_seek_error); + why = cachefiles_trace_read_seek_error; + goto out; + } + + if (off >= start + len) { + why = cachefiles_trace_read_found_hole; + goto download_and_store; + } + + if (off > start) { + off = round_up(off, cache->bsize); + len = off - start; + *_len = len; + why = cachefiles_trace_read_found_part; + goto download_and_store; + } + + to = cachefiles_inject_read_error(); + if (to == 0) + to = vfs_llseek(file, start, SEEK_HOLE); + if (to < 0 && to >= (loff_t)-MAX_ERRNO) { + trace_cachefiles_io_error(object, file_inode(file), to, + cachefiles_trace_seek_error); + why = cachefiles_trace_read_seek_error; + goto out; + } + + if (to < start + len) { + if (start + len >= i_size) + to = round_up(to, cache->bsize); + else + to = round_down(to, cache->bsize); + len = to - start; + *_len = len; + } + + why = cachefiles_trace_read_have_data; + ret = NETFS_READ_FROM_CACHE; + goto out; + +download_and_store: + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); + if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) { + rc = cachefiles_ondemand_read(object, start, len); + if (!rc) { + __clear_bit(NETFS_SREQ_ONDEMAND, _flags); + goto retry; + } + ret = NETFS_INVALID_READ; + } +out: + cachefiles_end_secure(cache, saved_cred); +out_no_object: + trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino); + return ret; +} + +/* + * Prepare a read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, + unsigned long long i_size) +{ + return cachefiles_do_prepare_read(&subreq->rreq->cache_resources, + subreq->start, &subreq->len, i_size, + &subreq->flags, subreq->rreq->inode->i_ino); +} + +/* + * Prepare an on-demand read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source +cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t ino) +{ + return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino); +} + +/* + * Prepare for a write to occur. + */ +int __cachefiles_prepare_write(struct cachefiles_object *object, + struct file *file, + loff_t *_start, size_t *_len, size_t upper_len, + bool no_space_allocated_yet) +{ + struct cachefiles_cache *cache = object->volume->cache; + loff_t start = *_start, pos; + size_t len = *_len; + int ret; + + /* Round to DIO size */ + start = round_down(*_start, PAGE_SIZE); + if (start != *_start || *_len > upper_len) { + /* Probably asked to cache a streaming write written into the + * pagecache when the cookie was temporarily out of service to + * culling. 
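
	[Editor's note — worked example, not part of the patch.  The
	surrounding check rejects DIO-unfriendly requests: *_start = 5000
	rounds down to 4096 != 5000, so the write is refused with -ENOBUFS
	and counted as a DIO misfit.  A conforming request such as
	*_start = 4096, len = 5000 (with upper_len >= len) passes and has
	its length padded to whole pages: *_len = round_up(5000, PAGE_SIZE)
	= 8192, assuming 4 KiB pages.]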
+ */ + fscache_count_dio_misfit(); + return -ENOBUFS; + } + + *_len = round_up(len, PAGE_SIZE); + + /* We need to work out whether there's sufficient disk space to perform + * the write - but we can skip that check if we have space already + * allocated. + */ + if (no_space_allocated_yet) + goto check_space; + + pos = cachefiles_inject_read_error(); + if (pos == 0) + pos = vfs_llseek(file, start, SEEK_DATA); + if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { + if (pos == -ENXIO) + goto check_space; /* Unallocated tail */ + trace_cachefiles_io_error(object, file_inode(file), pos, + cachefiles_trace_seek_error); + return pos; + } + if ((u64)pos >= (u64)start + *_len) + goto check_space; /* Unallocated region */ + + /* We have a block that's at least partially filled - if we're low on + * space, we need to see if it's fully allocated. If it's not, we may + * want to cull it. + */ + if (cachefiles_has_space(cache, 0, *_len / PAGE_SIZE, + cachefiles_has_space_check) == 0) + return 0; /* Enough space to simply overwrite the whole block */ + + pos = cachefiles_inject_read_error(); + if (pos == 0) + pos = vfs_llseek(file, start, SEEK_HOLE); + if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { + trace_cachefiles_io_error(object, file_inode(file), pos, + cachefiles_trace_seek_error); + return pos; + } + if ((u64)pos >= (u64)start + *_len) + return 0; /* Fully allocated */ + + /* Partially allocated, but insufficient space: cull. */ + fscache_count_no_write_space(); + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + start, *_len); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_fallocate_error); + cachefiles_io_error_obj(object, + "CacheFiles: fallocate failed (%d)\n", ret); + ret = -EIO; + } + + return ret; + +check_space: + return cachefiles_has_space(cache, 0, *_len / PAGE_SIZE, + cachefiles_has_space_for_write); +} + +static int cachefiles_prepare_write(struct netfs_cache_resources *cres, + loff_t *_start, size_t *_len, size_t upper_len, + loff_t i_size, bool no_space_allocated_yet) +{ + struct cachefiles_object *object = cachefiles_cres_object(cres); + struct cachefiles_cache *cache = object->volume->cache; + const struct cred *saved_cred; + int ret; + + if (!cachefiles_cres_file(cres)) { + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) + return -ENOBUFS; + if (!cachefiles_cres_file(cres)) + return -ENOBUFS; + } + + cachefiles_begin_secure(cache, &saved_cred); + ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), + _start, _len, upper_len, + no_space_allocated_yet); + cachefiles_end_secure(cache, saved_cred); + return ret; +} + +static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *wreq = subreq->rreq; + struct netfs_cache_resources *cres = &wreq->cache_resources; + struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; + + _enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start); + + stream->sreq_max_len = MAX_RW_COUNT; + stream->sreq_max_segs = BIO_MAX_VECS; + + if (!cachefiles_cres_file(cres)) { + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) + return netfs_prepare_write_failed(subreq); + if (!cachefiles_cres_file(cres)) + return netfs_prepare_write_failed(subreq); + } +} + +static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *wreq = subreq->rreq; + struct netfs_cache_resources *cres = 
&wreq->cache_resources; + struct cachefiles_object *object = cachefiles_cres_object(cres); + struct cachefiles_cache *cache = object->volume->cache; + struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; + const struct cred *saved_cred; + size_t off, pre, post, len = subreq->len; + loff_t start = subreq->start; + int ret; + + _enter("W=%x[%x] %llx-%llx", + wreq->debug_id, subreq->debug_index, start, start + len - 1); + + /* We need to start on the cache granularity boundary */ + off = start & (CACHEFILES_DIO_BLOCK_SIZE - 1); + if (off) { + pre = CACHEFILES_DIO_BLOCK_SIZE - off; + if (pre >= len) { + fscache_count_dio_misfit(); + netfs_write_subrequest_terminated(subreq, len); + return; + } + subreq->transferred += pre; + start += pre; + len -= pre; + iov_iter_advance(&subreq->io_iter, pre); + } + + /* We also need to end on the cache granularity boundary */ + if (start + len == wreq->i_size) { + size_t part = len % CACHEFILES_DIO_BLOCK_SIZE; + size_t need = CACHEFILES_DIO_BLOCK_SIZE - part; + + if (part && stream->submit_extendable_to >= need) { + len += need; + subreq->len += need; + subreq->io_iter.count += need; + } + } + + post = len & (CACHEFILES_DIO_BLOCK_SIZE - 1); + if (post) { + len -= post; + if (len == 0) { + fscache_count_dio_misfit(); + netfs_write_subrequest_terminated(subreq, post); + return; + } + iov_iter_truncate(&subreq->io_iter, len); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_cache_prepare); + cachefiles_begin_secure(cache, &saved_cred); + ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), + &start, &len, len, true); + cachefiles_end_secure(cache, saved_cred); + if (ret < 0) { + netfs_write_subrequest_terminated(subreq, ret); + return; + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_cache_write); + cachefiles_write(&subreq->rreq->cache_resources, + subreq->start, &subreq->io_iter, + netfs_write_subrequest_terminated, subreq); +} + +/* + * Clean up an operation. + */ +static void cachefiles_end_operation(struct netfs_cache_resources *cres) +{ + struct file *file = cachefiles_cres_file(cres); + + if (file) + fput(file); + fscache_end_cookie_access(fscache_cres_cookie(cres), fscache_access_io_end); +} + +static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { + .end_operation = cachefiles_end_operation, + .read = cachefiles_read, + .write = cachefiles_write, + .issue_write = cachefiles_issue_write, + .prepare_read = cachefiles_prepare_read, + .prepare_write = cachefiles_prepare_write, + .prepare_write_subreq = cachefiles_prepare_write_subreq, + .prepare_ondemand_read = cachefiles_prepare_ondemand_read, + .query_occupancy = cachefiles_query_occupancy, +}; + +/* + * Open the cache file when beginning a cache operation. 
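
	[Editor's note — illustrative aside, not part of the patch.  The
	want_state argument decides whether a missing backing file is
	fatal: FSCACHE_WANT_PARAMS only needs cres->ops installed, so it
	may succeed before the file has been created, whereas read/write
	states must end up with a pinned file ref in cres->cache_priv2 or
	the call fails:

		cachefiles_begin_operation(cres, FSCACHE_WANT_PARAMS);
			/* may return true with no file yet */
		cachefiles_begin_operation(cres, FSCACHE_WANT_WRITE);
			/* returns false if no file could be pinned */
	]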
+ */ +bool cachefiles_begin_operation(struct netfs_cache_resources *cres, + enum fscache_want_state want_state) +{ + struct cachefiles_object *object = cachefiles_cres_object(cres); + + if (!cachefiles_cres_file(cres)) { + cres->ops = &cachefiles_netfs_cache_ops; + if (object->file) { + spin_lock(&object->lock); + if (!cres->cache_priv2 && object->file) + cres->cache_priv2 = get_file(object->file); + spin_unlock(&object->lock); + } + } + + if (!cachefiles_cres_file(cres) && want_state != FSCACHE_WANT_PARAMS) { + pr_err("failed to get cres->file\n"); + return false; + } + + return true; +} diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index 33b58c60f2d1..aae86af48ed5 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -1,18 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Key to pathname encoder * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include <linux/slab.h> #include "internal.h" -static const char cachefiles_charmap[64] = +static const char cachefiles_charmap[64] __nonstring = "0123456789" /* 0 - 9 */ "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ @@ -26,134 +22,116 @@ static const char cachefiles_filecharmap[256] = { [48 ... 127] = 1, /* '0' -> '~' */ }; +static inline unsigned int how_many_hex_digits(unsigned int x) +{ + return x ? round_up(ilog2(x) + 1, 4) / 4 : 0; +} + /* * turn the raw key into something cooked - * - the raw key should include the length in the two bytes at the front - * - the key may be up to 514 bytes in length (including the length word) + * - the key may be up to NAME_MAX in length (including the length word) * - "base64" encode the strange keys, mapping 3 bytes of raw to four of * cooked * - need to cut the cooked key into 252 char lengths (189 raw bytes) */ -char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +bool cachefiles_cook_key(struct cachefiles_object *object) { - unsigned char csum, ch; - unsigned int acc; - char *key; - int loop, len, max, seg, mark, print; + const u8 *key = fscache_get_key(object->cookie), *kend; + unsigned char ch; + unsigned int acc, i, n, nle, nbe, keylen = object->cookie->key_len; + unsigned int b64len, len, print, pad; + char *name, sep; - _enter(",%d", keylen); + _enter(",%u,%*phN", keylen, keylen, key); - BUG_ON(keylen < 2 || keylen > 514); + BUG_ON(keylen > NAME_MAX - 3); - csum = raw[0] + raw[1]; print = 1; - for (loop = 2; loop < keylen; loop++) { - ch = raw[loop]; - csum += ch; + for (i = 0; i < keylen; i++) { + ch = key[i]; print &= cachefiles_filecharmap[ch]; } + /* If the path is usable ASCII, then we render it directly */ if (print) { - /* if the path is usable ASCII, then we render it directly */ - max = keylen - 2; - max += 2; /* two base64'd length chars on the front */ - max += 5; /* @checksum/M */ - max += 3 * 2; /* maximum number of segment dividers (".../M") - * is ((514 + 251) / 252) = 3 - */ - max += 1; /* NUL on end */ - } else { - /* calculate the maximum length of the cooked key */ - keylen = (keylen + 2) / 3; - - max = keylen * 4; - max += 5; /* @checksum/M */ - max += 3 * 2; /* maximum number of segment dividers (".../M") - * is ((514 + 188) / 
189) = 3 - */ - max += 1; /* NUL on end */ + len = 1 + keylen; + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return false; + + name[0] = 'D'; /* Data object type, string encoding */ + memcpy(name + 1, key, keylen); + goto success; } - max += 1; /* 2nd NUL on end */ - - _debug("max: %d", max); - - key = kmalloc(max, cachefiles_gfp); - if (!key) - return NULL; - - len = 0; - - /* build the cooked key */ - sprintf(key, "@%02x%c+", (unsigned) csum, 0); - len = 5; - mark = len - 1; - - if (print) { - acc = *(uint16_t *) raw; - raw += 2; - - key[len + 1] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len] = cachefiles_charmap[acc & 63]; - len += 2; - - seg = 250; - for (loop = keylen; loop > 0; loop--) { - if (seg <= 0) { - key[len++] = '\0'; - mark = len; - key[len++] = '+'; - seg = 252; - } - - key[len++] = *raw++; - ASSERT(len < max); - } - - switch (type) { - case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break; - case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; - default: type = 'S'; break; - } - } else { - seg = 252; - for (loop = keylen; loop > 0; loop--) { - if (seg <= 0) { - key[len++] = '\0'; - mark = len; - key[len++] = '+'; - seg = 252; - } - - acc = *raw++; - acc |= *raw++ << 8; - acc |= *raw++ << 16; - - _debug("acc: %06x", acc); - - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - - ASSERT(len < max); - } + /* See if it makes sense to encode it as "hex,hex,hex" for each 32-bit + * chunk. We rely on the key having been padded out to a whole number + * of 32-bit words. + */ + n = round_up(keylen, 4); + nbe = nle = 0; + for (i = 0; i < n; i += 4) { + u32 be = be32_to_cpu(*(__be32 *)(key + i)); + u32 le = le32_to_cpu(*(__le32 *)(key + i)); + + nbe += 1 + how_many_hex_digits(be); + nle += 1 + how_many_hex_digits(le); + } - switch (type) { - case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break; - case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break; - default: type = 'T'; break; + b64len = DIV_ROUND_UP(keylen, 3); + pad = b64len * 3 - keylen; + b64len = 2 + b64len * 4; /* Length if we base64-encode it */ + _debug("len=%u nbe=%u nle=%u b64=%u", keylen, nbe, nle, b64len); + if (nbe < b64len || nle < b64len) { + unsigned int nlen = min(nbe, nle) + 1; + name = kmalloc(nlen, GFP_KERNEL); + if (!name) + return false; + sep = (nbe <= nle) ? 
'S' : 'T'; /* Encoding indicator */ + len = 0; + for (i = 0; i < n; i += 4) { + u32 x; + if (nbe <= nle) + x = be32_to_cpu(*(__be32 *)(key + i)); + else + x = le32_to_cpu(*(__le32 *)(key + i)); + name[len++] = sep; + if (x != 0) + len += snprintf(name + len, nlen - len, "%x", x); + sep = ','; } + goto success; } - key[mark] = type; - key[len++] = 0; - key[len] = 0; + /* We need to base64-encode it */ + name = kmalloc(b64len + 1, GFP_KERNEL); + if (!name) + return false; + + name[0] = 'E'; + name[1] = '0' + pad; + len = 2; + kend = key + keylen; + do { + acc = *key++; + if (key < kend) { + acc |= *key++ << 8; + if (key < kend) + acc |= *key++ << 16; + } - _leave(" = %p %d", key, len); - return key; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + } while (key < kend); + +success: + name[len] = 0; + object->d_name = name; + _leave(" = %s", object->d_name); + return true; } diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 4bfa8cf43bf5..3f369c6f816d 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Network filesystem caching backend to use cache files on a premounted * filesystem * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
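
As an aside on the key encoder just above: the 'E' arm packs three raw key bytes into four characters of cachefiles_charmap, least-significant six bits first, after recording the pad count. A standalone user-space sketch of the same packing; note the map's final two characters, '_' and '-', are cut off by the hunk context above, so they are an assumption here:

    #include <stdio.h>

    static const char charmap[65] =
            "0123456789"
            "abcdefghijklmnopqrstuvwxyz"
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "_-";        /* assumed tail of cachefiles_charmap */

    /* 'E' + pad digit, then 3 raw bytes -> 4 cooked chars, low bits first */
    static void cook(const unsigned char *key, unsigned int len, char *out)
    {
            const unsigned char *kend = key + len;
            unsigned int n = 0;

            out[n++] = 'E';
            out[n++] = '0' + (3 - len % 3) % 3;   /* pad = b64len * 3 - keylen */
            do {
                    unsigned int acc = *key++;
                    if (key < kend) {
                            acc |= *key++ << 8;
                            if (key < kend)
                                    acc |= *key++ << 16;
                    }
                    out[n++] = charmap[acc & 63]; acc >>= 6;
                    out[n++] = charmap[acc & 63]; acc >>= 6;
                    out[n++] = charmap[acc & 63]; acc >>= 6;
                    out[n++] = charmap[acc & 63];
            } while (key < kend);
            out[n] = '\0';
    }

    int main(void)
    {
            char name[64];

            cook((const unsigned char *)"\x01\x02\x03\x04", 4, name);
            puts(name);        /* pad digit is '2' for a 4-byte key */
            return 0;
    }
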
*/ #include <linux/module.h> @@ -22,6 +18,9 @@ #include <linux/statfs.h> #include <linux/sysctl.h> #include <linux/miscdevice.h> +#include <linux/netfs.h> +#include <trace/events/netfs.h> +#define CREATE_TRACE_POINTS #include "internal.h" unsigned cachefiles_debug; @@ -40,14 +39,6 @@ static struct miscdevice cachefiles_dev = { .fops = &cachefiles_daemon_fops, }; -static void cachefiles_object_init_once(void *_object) -{ - struct cachefiles_object *object = _object; - - memset(object, 0, sizeof(*object)); - spin_lock_init(&object->work_lock); -} - /* * initialise the fs caching module */ @@ -55,6 +46,9 @@ static int __init cachefiles_init(void) { int ret; + ret = cachefiles_register_error_injection(); + if (ret < 0) + goto error_einj; ret = misc_register(&cachefiles_dev); if (ret < 0) goto error_dev; @@ -64,28 +58,21 @@ static int __init cachefiles_init(void) cachefiles_object_jar = kmem_cache_create("cachefiles_object_jar", sizeof(struct cachefiles_object), - 0, - SLAB_HWCACHE_ALIGN, - cachefiles_object_init_once); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!cachefiles_object_jar) { - printk(KERN_NOTICE - "CacheFiles: Failed to allocate an object jar\n"); + pr_notice("Failed to allocate an object jar\n"); goto error_object_jar; } - ret = cachefiles_proc_init(); - if (ret < 0) - goto error_proc; - - printk(KERN_INFO "CacheFiles: Loaded\n"); + pr_info("Loaded\n"); return 0; -error_proc: - kmem_cache_destroy(cachefiles_object_jar); error_object_jar: misc_deregister(&cachefiles_dev); error_dev: - kerror("failed to register: %d", ret); + cachefiles_unregister_error_injection(); +error_einj: + pr_err("failed to register: %d\n", ret); return ret; } @@ -96,11 +83,11 @@ fs_initcall(cachefiles_init); */ static void __exit cachefiles_exit(void) { - printk(KERN_INFO "CacheFiles: Unloading\n"); + pr_info("Unloading\n"); - cachefiles_proc_cleanup(); kmem_cache_destroy(cachefiles_object_jar); misc_deregister(&cachefiles_dev); + cachefiles_unregister_error_injection(); } module_exit(cachefiles_exit); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 25badd1aec5c..e5ec90dccc27 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -1,309 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles path walking and related routines * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/file.h> #include <linux/fs.h> -#include <linux/fsnotify.h> -#include <linux/quotaops.h> -#include <linux/xattr.h> -#include <linux/mount.h> #include <linux/namei.h> -#include <linux/security.h> -#include <linux/slab.h> #include "internal.h" -#define CACHEFILES_KEYBUF_SIZE 512 - /* - * dump debugging info about an object + * Mark the backing file as being a cache file if it's not already in use. The + * mark tells the culling request command that it's not allowed to cull the + * file or directory. The caller must hold the inode lock. 
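
For context, the daemon side of this contract is simple: while the mark is held, a cull attempt on that name fails with EBUSY. A user-space sketch, assuming the control device at /dev/cachefiles and the "cull <name>" command acting on a name in the caller's current directory, the way cachefilesd uses it:

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Returns 0 on cull, -1 with errno == EBUSY if the kernel holds
     * the in-use mark on the victim's inode. */
    static int try_cull(int devfd, const char *dir, const char *name)
    {
            char cmd[NAME_MAX + 8];

            if (chdir(dir) < 0)
                    return -1;
            snprintf(cmd, sizeof(cmd), "cull %s", name);
            return write(devfd, cmd, strlen(cmd)) < 0 ? -1 : 0;
    }
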
*/ -static noinline -void __cachefiles_printk_object(struct cachefiles_object *object, - const char *prefix, - u8 *keybuf) +static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, + struct inode *inode) { - struct fscache_cookie *cookie; - unsigned keylen, loop; - - printk(KERN_ERR "%sobject: OBJ%x\n", - prefix, object->fscache.debug_id); - printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", - prefix, object->fscache.state->name, - object->fscache.flags, work_busy(&object->fscache.work), - object->fscache.events, object->fscache.event_mask); - printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", - prefix, object->fscache.n_ops, object->fscache.n_in_progress, - object->fscache.n_exclusive); - printk(KERN_ERR "%sparent=%p\n", - prefix, object->fscache.parent); - - spin_lock(&object->fscache.lock); - cookie = object->fscache.cookie; - if (cookie) { - printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n", - prefix, - object->fscache.cookie, - object->fscache.cookie->parent, - object->fscache.cookie->netfs_data, - object->fscache.cookie->flags); - if (keybuf) - keylen = cookie->def->get_key(cookie->netfs_data, keybuf, - CACHEFILES_KEYBUF_SIZE); - else - keylen = 0; + bool can_use = false; + + if (!(inode->i_flags & S_KERNEL_FILE)) { + inode->i_flags |= S_KERNEL_FILE; + trace_cachefiles_mark_active(object, inode); + can_use = true; } else { - printk(KERN_ERR "%scookie=NULL\n", prefix); - keylen = 0; + trace_cachefiles_mark_failed(object, inode); } - spin_unlock(&object->fscache.lock); - if (keylen) { - printk(KERN_ERR "%skey=[%u] '", prefix, keylen); - for (loop = 0; loop < keylen; loop++) - printk("%02x", keybuf[loop]); - printk("'\n"); - } + return can_use; +} + +static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, + struct inode *inode) +{ + bool can_use; + + inode_lock(inode); + can_use = __cachefiles_mark_inode_in_use(object, inode); + inode_unlock(inode); + return can_use; } /* - * dump debugging info about a pair of objects + * Unmark a backing inode. The caller must hold the inode lock. */ -static noinline void cachefiles_printk_object(struct cachefiles_object *object, - struct cachefiles_object *xobject) +static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct inode *inode) { - u8 *keybuf; - - keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); - if (object) - __cachefiles_printk_object(object, "", keybuf); - if (xobject) - __cachefiles_printk_object(xobject, "x", keybuf); - kfree(keybuf); + inode->i_flags &= ~S_KERNEL_FILE; + trace_cachefiles_mark_inactive(object, inode); +} + +static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, + struct inode *inode) +{ + inode_lock(inode); + __cachefiles_unmark_inode_in_use(object, inode); + inode_unlock(inode); } /* - * mark the owner of a dentry, if there is one, to indicate that that dentry - * has been preemptively deleted - * - the caller must hold the i_mutex on the dentry's parent as required to - * call vfs_unlink(), vfs_rmdir() or vfs_rename() + * Unmark a backing inode and tell cachefilesd that there's something that can + * be culled. 
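
cachefiles_state_changed() is what wakes the daemon's poll on the control fd; a read of the fd then yields the current space counters. A minimal sketch of that side, assuming the status line returned by a read begins with a cull=0/1 field (the exact format lives in daemon.c, outside this hunk):

    #include <poll.h>
    #include <string.h>
    #include <unistd.h>

    /* Block until the cache reports a state change, then say whether
     * it is asking for culling to begin. */
    static int cull_requested(int devfd)
    {
            struct pollfd p = { .fd = devfd, .events = POLLIN };
            char buf[256];
            ssize_t n;

            if (poll(&p, 1, -1) < 0)
                    return -1;
            n = read(devfd, buf, sizeof(buf) - 1);
            if (n < 0)
                    return -1;
            buf[n] = '\0';
            return strncmp(buf, "cull=1", 6) == 0;  /* assumed format */
    }
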
*/ -static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, - struct dentry *dentry) +void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct file *file) { - struct cachefiles_object *object; - struct rb_node *p; + struct cachefiles_cache *cache = object->volume->cache; + struct inode *inode = file_inode(file); - _enter(",'%*.*s'", - dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); + cachefiles_do_unmark_inode_in_use(object, inode); - write_lock(&cache->active_lock); - - p = cache->active_nodes.rb_node; - while (p) { - object = rb_entry(p, struct cachefiles_object, active_node); - if (object->dentry > dentry) - p = p->rb_left; - else if (object->dentry < dentry) - p = p->rb_right; - else - goto found_dentry; + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + atomic_long_add(inode->i_blocks, &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); } +} - write_unlock(&cache->active_lock); - _leave(" [no owner]"); - return; +/* + * get a subdirectory + */ +struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *dirname, + bool *_is_new) +{ + struct dentry *subdir; + struct path path; + int ret; + + _enter(",,%s", dirname); - /* found the dentry for */ -found_dentry: - kdebug("preemptive burial: OBJ%x [%s] %p", - object->fscache.debug_id, - object->fscache.state->name, - dentry); + /* search the current directory for the element name */ - if (fscache_object_is_live(&object->fscache)) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error:" - " Can't preemptively bury live object\n"); - cachefiles_printk_object(object, NULL); - } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - printk(KERN_ERR "CacheFiles: Error:" - " Object already preemptively buried\n"); +retry: + ret = cachefiles_inject_read_error(); + if (ret == 0) + subdir = start_creating(&nop_mnt_idmap, dir, &QSTR(dirname)); + else + subdir = ERR_PTR(ret); + trace_cachefiles_lookup(NULL, dir, subdir); + if (IS_ERR(subdir)) { + trace_cachefiles_vfs_error(NULL, d_backing_inode(dir), + PTR_ERR(subdir), + cachefiles_trace_lookup_error); + if (PTR_ERR(subdir) == -ENOMEM) + goto nomem_d_alloc; + goto lookup_error; } - write_unlock(&cache->active_lock); - _leave(" [owner marked]"); -} + _debug("subdir -> %pd %s", + subdir, d_backing_inode(subdir) ? 
"positive" : "negative"); -/* - * record the fact that an object is now active - */ -static int cachefiles_mark_object_active(struct cachefiles_cache *cache, - struct cachefiles_object *object) -{ - struct cachefiles_object *xobject; - struct rb_node **_p, *_parent = NULL; - struct dentry *dentry; + /* we need to create the subdir if it doesn't exist yet */ + if (d_is_negative(subdir)) { + ret = cachefiles_has_space(cache, 1, 0, + cachefiles_has_space_for_create); + if (ret < 0) + goto mkdir_error; - _enter(",%p", object); + _debug("attempt mkdir"); -try_again: - write_lock(&cache->active_lock); + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_mkdir(&path, subdir, 0700); + if (ret < 0) + goto mkdir_error; + ret = cachefiles_inject_write_error(); + if (ret == 0) { + subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL); + } else { + end_creating(subdir); + subdir = ERR_PTR(ret); + } + if (IS_ERR(subdir)) { + trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, + cachefiles_trace_mkdir_error); + goto mkdir_error; + } + trace_cachefiles_mkdir(dir, subdir); + + if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { + end_creating(subdir); + goto retry; + } + ASSERT(d_backing_inode(subdir)); + + _debug("mkdir -> %pd{ino=%lu}", + subdir, d_backing_inode(subdir)->i_ino); + if (_is_new) + *_is_new = true; + } + + /* Tell rmdir() it's not allowed to delete the subdir */ + inode_lock(d_inode(subdir)); + end_creating_keep(subdir); - if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { - printk(KERN_ERR "CacheFiles: Error: Object already active\n"); - cachefiles_printk_object(object, NULL); - BUG(); + if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + subdir, d_inode(subdir)->i_ino); + goto mark_error; } - dentry = object->dentry; - _p = &cache->active_nodes.rb_node; - while (*_p) { - _parent = *_p; - xobject = rb_entry(_parent, - struct cachefiles_object, active_node); + inode_unlock(d_inode(subdir)); - ASSERT(xobject != object); + /* we need to make sure the subdir is a directory */ + ASSERT(d_backing_inode(subdir)); - if (xobject->dentry > dentry) - _p = &(*_p)->rb_left; - else if (xobject->dentry < dentry) - _p = &(*_p)->rb_right; - else - goto wait_for_old_object; + if (!d_can_lookup(subdir)) { + pr_err("%s is not a directory\n", dirname); + ret = -EIO; + goto check_error; } - rb_link_node(&object->active_node, _parent, _p); - rb_insert_color(&object->active_node, &cache->active_nodes); + ret = -EPERM; + if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) || + !d_backing_inode(subdir)->i_op->lookup || + !d_backing_inode(subdir)->i_op->mkdir || + !d_backing_inode(subdir)->i_op->rename || + !d_backing_inode(subdir)->i_op->rmdir || + !d_backing_inode(subdir)->i_op->unlink) + goto check_error; - write_unlock(&cache->active_lock); - _leave(" = 0"); - return 0; + _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); + return subdir; - /* an old object from a previous incarnation is hogging the slot - we - * need to wait for it to be destroyed */ -wait_for_old_object: - if (fscache_object_is_live(&object->fscache)) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error:" - " Unexpected object collision\n"); - cachefiles_printk_object(object, xobject); - BUG(); - } - atomic_inc(&xobject->usage); - write_unlock(&cache->active_lock); - - if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { - wait_queue_head_t *wq; - - signed long timeout = 60 * HZ; - wait_queue_t 
wait; - bool requeue; - - /* if the object we're waiting for is queued for processing, - * then just put ourselves on the queue behind it */ - if (work_pending(&xobject->fscache.work)) { - _debug("queue OBJ%x behind OBJ%x immediately", - object->fscache.debug_id, - xobject->fscache.debug_id); - goto requeue; - } +check_error: + cachefiles_put_directory(subdir); + _leave(" = %d [check]", ret); + return ERR_PTR(ret); - /* otherwise we sleep until either the object we're waiting for - * is done, or the fscache_object is congested */ - wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); - init_wait(&wait); - requeue = false; - do { - prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); - if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) - break; - - requeue = fscache_object_sleep_till_congested(&timeout); - } while (timeout > 0 && !requeue); - finish_wait(wq, &wait); - - if (requeue && - test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { - _debug("queue OBJ%x behind OBJ%x after wait", - object->fscache.debug_id, - xobject->fscache.debug_id); - goto requeue; - } +mark_error: + inode_unlock(d_inode(subdir)); + dput(subdir); + return ERR_PTR(-EBUSY); - if (timeout <= 0) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error: Overlong" - " wait for old active object to go away\n"); - cachefiles_printk_object(object, xobject); - goto requeue; - } +mkdir_error: + end_creating(subdir); + pr_err("mkdir %s failed with error %d\n", dirname, ret); + return ERR_PTR(ret); + +lookup_error: + ret = PTR_ERR(subdir); + pr_err("Lookup %s failed with error %d\n", dirname, ret); + return ERR_PTR(ret); + +nomem_d_alloc: + inode_unlock(d_inode(dir)); + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * Put a subdirectory. + */ +void cachefiles_put_directory(struct dentry *dir) +{ + if (dir) { + cachefiles_do_unmark_inode_in_use(NULL, d_inode(dir)); + dput(dir); } +} - ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); +/* + * Remove a regular file from the cache. + */ +static int cachefiles_unlink(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, struct dentry *dentry, + enum fscache_why_object_killed why) +{ + struct path path = { + .mnt = cache->mnt, + .dentry = dir, + }; + int ret; - cache->cache.ops->put_object(&xobject->fscache); - goto try_again; + trace_cachefiles_unlink(object, d_inode(dentry)->i_ino, why); + ret = security_path_unlink(&path, dentry); + if (ret < 0) { + cachefiles_io_error(cache, "Unlink security error"); + return ret; + } -requeue: - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - cache->cache.ops->put_object(&xobject->fscache); - _leave(" = -ETIMEDOUT"); - return -ETIMEDOUT; + ret = cachefiles_inject_remove_error(); + if (ret == 0) { + ret = vfs_unlink(&nop_mnt_idmap, d_backing_inode(dir), dentry, NULL); + if (ret == -EIO) + cachefiles_io_error(cache, "Unlink failed"); + } + if (ret != 0) + trace_cachefiles_vfs_error(object, d_backing_inode(dir), ret, + cachefiles_trace_unlink_error); + return ret; } /* - * delete an object representation from the cache - * - file backed objects are unlinked - * - directory backed objects are stuffed into the graveyard for userspace to + * Delete an object representation from the cache + * - File backed objects are unlinked + * - Directory backed objects are stuffed into the graveyard for userspace to * delete - * - unlocks the directory mutex + * On entry dir must be locked. It will be unlocked on exit. 
+ * On entry there must be at least 2 refs on rep, one will be dropped on exit. */ -static int cachefiles_bury_object(struct cachefiles_cache *cache, - struct dentry *dir, - struct dentry *rep, - bool preemptive) +int cachefiles_bury_object(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, + struct dentry *rep, + enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; char nbuffer[8 + 8 + 1]; int ret; - _enter(",'%*.*s','%*.*s'", - dir->d_name.len, dir->d_name.len, dir->d_name.name, - rep->d_name.len, rep->d_name.len, rep->d_name.name); + _enter(",'%pd','%pd'", dir, rep); - _debug("remove %p from %p", rep, dir); + if (rep->d_parent != dir) { + end_removing(rep); + _leave(" = -ESTALE"); + return -ESTALE; + } /* non-directories can just be unlinked */ - if (!S_ISDIR(rep->d_inode->i_mode)) { - _debug("unlink stale object"); - - path.mnt = cache->mnt; - path.dentry = dir; - ret = security_path_unlink(&path, rep); - if (ret < 0) { - cachefiles_io_error(cache, "Unlink security error"); - } else { - ret = vfs_unlink(dir->d_inode, rep); - - if (preemptive) - cachefiles_mark_object_buried(cache, rep); - } - - mutex_unlock(&dir->d_inode->i_mutex); - - if (ret == -EIO) - cachefiles_io_error(cache, "Unlink failed"); + if (!d_is_dir(rep)) { + ret = cachefiles_unlink(cache, object, dir, rep, why); + end_removing(rep); _leave(" = %d", ret); return ret; @@ -311,19 +294,21 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&dir->d_inode->i_mutex); + end_removing(rep); try_again: /* first step is to make up a grave dentry in the graveyard */ sprintf(nbuffer, "%08x%08x", - (uint32_t) get_seconds(), + (uint32_t) ktime_get_real_seconds(), (uint32_t) atomic_inc_return(&cache->gravecounter)); /* do the multiway lock magic */ trap = lock_rename(cache->graveyard, dir); + if (IS_ERR(trap)) + return PTR_ERR(trap); /* do some checks before getting the grave dentry */ - if (rep->d_parent != dir) { + if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) { /* the entry was probably culled when we dropped the parent dir * lock */ unlock_rename(cache->graveyard, dir); @@ -331,7 +316,7 @@ try_again: return 0; } - if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { + if (!d_can_lookup(cache->graveyard)) { unlock_rename(cache->graveyard, dir); cachefiles_io_error(cache, "Graveyard no longer a directory"); return -EIO; @@ -349,21 +334,23 @@ try_again: return -EIO; } - grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); + grave = lookup_one(&nop_mnt_idmap, &QSTR(nbuffer), cache->graveyard); if (IS_ERR(grave)) { unlock_rename(cache->graveyard, dir); + trace_cachefiles_vfs_error(object, d_inode(cache->graveyard), + PTR_ERR(grave), + cachefiles_trace_lookup_error); if (PTR_ERR(grave) == -ENOMEM) { _leave(" = -ENOMEM"); return -ENOMEM; } - cachefiles_io_error(cache, "Lookup error %ld", - PTR_ERR(grave)); + cachefiles_io_error(cache, "Lookup error %ld", PTR_ERR(grave)); return -EIO; } - if (grave->d_inode) { + if (d_is_positive(grave)) { unlock_rename(cache->graveyard, dir); dput(grave); grave = NULL; @@ -391,20 +378,30 @@ try_again: path.dentry = dir; path_to_graveyard.mnt = cache->mnt; path_to_graveyard.dentry = cache->graveyard; - ret = security_path_rename(&path, rep, &path_to_graveyard, grave); + ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); if (ret < 0) { 
cachefiles_io_error(cache, "Rename security error %d", ret); } else { - ret = vfs_rename(dir->d_inode, rep, - cache->graveyard->d_inode, grave); + struct renamedata rd = { + .mnt_idmap = &nop_mnt_idmap, + .old_parent = dir, + .old_dentry = rep, + .new_parent = cache->graveyard, + .new_dentry = grave, + }; + trace_cachefiles_rename(object, d_inode(rep)->i_ino, why); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_rename(&rd); + if (ret != 0) + trace_cachefiles_vfs_error(object, d_inode(dir), ret, + cachefiles_trace_rename_error); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); - - if (preemptive) - cachefiles_mark_object_buried(cache, rep); } + __cachefiles_unmark_inode_in_use(object, d_inode(rep)); unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -412,551 +409,425 @@ try_again: } /* - * delete an object representation from the cache + * Delete a cache file. */ -int cachefiles_delete_object(struct cachefiles_cache *cache, - struct cachefiles_object *object) +int cachefiles_delete_object(struct cachefiles_object *object, + enum fscache_why_object_killed why) { - struct dentry *dir; + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry = object->file->f_path.dentry; + struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; int ret; - _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); - - ASSERT(object->dentry); - ASSERT(object->dentry->d_inode); - ASSERT(object->dentry->d_parent); - - dir = dget_parent(object->dentry); - - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - - if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - /* object allocation for the same key preemptively deleted this - * object's file so that it could create its own file */ - _debug("object preemptively buried"); - mutex_unlock(&dir->d_inode->i_mutex); - ret = 0; - } else { - /* we need to check that our parent is _still_ our parent - it - * may have been renamed */ - if (dir == object->dentry->d_parent) { - ret = cachefiles_bury_object(cache, dir, - object->dentry, false); - } else { - /* it got moved, presumably by cachefilesd culling it, - * so it's no longer in the key path and we can ignore - * it */ - mutex_unlock(&dir->d_inode->i_mutex); - ret = 0; - } - } + _enter(",OBJ%x{%pD}", object->debug_id, object->file); - dput(dir); - _leave(" = %d", ret); + dentry = start_removing_dentry(fan, dentry); + if (IS_ERR(dentry)) + ret = PTR_ERR(dentry); + else + ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); + end_removing(dentry); return ret; } /* - * walk from the parent object to the child object through the backing - * filesystem, creating directories as we go + * Create a temporary file and leave it unattached and un-xattr'd until the + * time comes to discard the object from memory. 
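
The user-space analogue of what follows is O_TMPFILE: an anonymous inode in the target directory that cannot collide with, or be culled under, any name until it is linked in. A sketch assuming Linux's O_TMPFILE and the same create-then-size order as the function below:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Anonymous file in 'dir', sized up front like the vfs_truncate()
     * call below; it stays invisible until linked into place. */
    static int make_cache_tmpfile(const char *dir, off_t size)
    {
            int fd = open(dir, O_TMPFILE | O_RDWR, 0600);

            if (fd < 0)
                    return -1;
            if (size > 0 && ftruncate(fd, size) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }
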
*/ -int cachefiles_walk_to_object(struct cachefiles_object *parent, - struct cachefiles_object *object, - const char *key, - struct cachefiles_xattr *auxdata) +struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) { - struct cachefiles_cache *cache; - struct dentry *dir, *next = NULL; - struct path path; - unsigned long start; - const char *name; - int ret, nlen; - - _enter("OBJ%x{%p},OBJ%x,%s,", - parent->fscache.debug_id, parent->dentry, - object->fscache.debug_id, key); - - cache = container_of(parent->fscache.cache, - struct cachefiles_cache, cache); - path.mnt = cache->mnt; - - ASSERT(parent->dentry); - ASSERT(parent->dentry->d_inode); - - if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { - // TODO: convert file to dir - _leave("looking up in none directory"); - return -ENOBUFS; + struct cachefiles_volume *volume = object->volume; + struct cachefiles_cache *cache = volume->cache; + const struct cred *saved_cred; + struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; + struct file *file; + const struct path parentpath = { .mnt = cache->mnt, .dentry = fan }; + uint64_t ni_size; + long ret; + + + cachefiles_begin_secure(cache, &saved_cred); + + ret = cachefiles_inject_write_error(); + if (ret == 0) { + file = kernel_tmpfile_open(&nop_mnt_idmap, &parentpath, + S_IFREG | 0600, + O_RDWR | O_LARGEFILE | O_DIRECT, + cache->cache_cred); + ret = PTR_ERR_OR_ZERO(file); + } + if (ret) { + trace_cachefiles_vfs_error(object, d_inode(fan), ret, + cachefiles_trace_tmpfile_error); + if (ret == -EIO) + cachefiles_io_error_obj(object, "Failed to create tmpfile"); + goto err; } - dir = dget(parent->dentry); - -advance: - /* attempt to transit the first directory component */ - name = key; - nlen = strlen(key); - - /* key ends in a double NUL */ - key = key + nlen + 1; - if (!*key) - key = NULL; - -lookup_again: - /* search the current directory for the element name */ - _debug("lookup '%s'", name); + trace_cachefiles_tmpfile(object, file_inode(file)); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + /* This is a newly created file with no other possible user */ + if (!cachefiles_mark_inode_in_use(object, file_inode(file))) + WARN_ON(1); - start = jiffies; - next = lookup_one_len(name, dir, nlen); - cachefiles_hist(cachefiles_lookup_histogram, start); - if (IS_ERR(next)) - goto lookup_error; + ret = cachefiles_ondemand_init_object(object); + if (ret < 0) + goto err_unuse; - _debug("next -> %p %s", next, next->d_inode ? 
"positive" : "negative"); - - if (!key) - object->new = !next->d_inode; - - /* if this element of the path doesn't exist, then the lookup phase - * failed, and we can release any readers in the certain knowledge that - * there's nothing for them to actually read */ - if (!next->d_inode) - fscache_object_lookup_negative(&object->fscache); - - /* we need to create the object if it's negative */ - if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { - /* index objects and intervening tree levels must be subdirs */ - if (!next->d_inode) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto create_error; - - path.dentry = dir; - ret = security_path_mkdir(&path, next, 0); - if (ret < 0) - goto create_error; - start = jiffies; - ret = vfs_mkdir(dir->d_inode, next, 0); - cachefiles_hist(cachefiles_mkdir_histogram, start); - if (ret < 0) - goto create_error; - - ASSERT(next->d_inode); - - _debug("mkdir -> %p{%p{ino=%lu}}", - next, next->d_inode, next->d_inode->i_ino); - - } else if (!S_ISDIR(next->d_inode->i_mode)) { - kerror("inode %lu is not a directory", - next->d_inode->i_ino); - ret = -ENOBUFS; - goto error; - } + ni_size = object->cookie->object_size; + ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); - } else { - /* non-index objects start out life as files */ - if (!next->d_inode) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto create_error; - - path.dentry = dir; - ret = security_path_mknod(&path, next, S_IFREG, 0); - if (ret < 0) - goto create_error; - start = jiffies; - ret = vfs_create(dir->d_inode, next, S_IFREG, true); - cachefiles_hist(cachefiles_create_histogram, start); - if (ret < 0) - goto create_error; - - ASSERT(next->d_inode); - - _debug("create -> %p{%p{ino=%lu}}", - next, next->d_inode, next->d_inode->i_ino); - - } else if (!S_ISDIR(next->d_inode->i_mode) && - !S_ISREG(next->d_inode->i_mode) - ) { - kerror("inode %lu is not a file or directory", - next->d_inode->i_ino); - ret = -ENOBUFS; - goto error; + if (ni_size > 0) { + trace_cachefiles_trunc(object, file_inode(file), 0, ni_size, + cachefiles_trunc_expand_tmpfile); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_truncate(&file->f_path, ni_size); + if (ret < 0) { + trace_cachefiles_vfs_error( + object, file_inode(file), ret, + cachefiles_trace_trunc_error); + goto err_unuse; } } - /* process the next component */ - if (key) { - _debug("advance"); - mutex_unlock(&dir->d_inode->i_mutex); - dput(dir); - dir = next; - next = NULL; - goto advance; + ret = -EINVAL; + if (unlikely(!file->f_op->read_iter) || + unlikely(!file->f_op->write_iter)) { + fput(file); + pr_notice("Cache does not support read_iter and write_iter\n"); + goto err_unuse; } +out: + cachefiles_end_secure(cache, saved_cred); + return file; - /* we've found the object we were looking for */ - object->dentry = next; - - /* if we've found that the terminal object exists, then we need to - * check its attributes and delete it if it's out of date */ - if (!object->new) { - _debug("validate '%*.*s'", - next->d_name.len, next->d_name.len, next->d_name.name); - - ret = cachefiles_check_object_xattr(object, auxdata); - if (ret == -ESTALE) { - /* delete the object (the deleter drops the directory - * mutex) */ - object->dentry = NULL; - - ret = cachefiles_bury_object(cache, dir, next, true); - dput(next); - next = NULL; +err_unuse: + cachefiles_do_unmark_inode_in_use(object, file_inode(file)); + fput(file); +err: + file = ERR_PTR(ret); + goto out; +} - if (ret < 0) - goto delete_error; +/* + * Create a new 
file. + */ +static bool cachefiles_create_file(struct cachefiles_object *object) +{ + struct file *file; + int ret; - _debug("redo lookup"); - goto lookup_again; - } - } + ret = cachefiles_has_space(object->volume->cache, 1, 0, + cachefiles_has_space_for_create); + if (ret < 0) + return false; - /* note that we're now using this object */ - ret = cachefiles_mark_object_active(cache, object); + file = cachefiles_create_tmpfile(object); + if (IS_ERR(file)) + return false; - mutex_unlock(&dir->d_inode->i_mutex); - dput(dir); - dir = NULL; + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); + set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + _debug("create -> %pD{ino=%lu}", file, file_inode(file)->i_ino); + object->file = file; + return true; +} - if (ret == -ETIMEDOUT) - goto mark_active_timed_out; +/* + * Open an existing file, checking its attributes and replacing it if it is + * stale. + */ +static bool cachefiles_open_file(struct cachefiles_object *object, + struct dentry *dentry) +{ + struct cachefiles_cache *cache = object->volume->cache; + struct file *file; + struct path path; + int ret; - _debug("=== OBTAINED_OBJECT ==="); + _enter("%pd", dentry); - if (object->new) { - /* attach data to a newly constructed terminal object */ - ret = cachefiles_set_object_xattr(object, auxdata); - if (ret < 0) - goto check_error; - } else { - /* always update the atime on an object we've just looked up - * (this is used to keep track of culling, and atimes are only - * updated by read, write and readdir but not lookup or - * open) */ - path.dentry = next; - touch_atime(&path); + if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + dentry, d_inode(dentry)->i_ino); + return false; } - /* open a file interface onto a data file */ - if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (S_ISREG(object->dentry->d_inode->i_mode)) { - const struct address_space_operations *aops; - - ret = -EPERM; - aops = object->dentry->d_inode->i_mapping->a_ops; - if (!aops->bmap) - goto check_error; - - object->backer = object->dentry; - } else { - BUG(); // TODO: open file in data-class subdir - } + /* We need to open a file interface onto a data file now as we can't do + * it on demand because writeback called from do_exit() sees + * current->fs == NULL - which breaks d_path() called from ext4 open. 
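
Note the O_DIRECT in the open below: kernel-side I/O to the backing file bypasses the page cache, which is why object sizes are rounded up to CACHEFILES_DIO_BLOCK_SIZE elsewhere in this series. A user-space sketch of the alignment rules that implies (4096 is an assumed block size):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define DIO_BLOCK 4096

    /* O_DIRECT requires buffer, offset and length to be block-aligned. */
    static ssize_t direct_read(const char *path, off_t off, size_t len)
    {
            void *buf;
            ssize_t n = -1;
            int fd = open(path, O_RDONLY | O_DIRECT);

            if (fd < 0)
                    return -1;
            if (posix_memalign(&buf, DIO_BLOCK, len) == 0) {
                    n = pread(fd, buf, len, off);
                    free(buf);
            }
            close(fd);
            return n;
    }
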
+ */ + path.mnt = cache->mnt; + path.dentry = dentry; + file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred); + if (IS_ERR(file)) { + trace_cachefiles_vfs_error(object, d_backing_inode(dentry), + PTR_ERR(file), + cachefiles_trace_open_error); + goto error; } - object->new = 0; - fscache_obtained_object(&object->fscache); + if (unlikely(!file->f_op->read_iter) || + unlikely(!file->f_op->write_iter)) { + pr_notice("Cache does not support read_iter and write_iter\n"); + goto error_fput; + } + _debug("file -> %pd positive", dentry); - _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); - return 0; + ret = cachefiles_ondemand_init_object(object); + if (ret < 0) + goto error_fput; -create_error: - _debug("create error %d", ret); - if (ret == -EIO) - cachefiles_io_error(cache, "Create/mkdir failed"); - goto error; + ret = cachefiles_check_auxdata(object, file); + if (ret < 0) + goto check_failed; -mark_active_timed_out: - _debug("mark active timed out"); - goto release_dentry; + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); -check_error: - _debug("check error %d", ret); - write_lock(&cache->active_lock); - rb_erase(&object->active_node, &cache->active_nodes); - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); - write_unlock(&cache->active_lock); -release_dentry: - dput(object->dentry); - object->dentry = NULL; - goto error_out; - -delete_error: - _debug("delete error %d", ret); - goto error_out2; + object->file = file; -lookup_error: - _debug("lookup error %ld", PTR_ERR(next)); - ret = PTR_ERR(next); - if (ret == -EIO) - cachefiles_io_error(cache, "Lookup failed"); - next = NULL; + /* Always update the atime on an object we've just looked up (this is + * used to keep track of culling, and atimes are only updated by read, + * write and readdir but not lookup or open). + */ + touch_atime(&file->f_path); + return true; + +check_failed: + fscache_cookie_lookup_negative(object->cookie); + cachefiles_unmark_inode_in_use(object, file); + fput(file); + if (ret == -ESTALE) + return cachefiles_create_file(object); + return false; + +error_fput: + fput(file); error: - mutex_unlock(&dir->d_inode->i_mutex); - dput(next); -error_out2: - dput(dir); -error_out: - _leave(" = error %d", -ret); - return ret; + cachefiles_do_unmark_inode_in_use(object, d_inode(dentry)); + return false; } /* - * get a subdirectory + * walk from the parent object to the child object through the backing + * filesystem, creating directories as we go */ -struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, - struct dentry *dir, - const char *dirname) +bool cachefiles_look_up_object(struct cachefiles_object *object) { - struct dentry *subdir; - unsigned long start; - struct path path; + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash]; int ret; - _enter(",,%s", dirname); - - /* search the current directory for the element name */ - mutex_lock(&dir->d_inode->i_mutex); - - start = jiffies; - subdir = lookup_one_len(dirname, dir, strlen(dirname)); - cachefiles_hist(cachefiles_lookup_histogram, start); - if (IS_ERR(subdir)) { - if (PTR_ERR(subdir) == -ENOMEM) - goto nomem_d_alloc; - goto lookup_error; + _enter("OBJ%x,%s,", object->debug_id, object->d_name); + + /* Look up path "cache/vol/fanout/file". 
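
To make the shape of that path concrete: a sketch of reconstructing the backing name, assuming the per-volume fanout directories are indexed by the low byte of the cookie's key hash (as the fanout[] lookups in this file do) and that the on-disk fanout names are the hex byte, "@00".."@ff" (an assumption; volume.c is outside this hunk):

    #include <stdint.h>
    #include <stdio.h>

    /* cache/vol/fanout/file: volume dir, one of 256 fanout buckets,
     * then the cooked object name from cachefiles_cook_key(). */
    static void backing_path(char *buf, size_t len, const char *volume,
                             uint32_t key_hash, const char *d_name)
    {
            snprintf(buf, len, "%s/@%02x/%s",
                     volume, (uint8_t)key_hash, d_name);
    }
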
*/ + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = lookup_one_positive_unlocked(&nop_mnt_idmap, + &QSTR(object->d_name), fan); + else + dentry = ERR_PTR(ret); + trace_cachefiles_lookup(object, fan, dentry); + if (IS_ERR(dentry)) { + if (dentry == ERR_PTR(-ENOENT)) + goto new_file; + if (dentry == ERR_PTR(-EIO)) + cachefiles_io_error_obj(object, "Lookup failed"); + return false; + } + + if (!d_is_reg(dentry)) { + pr_err("%pd is not a file\n", dentry); + struct dentry *de = start_removing_dentry(fan, dentry); + if (IS_ERR(de)) + ret = PTR_ERR(de); + else + ret = cachefiles_bury_object(volume->cache, object, + fan, de, + FSCACHE_OBJECT_IS_WEIRD); + dput(dentry); + if (ret < 0) + return false; + goto new_file; } - _debug("subdir -> %p %s", - subdir, subdir->d_inode ? "positive" : "negative"); + ret = cachefiles_open_file(object, dentry); + dput(dentry); + if (!ret) + return false; - /* we need to create the subdir if it doesn't exist yet */ - if (!subdir->d_inode) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto mkdir_error; + _leave(" = t [%lu]", file_inode(object->file)->i_ino); + return true; - _debug("attempt mkdir"); +new_file: + fscache_cookie_lookup_negative(object->cookie); + return cachefiles_create_file(object); +} - path.mnt = cache->mnt; - path.dentry = dir; - ret = security_path_mkdir(&path, subdir, 0700); - if (ret < 0) - goto mkdir_error; - ret = vfs_mkdir(dir->d_inode, subdir, 0700); - if (ret < 0) - goto mkdir_error; +/* + * Attempt to link a temporary file into its rightful place in the cache. + */ +bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash]; + bool success = false; + int ret; - ASSERT(subdir->d_inode); + _enter(",%pD", object->file); - _debug("mkdir -> %p{%p{ino=%lu}}", - subdir, - subdir->d_inode, - subdir->d_inode->i_ino); + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = start_creating(&nop_mnt_idmap, fan, &QSTR(object->d_name)); + else + dentry = ERR_PTR(ret); + if (IS_ERR(dentry)) { + trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), + cachefiles_trace_lookup_error); + _debug("lookup fail %ld", PTR_ERR(dentry)); + goto out; } - mutex_unlock(&dir->d_inode->i_mutex); + /* + * This loop will only execute more than once if some other thread + * races to create the object we are trying to create. 
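
The same race and retry can be reproduced from user space with linkat(): try to give the anonymous file its name, and if someone else got there first, unlink the stale entry and try again. A sketch under that assumption:

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Commit an O_TMPFILE fd under 'name' in dirfd, displacing any
     * stale file that raced into place, much like the loop below. */
    static int commit_name(int dirfd, int fd, const char *name)
    {
            char src[48];

            snprintf(src, sizeof(src), "/proc/self/fd/%d", fd);
            for (;;) {
                    if (linkat(AT_FDCWD, src, dirfd, name,
                               AT_SYMLINK_FOLLOW) == 0)
                            return 0;
                    if (errno != EEXIST || unlinkat(dirfd, name, 0) < 0)
                            return -1;
            }
    }
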
+ */ + while (!d_is_negative(dentry)) { + ret = cachefiles_unlink(volume->cache, object, fan, dentry, + FSCACHE_OBJECT_IS_STALE); + if (ret < 0) + goto out_end; - /* we need to make sure the subdir is a directory */ - ASSERT(subdir->d_inode); + end_creating(dentry); - if (!S_ISDIR(subdir->d_inode->i_mode)) { - kerror("%s is not a directory", dirname); - ret = -EIO; - goto check_error; + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = start_creating(&nop_mnt_idmap, fan, + &QSTR(object->d_name)); + else + dentry = ERR_PTR(ret); + if (IS_ERR(dentry)) { + trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), + cachefiles_trace_lookup_error); + _debug("lookup fail %ld", PTR_ERR(dentry)); + goto out; + } } - ret = -EPERM; - if (!subdir->d_inode->i_op || - !subdir->d_inode->i_op->setxattr || - !subdir->d_inode->i_op->getxattr || - !subdir->d_inode->i_op->lookup || - !subdir->d_inode->i_op->mkdir || - !subdir->d_inode->i_op->create || - !subdir->d_inode->i_op->rename || - !subdir->d_inode->i_op->rmdir || - !subdir->d_inode->i_op->unlink) - goto check_error; - - _leave(" = [%lu]", subdir->d_inode->i_ino); - return subdir; - -check_error: - dput(subdir); - _leave(" = %d [check]", ret); - return ERR_PTR(ret); - -mkdir_error: - mutex_unlock(&dir->d_inode->i_mutex); - dput(subdir); - kerror("mkdir %s failed with error %d", dirname, ret); - return ERR_PTR(ret); - -lookup_error: - mutex_unlock(&dir->d_inode->i_mutex); - ret = PTR_ERR(subdir); - kerror("Lookup %s failed with error %d", dirname, ret); - return ERR_PTR(ret); - -nomem_d_alloc: - mutex_unlock(&dir->d_inode->i_mutex); - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_link(object->file->f_path.dentry, &nop_mnt_idmap, + d_inode(fan), dentry, NULL); + if (ret < 0) { + trace_cachefiles_vfs_error(object, d_inode(fan), ret, + cachefiles_trace_link_error); + _debug("link fail %d", ret); + } else { + trace_cachefiles_link(object, file_inode(object->file)); + spin_lock(&object->lock); + /* TODO: Do we want to switch the file pointer to the new dentry? */ + clear_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + spin_unlock(&object->lock); + success = true; + } + +out_end: + end_creating(dentry); +out: + _leave(" = %u", success); + return success; } /* - * find out if an object is in use or not - * - if finds object and it's not in use: - * - returns a pointer to the object and a reference on it - * - returns with the directory locked + * Look up an inode to be checked or culled. Return -EBUSY if the inode is + * marked in use. */ -static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, - struct dentry *dir, - char *filename) +static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache, + struct dentry *dir, + char *filename) { - struct cachefiles_object *object; - struct rb_node *_n; struct dentry *victim; - unsigned long start; - int ret; - - //_enter(",%*.*s/,%s", - // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + int ret = -ENOENT; - /* look up the victim */ - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + victim = start_removing(&nop_mnt_idmap, dir, &QSTR(filename)); - start = jiffies; - victim = lookup_one_len(filename, dir, strlen(filename)); - cachefiles_hist(cachefiles_lookup_histogram, start); if (IS_ERR(victim)) goto lookup_error; - - //_debug("victim -> %p %s", - // victim, victim->d_inode ? 
"positive" : "negative"); - - /* if the object is no longer there then we probably retired the object - * at the netfs's request whilst the cull was in progress - */ - if (!victim->d_inode) { - mutex_unlock(&dir->d_inode->i_mutex); - dput(victim); - _leave(" = -ENOENT [absent]"); - return ERR_PTR(-ENOENT); - } - - /* check to see if we're using this object */ - read_lock(&cache->active_lock); - - _n = cache->active_nodes.rb_node; - - while (_n) { - object = rb_entry(_n, struct cachefiles_object, active_node); - - if (object->dentry > victim) - _n = _n->rb_left; - else if (object->dentry < victim) - _n = _n->rb_right; - else - goto object_in_use; - } - - read_unlock(&cache->active_lock); - - //_leave(" = %p", victim); + if (d_inode(victim)->i_flags & S_KERNEL_FILE) + goto lookup_busy; return victim; -object_in_use: - read_unlock(&cache->active_lock); - mutex_unlock(&dir->d_inode->i_mutex); - dput(victim); - //_leave(" = -EBUSY [in use]"); - return ERR_PTR(-EBUSY); +lookup_busy: + ret = -EBUSY; + end_removing(victim); + return ERR_PTR(ret); lookup_error: - mutex_unlock(&dir->d_inode->i_mutex); ret = PTR_ERR(victim); - if (ret == -ENOENT) { - /* file or dir now absent - probably retired by netfs */ - _leave(" = -ESTALE [absent]"); - return ERR_PTR(-ESTALE); - } + if (ret == -ENOENT) + return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */ if (ret == -EIO) { cachefiles_io_error(cache, "Lookup failed"); } else if (ret != -ENOMEM) { - kerror("Internal error: %d", ret); + pr_err("Internal error: %d\n", ret); ret = -EIO; } - _leave(" = %d", ret); return ERR_PTR(ret); } /* - * cull an object if it's not in use + * Cull an object if it's not in use * - called only by cache manager daemon */ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; + struct inode *inode; int ret; - _enter(",%*.*s/,%s", - dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + _enter(",%pd/,%s", dir, filename); - victim = cachefiles_check_active(cache, dir, filename); + victim = cachefiles_lookup_for_cull(cache, dir, filename); if (IS_ERR(victim)) return PTR_ERR(victim); - _debug("victim -> %p %s", - victim, victim->d_inode ? "positive" : "negative"); - - /* okay... 
the victim is not being used so we can cull it - * - start by marking it as stale - */ - _debug("victim is cullable"); - - ret = cachefiles_remove_object_xattr(cache, victim); + /* check to see if someone is using this object */ + inode = d_inode(victim); + inode_lock(inode); + if (inode->i_flags & S_KERNEL_FILE) { + ret = -EBUSY; + } else { + /* Stop the cache from picking it back up */ + inode->i_flags |= S_KERNEL_FILE; + ret = 0; + } + inode_unlock(inode); if (ret < 0) goto error_unlock; - /* actually remove the victim (drops the dir mutex) */ - _debug("bury"); - - ret = cachefiles_bury_object(cache, dir, victim, false); + ret = cachefiles_bury_object(cache, NULL, dir, victim, + FSCACHE_OBJECT_WAS_CULLED); + dput(victim); if (ret < 0) goto error; - dput(victim); + fscache_count_culled(); _leave(" = 0"); return 0; error_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + end_removing(victim); error: - dput(victim); - if (ret == -ENOENT) { - /* file or dir now absent - probably retired by netfs */ - _leave(" = -ESTALE [absent]"); - return -ESTALE; - } + if (ret == -ENOENT) + return -ESTALE; /* Probably got retired by the netfs */ if (ret != -ENOMEM) { - kerror("Internal error: %d", ret); + pr_err("Internal error: %d\n", ret); ret = -EIO; } @@ -965,7 +836,7 @@ error: } /* - * find out if an object is in use or not + * Find out if an object is in use or not * - called only by cache manager daemon * - returns -EBUSY or 0 to indicate whether an object is in use or not */ @@ -973,16 +844,13 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; + int ret = 0; - //_enter(",%*.*s/,%s", - // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); - - victim = cachefiles_check_active(cache, dir, filename); + victim = cachefiles_lookup_for_cull(cache, dir, filename); if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); - //_leave(" = 0"); - return 0; + return ret; } diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c new file mode 100644 index 000000000000..a7ed86fa98bb --- /dev/null +++ b/fs/cachefiles/ondemand.c @@ -0,0 +1,762 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/anon_inodes.h> +#include <linux/uio.h> +#include "internal.h" + +struct ondemand_anon_file { + struct file *file; + int fd; +}; + +static inline void cachefiles_req_put(struct cachefiles_req *req) +{ + if (refcount_dec_and_test(&req->ref)) + kfree(req); +} + +static int cachefiles_ondemand_fd_release(struct inode *inode, + struct file *file) +{ + struct cachefiles_object *object = file->private_data; + struct cachefiles_cache *cache; + struct cachefiles_ondemand_info *info; + int object_id; + struct cachefiles_req *req; + XA_STATE(xas, NULL, 0); + + if (!object) + return 0; + + info = object->ondemand; + cache = object->volume->cache; + xas.xa = &cache->reqs; + + xa_lock(&cache->reqs); + spin_lock(&info->lock); + object_id = info->ondemand_id; + info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + cachefiles_ondemand_set_object_close(object); + spin_unlock(&info->lock); + + /* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */ + xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { + if (req->msg.object_id == object_id && + req->msg.opcode == CACHEFILES_OP_CLOSE) { + complete(&req->done); + xas_store(&xas, NULL); + } + } + xa_unlock(&cache->reqs); + + xa_erase(&cache->ondemand_ids, object_id); + 
trace_cachefiles_ondemand_fd_release(object, object_id); + cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); + cachefiles_put_unbind_pincount(cache); + return 0; +} + +static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, + struct iov_iter *iter) +{ + struct cachefiles_object *object = kiocb->ki_filp->private_data; + struct cachefiles_cache *cache = object->volume->cache; + struct file *file; + size_t len = iter->count, aligned_len = len; + loff_t pos = kiocb->ki_pos; + const struct cred *saved_cred; + int ret; + + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); + return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); + + cachefiles_begin_secure(cache, &saved_cred); + ret = __cachefiles_prepare_write(object, file, &pos, &aligned_len, len, true); + cachefiles_end_secure(cache, saved_cred); + if (ret < 0) + goto out; + + trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len); + ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); + if (ret > 0) + kiocb->ki_pos += ret; + +out: + fput(file); + return ret; +} + +static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, + int whence) +{ + struct cachefiles_object *object = filp->private_data; + struct file *file; + loff_t ret; + + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); + return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); + + ret = vfs_llseek(file, pos, whence); + fput(file); + + return ret; +} + +static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, + unsigned long id) +{ + struct cachefiles_object *object = filp->private_data; + struct cachefiles_cache *cache = object->volume->cache; + struct cachefiles_req *req; + XA_STATE(xas, &cache->reqs, id); + + if (ioctl != CACHEFILES_IOC_READ_COMPLETE) + return -EINVAL; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + xa_lock(&cache->reqs); + req = xas_load(&xas); + if (!req || req->msg.opcode != CACHEFILES_OP_READ || + req->object != object) { + xa_unlock(&cache->reqs); + return -EINVAL; + } + xas_store(&xas, NULL); + xa_unlock(&cache->reqs); + + trace_cachefiles_ondemand_cread(object, id); + complete(&req->done); + return 0; +} + +static const struct file_operations cachefiles_ondemand_fd_fops = { + .owner = THIS_MODULE, + .release = cachefiles_ondemand_fd_release, + .write_iter = cachefiles_ondemand_fd_write_iter, + .llseek = cachefiles_ondemand_fd_llseek, + .unlocked_ioctl = cachefiles_ondemand_fd_ioctl, +}; + +/* + * OPEN request Completion (copen) + * - command: "copen <id>,<cache_size>" + * <cache_size> indicates the object size if >=0, error code if negative + */ +int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) +{ + struct cachefiles_req *req; + struct fscache_cookie *cookie; + struct cachefiles_ondemand_info *info; + char *pid, *psize; + unsigned long id; + long size; + int ret; + XA_STATE(xas, &cache->reqs, 0); + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + if (!*args) { + pr_err("Empty id specified\n"); + return -EINVAL; + } + + pid = args; + psize = strchr(args, ','); + if (!psize) { + pr_err("Cache size is not specified\n"); + return -EINVAL; + } + + *psize = 0; + psize++; + + ret = kstrtoul(pid, 0, &id); + if (ret) + return ret; + + xa_lock(&cache->reqs); + xas.xa_index = id; + req = xas_load(&xas); + if (!req || req->msg.opcode != CACHEFILES_OP_OPEN || + 
!req->object->ondemand->ondemand_id) {
+ xa_unlock(&cache->reqs);
+ return -EINVAL;
+ }
+ xas_store(&xas, NULL);
+ xa_unlock(&cache->reqs);
+
+ info = req->object->ondemand;
+ /* fail the OPEN request if the copen format is invalid */
+ ret = kstrtol(psize, 0, &size);
+ if (ret) {
+ req->error = ret;
+ goto out;
+ }
+
+ /* fail the OPEN request if the daemon reports an error */
+ if (size < 0) {
+ if (!IS_ERR_VALUE(size)) {
+ req->error = -EINVAL;
+ ret = -EINVAL;
+ } else {
+ req->error = size;
+ ret = 0;
+ }
+ goto out;
+ }
+
+ spin_lock(&info->lock);
+ /*
+ * Was the anonymous fd closed before copen? Fail the request.
+ *
+ *             t1             |             t2
+ * ---------------------------------------------------------
+ *                             cachefiles_ondemand_copen
+ *                             req = xa_erase(&cache->reqs, id)
+ * // Anon fd is maliciously closed.
+ * cachefiles_ondemand_fd_release
+ * xa_lock(&cache->reqs)
+ * cachefiles_ondemand_set_object_close(object)
+ * xa_unlock(&cache->reqs)
+ *                             cachefiles_ondemand_set_object_open
+ *                             // No one will ever close it again.
+ * cachefiles_ondemand_daemon_read
+ * cachefiles_ondemand_select_req
+ *
+ * A read req is then handed out whose fd is already closed; the daemon
+ * can't issue a cread ioctl against a closed fd, so it hangs.
+ */
+ if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) {
+ spin_unlock(&info->lock);
+ req->error = -EBADFD;
+ goto out;
+ }
+ cookie = req->object->cookie;
+ cookie->object_size = size;
+ if (size)
+ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ else
+ set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ trace_cachefiles_ondemand_copen(req->object, id, size);
+
+ cachefiles_ondemand_set_object_open(req->object);
+ spin_unlock(&info->lock);
+ wake_up_all(&cache->daemon_pollwq);
+
+out:
+ spin_lock(&info->lock);
+ /* Set the object to the closed state so it doesn't linger as reopening */
+ if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED)
+ cachefiles_ondemand_set_object_close(req->object);
+ spin_unlock(&info->lock);
+ complete(&req->done);
+ return ret;
+}
+
+int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args)
+{
+ struct cachefiles_req *req;
+
+ XA_STATE(xas, &cache->reqs, 0);
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ /*
+ * Reset the requests to the CACHEFILES_REQ_NEW state, so that requests
+ * that had been processed halfway when the user daemon crashed can be
+ * reprocessed after the recovery.
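
On the daemon side this is the failover hook: a freshly restarted instance writes one command to the control fd and every half-processed request comes back as if new. A sketch, assuming "restore" is the command string bound to the handler above:

    #include <unistd.h>

    /* Requeue everything that was in flight when the previous daemon
     * instance died, then resume the normal read loop. */
    static int recover_inflight(int devfd)
    {
            static const char cmd[] = "restore";

            return write(devfd, cmd, sizeof(cmd) - 1) < 0 ? -1 : 0;
    }
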
+ */ + xas_lock(&xas); + xas_for_each(&xas, req, ULONG_MAX) + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + xas_unlock(&xas); + + wake_up_all(&cache->daemon_pollwq); + return 0; +} + +static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, + struct ondemand_anon_file *anon_file) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct cachefiles_open *load; + u32 object_id; + int ret; + + object = cachefiles_grab_object(req->object, + cachefiles_obj_get_ondemand_fd); + cache = object->volume->cache; + + ret = xa_alloc_cyclic(&cache->ondemand_ids, &object_id, NULL, + XA_LIMIT(1, INT_MAX), + &cache->ondemand_id_next, GFP_KERNEL); + if (ret < 0) + goto err; + + anon_file->fd = get_unused_fd_flags(O_WRONLY); + if (anon_file->fd < 0) { + ret = anon_file->fd; + goto err_free_id; + } + + anon_file->file = anon_inode_getfile_fmode("[cachefiles]", + &cachefiles_ondemand_fd_fops, object, + O_WRONLY, FMODE_PWRITE | FMODE_LSEEK); + if (IS_ERR(anon_file->file)) { + ret = PTR_ERR(anon_file->file); + goto err_put_fd; + } + + spin_lock(&object->ondemand->lock); + if (object->ondemand->ondemand_id > 0) { + spin_unlock(&object->ondemand->lock); + /* Pair with check in cachefiles_ondemand_fd_release(). */ + anon_file->file->private_data = NULL; + ret = -EEXIST; + goto err_put_file; + } + + load = (void *)req->msg.data; + load->fd = anon_file->fd; + object->ondemand->ondemand_id = object_id; + spin_unlock(&object->ondemand->lock); + + cachefiles_get_unbind_pincount(cache); + trace_cachefiles_ondemand_open(object, &req->msg, load); + return 0; + +err_put_file: + fput(anon_file->file); + anon_file->file = NULL; +err_put_fd: + put_unused_fd(anon_file->fd); + anon_file->fd = ret; +err_free_id: + xa_erase(&cache->ondemand_ids, object_id); +err: + spin_lock(&object->ondemand->lock); + /* Avoid marking an opened object as closed. */ + if (object->ondemand->ondemand_id <= 0) + cachefiles_ondemand_set_object_close(object); + spin_unlock(&object->ondemand->lock); + cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); + return ret; +} + +static void ondemand_object_worker(struct work_struct *work) +{ + struct cachefiles_ondemand_info *info = + container_of(work, struct cachefiles_ondemand_info, ondemand_work); + + cachefiles_ondemand_init_object(info->object); +} + +/* + * If there are any inflight or subsequent READ requests on the + * closed object, reopen it. + * Skip read requests whose related object is reopening. 
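
The selector below leans on xarray marks for its visibility rule: a request stops being offered to the daemon the moment its NEW mark is cleared, with no separate queue to maintain. A generic kernel-side sketch of that idiom:

    #include <linux/xarray.h>

    /* Count entries still carrying a given mark, e.g. requests not yet
     * handed to the daemon; everything else is skipped for free. */
    static unsigned long count_marked(struct xarray *xa, xa_mark_t mark)
    {
            unsigned long index, n = 0;
            void *entry;

            xa_for_each_marked(xa, index, entry, mark)
                    n++;
            return n;
    }
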
+ */ +static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xas, + unsigned long xa_max) +{ + struct cachefiles_req *req; + struct cachefiles_object *object; + struct cachefiles_ondemand_info *info; + + xas_for_each_marked(xas, req, xa_max, CACHEFILES_REQ_NEW) { + if (req->msg.opcode != CACHEFILES_OP_READ) + return req; + object = req->object; + info = object->ondemand; + if (cachefiles_ondemand_object_is_close(object)) { + cachefiles_ondemand_set_object_reopening(object); + queue_work(fscache_wq, &info->ondemand_work); + continue; + } + if (cachefiles_ondemand_object_is_reopening(object)) + continue; + return req; + } + return NULL; +} + +static inline bool cachefiles_ondemand_finish_req(struct cachefiles_req *req, + struct xa_state *xas, int err) +{ + if (unlikely(!xas || !req)) + return false; + + if (xa_cmpxchg(xas->xa, xas->xa_index, req, NULL, 0) != req) + return false; + + req->error = err; + complete(&req->done); + return true; +} + +ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) +{ + struct cachefiles_req *req; + struct cachefiles_msg *msg; + size_t n; + int ret = 0; + struct ondemand_anon_file anon_file; + XA_STATE(xas, &cache->reqs, cache->req_id_next); + + xa_lock(&cache->reqs); + /* + * Cyclically search for a request that has not ever been processed, + * to prevent requests from being processed repeatedly, and make + * request distribution fair. + */ + req = cachefiles_ondemand_select_req(&xas, ULONG_MAX); + if (!req && cache->req_id_next > 0) { + xas_set(&xas, 0); + req = cachefiles_ondemand_select_req(&xas, cache->req_id_next - 1); + } + if (!req) { + xa_unlock(&cache->reqs); + return 0; + } + + msg = &req->msg; + n = msg->len; + + if (n > buflen) { + xa_unlock(&cache->reqs); + return -EMSGSIZE; + } + + xas_clear_mark(&xas, CACHEFILES_REQ_NEW); + cache->req_id_next = xas.xa_index + 1; + refcount_inc(&req->ref); + cachefiles_grab_object(req->object, cachefiles_obj_get_read_req); + xa_unlock(&cache->reqs); + + if (msg->opcode == CACHEFILES_OP_OPEN) { + ret = cachefiles_ondemand_get_fd(req, &anon_file); + if (ret) + goto out; + } + + msg->msg_id = xas.xa_index; + msg->object_id = req->object->ondemand->ondemand_id; + + if (copy_to_user(_buffer, msg, n) != 0) + ret = -EFAULT; + + if (msg->opcode == CACHEFILES_OP_OPEN) { + if (ret < 0) { + fput(anon_file.file); + put_unused_fd(anon_file.fd); + goto out; + } + fd_install(anon_file.fd, anon_file.file); + } +out: + cachefiles_put_object(req->object, cachefiles_obj_put_read_req); + /* Remove error request and CLOSE request has no reply */ + if (ret || msg->opcode == CACHEFILES_OP_CLOSE) + cachefiles_ondemand_finish_req(req, &xas, ret); + cachefiles_req_put(req); + return ret ? 
ret : n; +} + +typedef int (*init_req_fn)(struct cachefiles_req *req, void *private); + +static int cachefiles_ondemand_send_req(struct cachefiles_object *object, + enum cachefiles_opcode opcode, + size_t data_len, + init_req_fn init_req, + void *private) +{ + struct cachefiles_cache *cache = object->volume->cache; + struct cachefiles_req *req = NULL; + XA_STATE(xas, &cache->reqs, 0); + int ret; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return 0; + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + ret = -EIO; + goto out; + } + + req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL); + if (!req) { + ret = -ENOMEM; + goto out; + } + + refcount_set(&req->ref, 1); + req->object = object; + init_completion(&req->done); + req->msg.opcode = opcode; + req->msg.len = sizeof(struct cachefiles_msg) + data_len; + + ret = init_req(req, private); + if (ret) + goto out; + + do { + /* + * Stop enqueuing the request when daemon is dying. The + * following two operations need to be atomic as a whole. + * 1) check cache state, and + * 2) enqueue request if cache is alive. + * Otherwise the request may be enqueued after xarray has been + * flushed, leaving the orphan request never being completed. + * + * CPU 1 CPU 2 + * ===== ===== + * test CACHEFILES_DEAD bit + * set CACHEFILES_DEAD bit + * flush requests in the xarray + * enqueue the request + */ + xas_lock(&xas); + + if (test_bit(CACHEFILES_DEAD, &cache->flags) || + cachefiles_ondemand_object_is_dropping(object)) { + xas_unlock(&xas); + ret = -EIO; + goto out; + } + + /* coupled with the barrier in cachefiles_flush_reqs() */ + smp_mb(); + + if (opcode == CACHEFILES_OP_CLOSE && + !cachefiles_ondemand_object_is_open(object)) { + WARN_ON_ONCE(object->ondemand->ondemand_id == 0); + xas_unlock(&xas); + ret = -EIO; + goto out; + } + + /* + * Cyclically find a free xas to avoid msg_id reuse that would + * cause the daemon to successfully copen a stale msg_id. + */ + xas.xa_index = cache->msg_id_next; + xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) { + xas.xa_index = 0; + xas_find_marked(&xas, cache->msg_id_next - 1, XA_FREE_MARK); + } + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -EBUSY); + + xas_store(&xas, req); + if (xas_valid(&xas)) { + cache->msg_id_next = xas.xa_index + 1; + xas_clear_mark(&xas, XA_FREE_MARK); + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + ret = xas_error(&xas); + if (ret) + goto out; + + wake_up_all(&cache->daemon_pollwq); +wait: + ret = wait_for_completion_killable(&req->done); + if (!ret) { + ret = req->error; + } else { + ret = -EINTR; + if (!cachefiles_ondemand_finish_req(req, &xas, ret)) { + /* Someone will complete it soon. */ + cpu_relax(); + goto wait; + } + } + cachefiles_req_put(req); + return ret; +out: + /* Reset the object to close state in error handling path. + * If error occurs after creating the anonymous fd, + * cachefiles_ondemand_fd_release() will set object to close. 
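	 * (An object already marked as dropping is left alone below, since
	 * cachefiles_ondemand_clean_object() has taken over its teardown.)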
+ */ + if (opcode == CACHEFILES_OP_OPEN && + !cachefiles_ondemand_object_is_dropping(object)) + cachefiles_ondemand_set_object_close(object); + kfree(req); + return ret; +} + +static int cachefiles_ondemand_init_open_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + struct fscache_cookie *cookie = object->cookie; + struct fscache_volume *volume = object->volume->vcookie; + struct cachefiles_open *load = (void *)req->msg.data; + size_t volume_key_size, cookie_key_size; + void *volume_key, *cookie_key; + + /* + * Volume key is a NUL-terminated string. key[0] stores strlen() of the + * string, followed by the content of the string (excluding '\0'). + */ + volume_key_size = volume->key[0] + 1; + volume_key = volume->key + 1; + + /* Cookie key is binary data, which is netfs specific. */ + cookie_key_size = cookie->key_len; + cookie_key = fscache_get_key(cookie); + + if (!(object->cookie->advice & FSCACHE_ADV_WANT_CACHE_SIZE)) { + pr_err("WANT_CACHE_SIZE is needed for on-demand mode\n"); + return -EINVAL; + } + + load->volume_key_size = volume_key_size; + load->cookie_key_size = cookie_key_size; + memcpy(load->data, volume_key, volume_key_size); + memcpy(load->data + volume_key_size, cookie_key, cookie_key_size); + + return 0; +} + +static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + + if (!cachefiles_ondemand_object_is_open(object)) + return -ENOENT; + + trace_cachefiles_ondemand_close(object, &req->msg); + return 0; +} + +struct cachefiles_read_ctx { + loff_t off; + size_t len; +}; + +static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + struct cachefiles_read *load = (void *)req->msg.data; + struct cachefiles_read_ctx *read_ctx = private; + + load->off = read_ctx->off; + load->len = read_ctx->len; + trace_cachefiles_ondemand_read(object, &req->msg, load); + return 0; +} + +int cachefiles_ondemand_init_object(struct cachefiles_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + struct fscache_volume *volume = object->volume->vcookie; + size_t volume_key_size, cookie_key_size, data_len; + + if (!object->ondemand) + return 0; + + /* + * CacheFiles will firstly check the cache file under the root cache + * directory. If the coherency check failed, it will fallback to + * creating a new tmpfile as the cache file. Reuse the previously + * allocated object ID if any. + */ + if (cachefiles_ondemand_object_is_open(object)) + return 0; + + volume_key_size = volume->key[0] + 1; + cookie_key_size = cookie->key_len; + data_len = sizeof(struct cachefiles_open) + + volume_key_size + cookie_key_size; + + return cachefiles_ondemand_send_req(object, CACHEFILES_OP_OPEN, + data_len, cachefiles_ondemand_init_open_req, NULL); +} + +void cachefiles_ondemand_clean_object(struct cachefiles_object *object) +{ + unsigned long index; + struct cachefiles_req *req; + struct cachefiles_cache *cache; + + if (!object->ondemand) + return; + + cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, + cachefiles_ondemand_init_close_req, NULL); + + if (!object->ondemand->ondemand_id) + return; + + /* Cancel all requests for the object that is being dropped. 
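	 * In-flight requests are completed with -EIO and erased from the
	 * xarray so that no waiter is left behind once the entries go away.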
*/ + cache = object->volume->cache; + xa_lock(&cache->reqs); + cachefiles_ondemand_set_object_dropping(object); + xa_for_each(&cache->reqs, index, req) { + if (req->object == object) { + req->error = -EIO; + complete(&req->done); + __xa_erase(&cache->reqs, index); + } + } + xa_unlock(&cache->reqs); + + /* Wait for ondemand_object_worker() to finish to avoid UAF. */ + cancel_work_sync(&object->ondemand->ondemand_work); +} + +int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, + struct cachefiles_volume *volume) +{ + if (!cachefiles_in_ondemand_mode(volume->cache)) + return 0; + + object->ondemand = kzalloc(sizeof(struct cachefiles_ondemand_info), + GFP_KERNEL); + if (!object->ondemand) + return -ENOMEM; + + object->ondemand->object = object; + spin_lock_init(&object->ondemand->lock); + INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker); + return 0; +} + +void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *object) +{ + kfree(object->ondemand); + object->ondemand = NULL; +} + +int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len) +{ + struct cachefiles_read_ctx read_ctx = {pos, len}; + + return cachefiles_ondemand_send_req(object, CACHEFILES_OP_READ, + sizeof(struct cachefiles_read), + cachefiles_ondemand_init_read_req, &read_ctx); +} diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c deleted file mode 100644 index eccd33941199..000000000000 --- a/fs/cachefiles/proc.c +++ /dev/null @@ -1,134 +0,0 @@ -/* CacheFiles statistics - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include "internal.h" - -atomic_t cachefiles_lookup_histogram[HZ]; -atomic_t cachefiles_mkdir_histogram[HZ]; -atomic_t cachefiles_create_histogram[HZ]; - -/* - * display the latency histogram - */ -static int cachefiles_histogram_show(struct seq_file *m, void *v) -{ - unsigned long index; - unsigned x, y, z, t; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n"); - return 0; - case 2: - seq_puts(m, "===== ===== ========= ========= =========\n"); - return 0; - default: - index = (unsigned long) v - 3; - x = atomic_read(&cachefiles_lookup_histogram[index]); - y = atomic_read(&cachefiles_mkdir_histogram[index]); - z = atomic_read(&cachefiles_create_histogram[index]); - if (x == 0 && y == 0 && z == 0) - return 0; - - t = (index * 1000) / HZ; - - seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z); - return 0; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos) -{ - if ((unsigned long long)*_pos >= HZ + 2) - return NULL; - if (*_pos == 0) - *_pos = 1; - return (void *)(unsigned long) *_pos; -} - -/* - * move to the next line - */ -static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return (unsigned long long)*pos > HZ + 2 ? 
- NULL : (void *)(unsigned long) *pos; -} - -/* - * clean up after reading - */ -static void cachefiles_histogram_stop(struct seq_file *m, void *v) -{ -} - -static const struct seq_operations cachefiles_histogram_ops = { - .start = cachefiles_histogram_start, - .stop = cachefiles_histogram_stop, - .next = cachefiles_histogram_next, - .show = cachefiles_histogram_show, -}; - -/* - * open "/proc/fs/cachefiles/XXX" which provide statistics summaries - */ -static int cachefiles_histogram_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &cachefiles_histogram_ops); -} - -static const struct file_operations cachefiles_histogram_fops = { - .owner = THIS_MODULE, - .open = cachefiles_histogram_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -/* - * initialise the /proc/fs/cachefiles/ directory - */ -int __init cachefiles_proc_init(void) -{ - _enter(""); - - if (!proc_mkdir("fs/cachefiles", NULL)) - goto error_dir; - - if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, - &cachefiles_histogram_fops)) - goto error_histogram; - - _leave(" = 0"); - return 0; - -error_histogram: - remove_proc_entry("fs/cachefiles", NULL); -error_dir: - _leave(" = -ENOMEM"); - return -ENOMEM; -} - -/* - * clean up the /proc/fs/cachefiles/ directory - */ -void cachefiles_proc_cleanup(void) -{ - remove_proc_entry("fs/cachefiles/histogram", NULL); - remove_proc_entry("fs/cachefiles", NULL); -} diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c deleted file mode 100644 index ebaff368120d..000000000000 --- a/fs/cachefiles/rdwr.c +++ /dev/null @@ -1,990 +0,0 @@ -/* Storage object read/write - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/mount.h> -#include <linux/slab.h> -#include <linux/file.h> -#include <linux/swap.h> -#include "internal.h" - -/* - * detect wake up events generated by the unlocking of pages in which we're - * interested - * - we use this to detect read completion of backing pages - * - the caller holds the waitqueue lock - */ -static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, - int sync, void *_key) -{ - struct cachefiles_one_read *monitor = - container_of(wait, struct cachefiles_one_read, monitor); - struct cachefiles_object *object; - struct wait_bit_key *key = _key; - struct page *page = wait->private; - - ASSERT(key); - - _enter("{%lu},%u,%d,{%p,%u}", - monitor->netfs_page->index, mode, sync, - key->flags, key->bit_nr); - - if (key->flags != &page->flags || - key->bit_nr != PG_locked) - return 0; - - _debug("--- monitor %p %lx ---", page, page->flags); - - if (!PageUptodate(page) && !PageError(page)) { - /* unlocked, not uptodate and not erronous? 
*/ - _debug("page probably truncated"); - } - - /* remove from the waitqueue */ - list_del(&wait->task_list); - - /* move onto the action list and queue for FS-Cache thread pool */ - ASSERT(monitor->op); - - object = container_of(monitor->op->op.object, - struct cachefiles_object, fscache); - - spin_lock(&object->work_lock); - list_add_tail(&monitor->op_link, &monitor->op->to_do); - spin_unlock(&object->work_lock); - - fscache_enqueue_retrieval(monitor->op); - return 0; -} - -/* - * handle a probably truncated page - * - check to see if the page is still relevant and reissue the read if - * possible - * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we - * must wait again and 0 if successful - */ -static int cachefiles_read_reissue(struct cachefiles_object *object, - struct cachefiles_one_read *monitor) -{ - struct address_space *bmapping = object->backer->d_inode->i_mapping; - struct page *backpage = monitor->back_page, *backpage2; - int ret; - - _enter("{ino=%lx},{%lx,%lx}", - object->backer->d_inode->i_ino, - backpage->index, backpage->flags); - - /* skip if the page was truncated away completely */ - if (backpage->mapping != bmapping) { - _leave(" = -ENODATA [mapping]"); - return -ENODATA; - } - - backpage2 = find_get_page(bmapping, backpage->index); - if (!backpage2) { - _leave(" = -ENODATA [gone]"); - return -ENODATA; - } - - if (backpage != backpage2) { - put_page(backpage2); - _leave(" = -ENODATA [different]"); - return -ENODATA; - } - - /* the page is still there and we already have a ref on it, so we don't - * need a second */ - put_page(backpage2); - - INIT_LIST_HEAD(&monitor->op_link); - add_page_wait_queue(backpage, &monitor->monitor); - - if (trylock_page(backpage)) { - ret = -EIO; - if (PageError(backpage)) - goto unlock_discard; - ret = 0; - if (PageUptodate(backpage)) - goto unlock_discard; - - _debug("reissue read"); - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto unlock_discard; - } - - /* but the page may have been read before the monitor was installed, so - * the monitor may miss the event - so we have to ensure that we do get - * one in such a case */ - if (trylock_page(backpage)) { - _debug("jumpstart %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - - /* it'll reappear on the todo list */ - _leave(" = -EINPROGRESS"); - return -EINPROGRESS; - -unlock_discard: - unlock_page(backpage); - spin_lock_irq(&object->work_lock); - list_del(&monitor->op_link); - spin_unlock_irq(&object->work_lock); - _leave(" = %d", ret); - return ret; -} - -/* - * copy data from backing pages to netfs pages to complete a read operation - * - driven by FS-Cache's thread pool - */ -static void cachefiles_read_copier(struct fscache_operation *_op) -{ - struct cachefiles_one_read *monitor; - struct cachefiles_object *object; - struct fscache_retrieval *op; - struct pagevec pagevec; - int error, max; - - op = container_of(_op, struct fscache_retrieval, op); - object = container_of(op->op.object, - struct cachefiles_object, fscache); - - _enter("{ino=%lu}", object->backer->d_inode->i_ino); - - pagevec_init(&pagevec, 0); - - max = 8; - spin_lock_irq(&object->work_lock); - - while (!list_empty(&op->to_do)) { - monitor = list_entry(op->to_do.next, - struct cachefiles_one_read, op_link); - list_del(&monitor->op_link); - - spin_unlock_irq(&object->work_lock); - - _debug("- copy {%lu}", monitor->back_page->index); - - recheck: - if (test_bit(FSCACHE_COOKIE_INVALIDATING, - &object->fscache.cookie->flags)) { - error = -ESTALE; - } else if 
(PageUptodate(monitor->back_page)) { - copy_highpage(monitor->netfs_page, monitor->back_page); - fscache_mark_page_cached(monitor->op, - monitor->netfs_page); - error = 0; - } else if (!PageError(monitor->back_page)) { - /* the page has probably been truncated */ - error = cachefiles_read_reissue(object, monitor); - if (error == -EINPROGRESS) - goto next; - goto recheck; - } else { - cachefiles_io_error_obj( - object, - "Readpage failed on backing file %lx", - (unsigned long) monitor->back_page->flags); - error = -EIO; - } - - page_cache_release(monitor->back_page); - - fscache_end_io(op, monitor->netfs_page, error); - page_cache_release(monitor->netfs_page); - fscache_retrieval_complete(op, 1); - fscache_put_retrieval(op); - kfree(monitor); - - next: - /* let the thread pool have some air occasionally */ - max--; - if (max < 0 || need_resched()) { - if (!list_empty(&op->to_do)) - fscache_enqueue_retrieval(op); - _leave(" [maxed out]"); - return; - } - - spin_lock_irq(&object->work_lock); - } - - spin_unlock_irq(&object->work_lock); - _leave(""); -} - -/* - * read the corresponding page to the given set from the backing file - * - an uncertain page is simply discarded, to be tried again another time - */ -static int cachefiles_read_backing_file_one(struct cachefiles_object *object, - struct fscache_retrieval *op, - struct page *netpage) -{ - struct cachefiles_one_read *monitor; - struct address_space *bmapping; - struct page *newpage, *backpage; - int ret; - - _enter(""); - - _debug("read back %p{%lu,%d}", - netpage, netpage->index, page_count(netpage)); - - monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); - if (!monitor) - goto nomem; - - monitor->netfs_page = netpage; - monitor->op = fscache_get_retrieval(op); - - init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); - - /* attempt to get hold of the backing page */ - bmapping = object->backer->d_inode->i_mapping; - newpage = NULL; - - for (;;) { - backpage = find_get_page(bmapping, netpage->index); - if (backpage) - goto backing_page_already_present; - - if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); - if (!newpage) - goto nomem_monitor; - } - - ret = add_to_page_cache(newpage, bmapping, - netpage->index, cachefiles_gfp); - if (ret == 0) - goto installed_new_backing_page; - if (ret != -EEXIST) - goto nomem_page; - } - - /* we've installed a new backing page, so now we need to add it - * to the LRU list and start it reading */ -installed_new_backing_page: - _debug("- new %p", newpage); - - backpage = newpage; - newpage = NULL; - - lru_cache_add_file(backpage); - -read_backing_page: - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto read_error; - - /* set the monitor to transfer the data across */ -monitor_backing_page: - _debug("- monitor add"); - - /* install the monitor */ - page_cache_get(monitor->netfs_page); - page_cache_get(backpage); - monitor->back_page = backpage; - monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); - monitor = NULL; - - /* but the page may have been read before the monitor was installed, so - * the monitor may miss the event - so we have to ensure that we do get - * one in such a case */ - if (trylock_page(backpage)) { - _debug("jumpstart %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - goto success; - - /* if the backing page is already present, it can be in one of - * three states: read in progress, read failed or read okay */ -backing_page_already_present: - _debug("- 
present"); - - if (newpage) { - page_cache_release(newpage); - newpage = NULL; - } - - if (PageError(backpage)) - goto io_error; - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate; - - if (!trylock_page(backpage)) - goto monitor_backing_page; - _debug("read %p {%lx}", backpage, backpage->flags); - goto read_backing_page; - - /* the backing page is already up to date, attach the netfs - * page to the pagecache and LRU and copy the data across */ -backing_page_already_uptodate: - _debug("- uptodate"); - - fscache_mark_page_cached(op, netpage); - - copy_highpage(netpage, backpage); - fscache_end_io(op, netpage, 0); - fscache_retrieval_complete(op, 1); - -success: - _debug("success"); - ret = 0; - -out: - if (backpage) - page_cache_release(backpage); - if (monitor) { - fscache_put_retrieval(monitor->op); - kfree(monitor); - } - _leave(" = %d", ret); - return ret; - -read_error: - _debug("read error %d", ret); - if (ret == -ENOMEM) { - fscache_retrieval_complete(op, 1); - goto out; - } -io_error: - cachefiles_io_error_obj(object, "Page read error on backing file"); - fscache_retrieval_complete(op, 1); - ret = -ENOBUFS; - goto out; - -nomem_page: - page_cache_release(newpage); -nomem_monitor: - fscache_put_retrieval(monitor->op); - kfree(monitor); -nomem: - fscache_retrieval_complete(op, 1); - _leave(" = -ENOMEM"); - return -ENOMEM; -} - -/* - * read a page from the cache or allocate a block in which to store it - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if no buffers can be made available - * - returns -ENOBUFS if page is beyond EOF - * - if the page is backed by a block in the cache: - * - a read will be started which will call the callback on completion - * - 0 will be returned - * - else if the page is unbacked: - * - the metadata will be retained - * - -ENODATA will be returned - */ -int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct pagevec pagevec; - struct inode *inode; - sector_t block0, block; - unsigned shift; - int ret; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("{%p},{%lx},,,", object, page->index); - - if (!object->backer) - goto enobufs; - - inode = object->backer->d_inode; - ASSERT(S_ISREG(inode->i_mode)); - ASSERT(inode->i_mapping->a_ops->bmap); - ASSERT(inode->i_mapping->a_ops->readpages); - - /* calculate the shift required to use bmap */ - if (inode->i_sb->s_blocksize > PAGE_SIZE) - goto enobufs; - - shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - - op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_ASYNC; - op->op.processor = cachefiles_read_copier; - - pagevec_init(&pagevec, 0); - - /* we assume the absence or presence of the first block is a good - * enough indication for the page as a whole - * - TODO: don't use bmap() for this as it is _not_ actually good - * enough for this as it doesn't indicate errors, but it's all we've - * got for the moment - */ - block0 = page->index; - block0 <<= shift; - - block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0); - _debug("%llx -> %llx", - (unsigned long long) block0, - (unsigned long long) block); - - if (block) { - /* submit the apparently valid page to the backing fs to be - * read from disk */ - ret = 
cachefiles_read_backing_file_one(object, op, page); - } else if (cachefiles_has_space(cache, 0, 1) == 0) { - /* there's space in the cache we can use */ - fscache_mark_page_cached(op, page); - fscache_retrieval_complete(op, 1); - ret = -ENODATA; - } else { - goto enobufs; - } - - _leave(" = %d", ret); - return ret; - -enobufs: - fscache_retrieval_complete(op, 1); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} - -/* - * read the corresponding pages to the given set from the backing file - * - any uncertain pages are simply discarded, to be tried again another time - */ -static int cachefiles_read_backing_file(struct cachefiles_object *object, - struct fscache_retrieval *op, - struct list_head *list) -{ - struct cachefiles_one_read *monitor = NULL; - struct address_space *bmapping = object->backer->d_inode->i_mapping; - struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; - int ret = 0; - - _enter(""); - - list_for_each_entry_safe(netpage, _n, list, lru) { - list_del(&netpage->lru); - - _debug("read back %p{%lu,%d}", - netpage, netpage->index, page_count(netpage)); - - if (!monitor) { - monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); - if (!monitor) - goto nomem; - - monitor->op = fscache_get_retrieval(op); - init_waitqueue_func_entry(&monitor->monitor, - cachefiles_read_waiter); - } - - for (;;) { - backpage = find_get_page(bmapping, netpage->index); - if (backpage) - goto backing_page_already_present; - - if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); - if (!newpage) - goto nomem; - } - - ret = add_to_page_cache(newpage, bmapping, - netpage->index, cachefiles_gfp); - if (ret == 0) - goto installed_new_backing_page; - if (ret != -EEXIST) - goto nomem; - } - - /* we've installed a new backing page, so now we need to add it - * to the LRU list and start it reading */ - installed_new_backing_page: - _debug("- new %p", newpage); - - backpage = newpage; - newpage = NULL; - - lru_cache_add_file(backpage); - - reread_backing_page: - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto read_error; - - /* add the netfs page to the pagecache and LRU, and set the - * monitor to transfer the data across */ - monitor_backing_page: - _debug("- monitor add"); - - ret = add_to_page_cache(netpage, op->mapping, netpage->index, - cachefiles_gfp); - if (ret < 0) { - if (ret == -EEXIST) { - page_cache_release(netpage); - fscache_retrieval_complete(op, 1); - continue; - } - goto nomem; - } - - lru_cache_add_file(netpage); - - /* install a monitor */ - page_cache_get(netpage); - monitor->netfs_page = netpage; - - page_cache_get(backpage); - monitor->back_page = backpage; - monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); - monitor = NULL; - - /* but the page may have been read before the monitor was - * installed, so the monitor may miss the event - so we have to - * ensure that we do get one in such a case */ - if (trylock_page(backpage)) { - _debug("2unlock %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - - page_cache_release(backpage); - backpage = NULL; - - page_cache_release(netpage); - netpage = NULL; - continue; - - /* if the backing page is already present, it can be in one of - * three states: read in progress, read failed or read okay */ - backing_page_already_present: - _debug("- present %p", backpage); - - if (PageError(backpage)) - goto io_error; - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate; - - _debug("- not ready %p{%lx}", backpage, backpage->flags); - 
- if (!trylock_page(backpage)) - goto monitor_backing_page; - - if (PageError(backpage)) { - _debug("error %lx", backpage->flags); - unlock_page(backpage); - goto io_error; - } - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate_unlock; - - /* we've locked a page that's neither up to date nor erroneous, - * so we need to attempt to read it again */ - goto reread_backing_page; - - /* the backing page is already up to date, attach the netfs - * page to the pagecache and LRU and copy the data across */ - backing_page_already_uptodate_unlock: - _debug("uptodate %lx", backpage->flags); - unlock_page(backpage); - backing_page_already_uptodate: - _debug("- uptodate"); - - ret = add_to_page_cache(netpage, op->mapping, netpage->index, - cachefiles_gfp); - if (ret < 0) { - if (ret == -EEXIST) { - page_cache_release(netpage); - fscache_retrieval_complete(op, 1); - continue; - } - goto nomem; - } - - copy_highpage(netpage, backpage); - - page_cache_release(backpage); - backpage = NULL; - - fscache_mark_page_cached(op, netpage); - - lru_cache_add_file(netpage); - - /* the netpage is unlocked and marked up to date here */ - fscache_end_io(op, netpage, 0); - page_cache_release(netpage); - netpage = NULL; - fscache_retrieval_complete(op, 1); - continue; - } - - netpage = NULL; - - _debug("out"); - -out: - /* tidy up */ - if (newpage) - page_cache_release(newpage); - if (netpage) - page_cache_release(netpage); - if (backpage) - page_cache_release(backpage); - if (monitor) { - fscache_put_retrieval(op); - kfree(monitor); - } - - list_for_each_entry_safe(netpage, _n, list, lru) { - list_del(&netpage->lru); - page_cache_release(netpage); - fscache_retrieval_complete(op, 1); - } - - _leave(" = %d", ret); - return ret; - -nomem: - _debug("nomem"); - ret = -ENOMEM; - goto record_page_complete; - -read_error: - _debug("read error %d", ret); - if (ret == -ENOMEM) - goto record_page_complete; -io_error: - cachefiles_io_error_obj(object, "Page read error on backing file"); - ret = -ENOBUFS; -record_page_complete: - fscache_retrieval_complete(op, 1); - goto out; -} - -/* - * read a list of pages from the cache or allocate blocks in which to store - * them - */ -int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct list_head backpages; - struct pagevec pagevec; - struct inode *inode; - struct page *page, *_n; - unsigned shift, nrbackpages; - int ret, ret2, space; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("{OBJ%x,%d},,%d,,", - object->fscache.debug_id, atomic_read(&op->op.usage), - *nr_pages); - - if (!object->backer) - goto all_enobufs; - - space = 1; - if (cachefiles_has_space(cache, 0, *nr_pages) < 0) - space = 0; - - inode = object->backer->d_inode; - ASSERT(S_ISREG(inode->i_mode)); - ASSERT(inode->i_mapping->a_ops->bmap); - ASSERT(inode->i_mapping->a_ops->readpages); - - /* calculate the shift required to use bmap */ - if (inode->i_sb->s_blocksize > PAGE_SIZE) - goto all_enobufs; - - shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - - pagevec_init(&pagevec, 0); - - op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_ASYNC; - op->op.processor = cachefiles_read_copier; - - INIT_LIST_HEAD(&backpages); - nrbackpages = 0; - - ret = space ? 
-ENODATA : -ENOBUFS; - list_for_each_entry_safe(page, _n, pages, lru) { - sector_t block0, block; - - /* we assume the absence or presence of the first block is a - * good enough indication for the page as a whole - * - TODO: don't use bmap() for this as it is _not_ actually - * good enough for this as it doesn't indicate errors, but - * it's all we've got for the moment - */ - block0 = page->index; - block0 <<= shift; - - block = inode->i_mapping->a_ops->bmap(inode->i_mapping, - block0); - _debug("%llx -> %llx", - (unsigned long long) block0, - (unsigned long long) block); - - if (block) { - /* we have data - add it to the list to give to the - * backing fs */ - list_move(&page->lru, &backpages); - (*nr_pages)--; - nrbackpages++; - } else if (space && pagevec_add(&pagevec, page) == 0) { - fscache_mark_pages_cached(op, &pagevec); - fscache_retrieval_complete(op, 1); - ret = -ENODATA; - } else { - fscache_retrieval_complete(op, 1); - } - } - - if (pagevec_count(&pagevec) > 0) - fscache_mark_pages_cached(op, &pagevec); - - if (list_empty(pages)) - ret = 0; - - /* submit the apparently valid pages to the backing fs to be read from - * disk */ - if (nrbackpages > 0) { - ret2 = cachefiles_read_backing_file(object, op, &backpages); - if (ret2 == -ENOMEM || ret2 == -EINTR) - ret = ret2; - } - - _leave(" = %d [nr=%u%s]", - ret, *nr_pages, list_empty(pages) ? " empty" : ""); - return ret; - -all_enobufs: - fscache_retrieval_complete(op, *nr_pages); - return -ENOBUFS; -} - -/* - * allocate a block in the cache in which to store a page - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if no buffers can be made available - * - returns -ENOBUFS if page is beyond EOF - * - otherwise: - * - the metadata will be retained - * - 0 will be returned - */ -int cachefiles_allocate_page(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - int ret; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("%p,{%lx},", object, page->index); - - ret = cachefiles_has_space(cache, 0, 1); - if (ret == 0) - fscache_mark_page_cached(op, page); - else - ret = -ENOBUFS; - - fscache_retrieval_complete(op, 1); - _leave(" = %d", ret); - return ret; -} - -/* - * allocate blocks in the cache in which to store a set of pages - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if some buffers couldn't be made available - * - returns -ENOBUFS if some pages are beyond EOF - * - otherwise: - * - -ENODATA will be returned - * - metadata will be retained for any page marked - */ -int cachefiles_allocate_pages(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct pagevec pagevec; - struct page *page; - int ret; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("%p,,,%d,", object, *nr_pages); - - ret = cachefiles_has_space(cache, 0, *nr_pages); - if (ret == 0) { - pagevec_init(&pagevec, 0); - - list_for_each_entry(page, pages, lru) { - if (pagevec_add(&pagevec, page) == 0) - fscache_mark_pages_cached(op, 
&pagevec); - } - - if (pagevec_count(&pagevec) > 0) - fscache_mark_pages_cached(op, &pagevec); - ret = -ENODATA; - } else { - ret = -ENOBUFS; - } - - fscache_retrieval_complete(op, *nr_pages); - _leave(" = %d", ret); - return ret; -} - -/* - * request a page be stored in the cache - * - cache withdrawal is prevented by the caller - * - this request may be ignored if there's no cache block available, in which - * case -ENOBUFS will be returned - * - if the op is in progress, 0 will be returned - */ -int cachefiles_write_page(struct fscache_storage *op, struct page *page) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - mm_segment_t old_fs; - struct file *file; - struct path path; - loff_t pos, eof; - size_t len; - void *data; - int ret; - - ASSERT(op != NULL); - ASSERT(page != NULL); - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - - _enter("%p,%p{%lx},,,", object, page, page->index); - - if (!object->backer) { - _leave(" = -ENOBUFS"); - return -ENOBUFS; - } - - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); - - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - /* write the page to the backing filesystem and let it store it in its - * own time */ - path.mnt = cache->mnt; - path.dentry = object->backer; - file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - } else { - ret = -EIO; - if (file->f_op->write) { - pos = (loff_t) page->index << PAGE_SHIFT; - - /* we mustn't write more data than we have, so we have - * to beware of a partial page at EOF */ - eof = object->fscache.store_limit_l; - len = PAGE_SIZE; - if (eof & ~PAGE_MASK) { - ASSERTCMP(pos, <, eof); - if (eof - pos < PAGE_SIZE) { - _debug("cut short %llx to %llx", - pos, eof); - len = eof - pos; - ASSERTCMP(pos + len, ==, eof); - } - } - - data = kmap(page); - file_start_write(file); - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = file->f_op->write( - file, (const void __user *) data, len, &pos); - set_fs(old_fs); - kunmap(page); - file_end_write(file); - if (ret != len) - ret = -EIO; - } - fput(file); - } - - if (ret < 0) { - if (ret == -EIO) - cachefiles_io_error_obj( - object, "Write page to backing file failed"); - ret = -ENOBUFS; - } - - _leave(" = %d", ret); - return ret; -} - -/* - * detach a backing block from a page - * - cache withdrawal is prevented by the caller - */ -void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - - object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("%p,{%lu}", object, page->index); - - spin_unlock(&object->fscache.cookie->lock); -} diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index 039b5011d83b..fc6611886b3b 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles security management * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
*/ #include <linux/fs.h> @@ -22,7 +18,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) struct cred *new; int ret; - _enter("{%s}", cache->secctx); + _enter("{%u}", cache->have_secid ? cache->secid : 0); new = prepare_kernel_cred(current); if (!new) { @@ -30,13 +26,11 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) goto error; } - if (cache->secctx) { - ret = set_security_override_from_ctx(new, cache->secctx); + if (cache->have_secid) { + ret = set_security_override(new, cache->secid); if (ret < 0) { put_cred(new); - printk(KERN_ERR "CacheFiles:" - " Security denies permission to nominate" - " security context: error %d\n", + pr_err("Security denies permission to nominate security context: error %d\n", ret); goto error; } @@ -57,18 +51,16 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, { int ret; - ret = security_inode_mkdir(root->d_inode, root, 0); + ret = security_inode_mkdir(d_backing_inode(root), root, 0); if (ret < 0) { - printk(KERN_ERR "CacheFiles:" - " Security denies permission to make dirs: error %d", + pr_err("Security denies permission to make dirs: error %d", ret); return ret; } - ret = security_inode_create(root->d_inode, root, 0); + ret = security_inode_create(d_backing_inode(root), root, 0); if (ret < 0) - printk(KERN_ERR "CacheFiles:" - " Security denies permission to create files: error %d", + pr_err("Security denies permission to create files: error %d", ret); return ret; @@ -99,7 +91,7 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache, /* use the cache root dir's security context as the basis with * which create files */ - ret = set_create_files_as(new, root->d_inode); + ret = set_create_files_as(new, d_backing_inode(root)); if (ret < 0) { abort_creds(new); cachefiles_begin_secure(cache, _saved_cred); diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c new file mode 100644 index 000000000000..90ba926f488e --- /dev/null +++ b/fs/cachefiles/volume.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Volume handling. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include "internal.h" +#include <trace/events/fscache.h> + +/* + * Allocate and set up a volume representation. We make sure all the fanout + * directories are created and pinned. 
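+ * The volume directory is named "I" plus the volume key and holds 256
+ * fan-out subdirectories, "@00" to "@ff".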
+ */ +void cachefiles_acquire_volume(struct fscache_volume *vcookie) +{ + struct cachefiles_volume *volume; + struct cachefiles_cache *cache = vcookie->cache->cache_priv; + const struct cred *saved_cred; + struct dentry *vdentry, *fan; + size_t len; + char *name; + bool is_new = false; + int ret, n_accesses, i; + + _enter(""); + + volume = kzalloc(sizeof(struct cachefiles_volume), GFP_KERNEL); + if (!volume) + return; + volume->vcookie = vcookie; + volume->cache = cache; + INIT_LIST_HEAD(&volume->cache_link); + + cachefiles_begin_secure(cache, &saved_cred); + + len = vcookie->key[0]; + name = kmalloc(len + 3, GFP_NOFS); + if (!name) + goto error_vol; + name[0] = 'I'; + memcpy(name + 1, vcookie->key + 1, len); + name[len + 1] = 0; + +retry: + vdentry = cachefiles_get_directory(cache, cache->store, name, &is_new); + if (IS_ERR(vdentry)) + goto error_name; + volume->dentry = vdentry; + + if (is_new) { + if (!cachefiles_set_volume_xattr(volume)) + goto error_dir; + } else { + ret = cachefiles_check_volume_xattr(volume); + if (ret < 0) { + if (ret != -ESTALE) + goto error_dir; + vdentry = start_removing_dentry(cache->store, vdentry); + if (!IS_ERR(vdentry)) + cachefiles_bury_object(cache, NULL, cache->store, + vdentry, + FSCACHE_VOLUME_IS_WEIRD); + cachefiles_put_directory(volume->dentry); + cond_resched(); + goto retry; + } + } + + for (i = 0; i < 256; i++) { + sprintf(name, "@%02x", i); + fan = cachefiles_get_directory(cache, vdentry, name, NULL); + if (IS_ERR(fan)) + goto error_fan; + volume->fanout[i] = fan; + } + + cachefiles_end_secure(cache, saved_cred); + + vcookie->cache_priv = volume; + n_accesses = atomic_inc_return(&vcookie->n_accesses); /* Stop wakeups on dec-to-0 */ + trace_fscache_access_volume(vcookie->debug_id, 0, + refcount_read(&vcookie->ref), + n_accesses, fscache_access_cache_pin); + + spin_lock(&cache->object_list_lock); + list_add(&volume->cache_link, &volume->cache->volumes); + spin_unlock(&cache->object_list_lock); + + kfree(name); + return; + +error_fan: + for (i = 0; i < 256; i++) + cachefiles_put_directory(volume->fanout[i]); +error_dir: + cachefiles_put_directory(volume->dentry); +error_name: + kfree(name); +error_vol: + kfree(volume); + cachefiles_end_secure(cache, saved_cred); +} + +/* + * Release a volume representation. + */ +static void __cachefiles_free_volume(struct cachefiles_volume *volume) +{ + int i; + + _enter(""); + + volume->vcookie->cache_priv = NULL; + + for (i = 0; i < 256; i++) + cachefiles_put_directory(volume->fanout[i]); + cachefiles_put_directory(volume->dentry); + kfree(volume); +} + +void cachefiles_free_volume(struct fscache_volume *vcookie) +{ + struct cachefiles_volume *volume = vcookie->cache_priv; + + if (volume) { + spin_lock(&volume->cache->object_list_lock); + list_del_init(&volume->cache_link); + spin_unlock(&volume->cache->object_list_lock); + __cachefiles_free_volume(volume); + } +} + +void cachefiles_withdraw_volume(struct cachefiles_volume *volume) +{ + cachefiles_set_volume_xattr(volume); + __cachefiles_free_volume(volume); +} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 2476e5162609..52383b1d0ba6 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles extended attribute management * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include <linux/module.h> @@ -19,272 +15,290 @@ #include <linux/slab.h> #include "internal.h" +#define CACHEFILES_COOKIE_TYPE_DATA 1 + +struct cachefiles_xattr { + __be64 object_size; /* Actual size of the object */ + __be64 zero_point; /* Size after which server has no data not written by us */ + __u8 type; /* Type of object */ + __u8 content; /* Content presence (enum cachefiles_content) */ + __u8 data[]; /* netfs coherency data */ +} __packed; + static const char cachefiles_xattr_cache[] = XATTR_USER_PREFIX "CacheFiles.cache"; +struct cachefiles_vol_xattr { + __be32 reserved; /* Reserved, should be 0 */ + __u8 data[]; /* netfs volume coherency data */ +} __packed; + /* - * check the type label on an object - * - done using xattrs + * set the state xattr on a cache file */ -int cachefiles_check_object_type(struct cachefiles_object *object) +int cachefiles_set_object_xattr(struct cachefiles_object *object) { - struct dentry *dentry = object->dentry; - char type[3], xtype[3]; + struct cachefiles_xattr *buf; + struct dentry *dentry; + struct file *file = object->file; + unsigned int len = object->cookie->aux_len; int ret; - ASSERT(dentry); - ASSERT(dentry->d_inode); + if (!file) + return -ESTALE; + dentry = file->f_path.dentry; - if (!object->fscache.cookie) - strcpy(type, "C3"); - else - snprintf(type, 3, "%02x", object->fscache.cookie->def->type); + _enter("%x,#%d", object->debug_id, len); - _enter("%p{%s}", object, type); + buf = kmalloc(sizeof(struct cachefiles_xattr) + len, GFP_KERNEL); + if (!buf) + return -ENOMEM; - /* attempt to install a type label directly */ - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, - XATTR_CREATE); - if (ret == 0) { - _debug("SET"); /* we succeeded */ - goto error; - } + buf->object_size = cpu_to_be64(object->cookie->object_size); + buf->zero_point = 0; + buf->type = CACHEFILES_COOKIE_TYPE_DATA; + buf->content = object->content_info; + if (test_bit(FSCACHE_COOKIE_LOCAL_WRITE, &object->cookie->flags)) + buf->content = CACHEFILES_CONTENT_DIRTY; + if (len > 0) + memcpy(buf->data, fscache_get_aux(object->cookie), len); - if (ret != -EEXIST) { - kerror("Can't set xattr on %*.*s [%lu] (err %d)", - dentry->d_name.len, dentry->d_name.len, - dentry->d_name.name, dentry->d_inode->i_ino, - -ret); - goto error; + ret = cachefiles_inject_write_error(); + if (ret == 0) { + ret = mnt_want_write_file(file); + if (ret == 0) { + ret = vfs_setxattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache, buf, + sizeof(struct cachefiles_xattr) + len, 0); + mnt_drop_write_file(file); + } } - - /* read the current type label */ - ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); if (ret < 0) { - if (ret == -ERANGE) - goto bad_type_length; - - kerror("Can't read xattr on %*.*s [%lu] (err %d)", - dentry->d_name.len, dentry->d_name.len, - dentry->d_name.name, dentry->d_inode->i_ino, - -ret); - goto error; + trace_cachefiles_vfs_error(object, file_inode(file), ret, + cachefiles_trace_setxattr_error); + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), + buf->content, + cachefiles_coherency_set_fail); + if (ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to set xattr with error %d", ret); + } else 
{ + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), + buf->content, + cachefiles_coherency_set_ok); } - /* check the type is what we're expecting */ - if (ret != 2) - goto bad_type_length; - - if (xtype[0] != type[0] || xtype[1] != type[1]) - goto bad_type; - - ret = 0; - -error: + kfree(buf); _leave(" = %d", ret); return ret; - -bad_type_length: - kerror("Cache object %lu type xattr length incorrect", - dentry->d_inode->i_ino); - ret = -EIO; - goto error; - -bad_type: - xtype[2] = 0; - kerror("Cache object %*.*s [%lu] type %s not %s", - dentry->d_name.len, dentry->d_name.len, - dentry->d_name.name, dentry->d_inode->i_ino, - xtype, type); - ret = -EIO; - goto error; } /* - * set the state xattr on a cache file + * check the consistency between the backing cache and the FS-Cache cookie */ -int cachefiles_set_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file) { - struct dentry *dentry = object->dentry; - int ret; - - ASSERT(dentry); - - _enter("%p,#%d", object, auxdata->len); + struct cachefiles_xattr *buf; + struct dentry *dentry = file->f_path.dentry; + unsigned int len = object->cookie->aux_len, tlen; + const void *p = fscache_get_aux(object->cookie); + enum cachefiles_coherency_trace why; + ssize_t xlen; + int ret = -ESTALE; + + tlen = sizeof(struct cachefiles_xattr) + len; + buf = kmalloc(tlen, GFP_KERNEL); + if (!buf) + return -ENOMEM; - /* attempt to install the cache metadata directly */ - _debug("SET #%u", auxdata->len); + xlen = cachefiles_inject_read_error(); + if (xlen == 0) + xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen); + if (xlen != tlen) { + if (xlen < 0) { + ret = xlen; + trace_cachefiles_vfs_error(object, file_inode(file), xlen, + cachefiles_trace_getxattr_error); + } + if (xlen == -EIO) + cachefiles_io_error_obj( + object, + "Failed to read aux with error %zd", xlen); + why = cachefiles_coherency_check_xattr; + goto out; + } - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_CREATE); - if (ret < 0 && ret != -ENOMEM) - cachefiles_io_error_obj( - object, - "Failed to set xattr with error %d", ret); + if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) { + why = cachefiles_coherency_check_type; + } else if (memcmp(buf->data, p, len) != 0) { + why = cachefiles_coherency_check_aux; + } else if (be64_to_cpu(buf->object_size) != object->cookie->object_size) { + why = cachefiles_coherency_check_objsize; + } else if (buf->content == CACHEFILES_CONTENT_DIRTY) { + // TODO: Begin conflict resolution + pr_warn("Dirty object in cache\n"); + why = cachefiles_coherency_check_dirty; + } else { + why = cachefiles_coherency_check_ok; + ret = 0; + } - _leave(" = %d", ret); +out: + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), + buf->content, why); + kfree(buf); return ret; } /* - * update the state xattr on a cache file + * remove the object's xattr to mark it stale */ -int cachefiles_update_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dentry) { - struct dentry *dentry = object->dentry; int ret; - ASSERT(dentry); - - _enter("%p,#%d", object, auxdata->len); - - /* attempt to install the cache metadata directly */ - _debug("SET #%u", auxdata->len); - - ret = 
vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_REPLACE); - if (ret < 0 && ret != -ENOMEM) - cachefiles_io_error_obj( - object, - "Failed to update xattr with error %d", ret); + ret = cachefiles_inject_remove_error(); + if (ret == 0) { + ret = mnt_want_write(cache->mnt); + if (ret == 0) { + ret = vfs_removexattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache); + mnt_drop_write(cache->mnt); + } + } + if (ret < 0) { + trace_cachefiles_vfs_error(object, d_inode(dentry), ret, + cachefiles_trace_remxattr_error); + if (ret == -ENOENT || ret == -ENODATA) + ret = 0; + else if (ret != -ENOMEM) + cachefiles_io_error(cache, + "Can't remove xattr from %lu" + " (error %d)", + d_backing_inode(dentry)->i_ino, -ret); + } _leave(" = %d", ret); return ret; } /* - * check the state xattr on a cache file - * - return -ESTALE if the object should be deleted + * Stick a marker on the cache object to indicate that it's dirty. */ -int cachefiles_check_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +void cachefiles_prepare_to_write(struct fscache_cookie *cookie) { - struct cachefiles_xattr *auxbuf; - struct dentry *dentry = object->dentry; - int ret; + const struct cred *saved_cred; + struct cachefiles_object *object = cookie->cache_priv; + struct cachefiles_cache *cache = object->volume->cache; - _enter("%p,#%d", object, auxdata->len); + _enter("c=%08x", object->cookie->debug_id); - ASSERT(dentry); - ASSERT(dentry->d_inode); - - auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); - if (!auxbuf) { - _leave(" = -ENOMEM"); - return -ENOMEM; + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_set_object_xattr(object); + cachefiles_end_secure(cache, saved_cred); } +} - /* read the current type label */ - ret = vfs_getxattr(dentry, cachefiles_xattr_cache, - &auxbuf->type, 512 + 1); - if (ret < 0) { - if (ret == -ENODATA) - goto stale; /* no attribute - power went off - * mid-cull? */ - - if (ret == -ERANGE) - goto bad_type_length; - - cachefiles_io_error_obj(object, - "Can't read xattr on %lu (err %d)", - dentry->d_inode->i_ino, -ret); - goto error; - } - - /* check the on-disk object */ - if (ret < 1) - goto bad_type_length; - - if (auxbuf->type != auxdata->type) - goto stale; - - auxbuf->len = ret; - - /* consult the netfs */ - if (object->fscache.cookie->def->check_aux) { - enum fscache_checkaux result; - unsigned int dlen; - - dlen = auxbuf->len - 1; - - _debug("checkaux %s #%u", - object->fscache.cookie->def->name, dlen); - - result = fscache_check_aux(&object->fscache, - &auxbuf->data, dlen); - - switch (result) { - /* entry okay as is */ - case FSCACHE_CHECKAUX_OKAY: - goto okay; - - /* entry requires update */ - case FSCACHE_CHECKAUX_NEEDS_UPDATE: - break; +/* + * Set the state xattr on a volume directory. 
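+ * The blob stored is the volume cookie's coherency data preceded by a
+ * reserved __be32 (see struct cachefiles_vol_xattr above).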
+ */ +bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) +{ + struct cachefiles_vol_xattr *buf; + unsigned int len = volume->vcookie->coherency_len; + const void *p = volume->vcookie->coherency; + struct dentry *dentry = volume->dentry; + int ret; - /* entry requires deletion */ - case FSCACHE_CHECKAUX_OBSOLETE: - goto stale; + _enter("%x,#%d", volume->vcookie->debug_id, len); - default: - BUG(); - } + len += sizeof(*buf); + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + buf->reserved = cpu_to_be32(0); + memcpy(buf->data, p, volume->vcookie->coherency_len); - /* update the current label */ - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_REPLACE); - if (ret < 0) { - cachefiles_io_error_obj(object, - "Can't update xattr on %lu" - " (error %d)", - dentry->d_inode->i_ino, -ret); - goto error; + ret = cachefiles_inject_write_error(); + if (ret == 0) { + ret = mnt_want_write(volume->cache->mnt); + if (ret == 0) { + ret = vfs_setxattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache, + buf, len, 0); + mnt_drop_write(volume->cache->mnt); } } + if (ret < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, + cachefiles_trace_setxattr_error); + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, + cachefiles_coherency_vol_set_fail); + if (ret != -ENOMEM) + cachefiles_io_error( + volume->cache, "Failed to set xattr with error %d", ret); + } else { + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, + cachefiles_coherency_vol_set_ok); + } -okay: - ret = 0; - -error: - kfree(auxbuf); + kfree(buf); _leave(" = %d", ret); - return ret; - -bad_type_length: - kerror("Cache object %lu xattr length incorrect", - dentry->d_inode->i_ino); - ret = -EIO; - goto error; - -stale: - ret = -ESTALE; - goto error; + return ret == 0; } /* - * remove the object's xattr to mark it stale + * Check the consistency between the backing cache and the volume cookie. 
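+ * A mismatch yields -ESTALE, upon which cachefiles_acquire_volume()
+ * buries the stale directory and retries with a fresh one.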
*/ -int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, - struct dentry *dentry) +int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) { - int ret; + struct cachefiles_vol_xattr *buf; + struct dentry *dentry = volume->dentry; + unsigned int len = volume->vcookie->coherency_len; + const void *p = volume->vcookie->coherency; + enum cachefiles_coherency_trace why; + ssize_t xlen; + int ret = -ESTALE; + + _enter(""); + + len += sizeof(*buf); + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; - ret = vfs_removexattr(dentry, cachefiles_xattr_cache); - if (ret < 0) { - if (ret == -ENOENT || ret == -ENODATA) - ret = 0; - else if (ret != -ENOMEM) - cachefiles_io_error(cache, - "Can't remove xattr from %lu" - " (error %d)", - dentry->d_inode->i_ino, -ret); + xlen = cachefiles_inject_read_error(); + if (xlen == 0) + xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len); + if (xlen != len) { + if (xlen < 0) { + ret = xlen; + trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, + cachefiles_trace_getxattr_error); + if (xlen == -EIO) + cachefiles_io_error( + volume->cache, + "Failed to read xattr with error %zd", xlen); + } + why = cachefiles_coherency_vol_check_xattr; + } else if (buf->reserved != cpu_to_be32(0)) { + why = cachefiles_coherency_vol_check_resv; + } else if (memcmp(buf->data, p, len - sizeof(*buf)) != 0) { + why = cachefiles_coherency_vol_check_cmp; + } else { + why = cachefiles_coherency_vol_check_ok; + ret = 0; } + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, why); + kfree(buf); _leave(" = %d", ret); return ret; } |
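For reference, the coherency blob that cachefiles_set_object_xattr() stores under "user.CacheFiles.cache" can be inspected from userspace with getxattr(2). The sketch below is illustrative only and not part of the patch: the on-disk layout is copied from the struct cachefiles_xattr definition in the hunk above, and the packed, big-endian field handling mirrors it.

/* dump-xattr.c -- print the CacheFiles coherency xattr of a cache file.
 * Hypothetical helper; layout taken from struct cachefiles_xattr above.
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/xattr.h>

struct cachefiles_xattr {		/* layout from the patch above */
	uint64_t object_size;		/* __be64: actual size of the object */
	uint64_t zero_point;		/* __be64 */
	uint8_t  type;			/* CACHEFILES_COOKIE_TYPE_DATA == 1 */
	uint8_t  content;		/* enum cachefiles_content */
	uint8_t  data[];		/* netfs coherency data */
} __attribute__((packed));

int main(int argc, char **argv)
{
	unsigned char buf[512];
	ssize_t n;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <cachefile>\n", argv[0]);
		return 2;
	}
	n = getxattr(argv[1], "user.CacheFiles.cache", buf, sizeof(buf));
	if (n < 0) {
		perror("getxattr");
		return 1;
	}
	if ((size_t)n < sizeof(struct cachefiles_xattr)) {
		fprintf(stderr, "xattr too short (%zd bytes)\n", n);
		return 1;
	}
	const struct cachefiles_xattr *x = (const void *)buf;
	printf("object_size=%llu zero_point=%llu type=%u content=%u aux_len=%zu\n",
	       (unsigned long long)be64toh(x->object_size),
	       (unsigned long long)be64toh(x->zero_point),
	       x->type, x->content, (size_t)n - sizeof(*x));
	return 0;
}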

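To make the on-demand protocol added in ondemand.c above concrete, here is a minimal, hypothetical daemon-side skeleton: read one request from the cachefiles device, answer OPEN with a "copen" reply, satisfy READ by writing into the object's anonymous fd, and let the kernel complete CLOSE itself. The structure layouts, the "copen" reply format, and the CACHEFILES_IOC_READ_COMPLETE ioctl are assumptions about include/uapi/linux/cachefiles.h, which this diff does not include; zero-fill stands in for fetching real data, and a single static fd stands in for per-object bookkeeping.

/* ondemand-daemon.c -- hypothetical skeleton of an on-demand cache daemon. */
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

/* Assumed to match include/uapi/linux/cachefiles.h (not shown in this diff). */
struct cachefiles_msg {
	__u32 msg_id;
	__u32 opcode;
	__u32 len;
	__u32 object_id;
	__u8  data[];
};
struct cachefiles_open {
	__u32 volume_key_size;
	__u32 cookie_key_size;
	__u32 fd;
	__u32 flags;
	__u8  data[];		/* volume key, then cookie key (unused here) */
};
struct cachefiles_read {
	__u64 off;
	__u64 len;
};
enum { CACHEFILES_OP_OPEN, CACHEFILES_OP_CLOSE, CACHEFILES_OP_READ };
#define CACHEFILES_IOC_READ_COMPLETE _IOW(0x98, 1, int)	/* assumed */

static int object_fd = -1;	/* anonymous fd from OPEN (single-object demo) */

static int handle_one_request(int devfd)
{
	char buf[16384], reply[64];
	struct cachefiles_msg *msg = (struct cachefiles_msg *)buf;
	/* One read() returns exactly one request, per
	 * cachefiles_ondemand_daemon_read() above. */
	ssize_t n = read(devfd, buf, sizeof(buf));

	if (n <= 0)
		return -1;

	switch (msg->opcode) {
	case CACHEFILES_OP_OPEN: {
		struct cachefiles_open *load = (struct cachefiles_open *)msg->data;

		object_fd = load->fd;
		/* Acknowledge with the object size (0 is a placeholder here);
		 * a negative size would report an error back to the kernel. */
		n = snprintf(reply, sizeof(reply), "copen %u,%d", msg->msg_id, 0);
		return write(devfd, reply, n) == n ? 0 : -1;
	}
	case CACHEFILES_OP_READ: {
		struct cachefiles_read *rd = (struct cachefiles_read *)msg->data;
		char zeros[4096] = {0};
		__u64 done = 0;

		/* A real daemon would fetch [off, off + len) from its data
		 * source; zero-fill stands in for that here. */
		while (done < rd->len) {
			size_t c = rd->len - done < sizeof(zeros)
				 ? rd->len - done : sizeof(zeros);
			ssize_t w = pwrite(object_fd, zeros, c, rd->off + done);
			if (w <= 0)
				return -1;
			done += w;
		}
		/* Wake the waiter in cachefiles_ondemand_read(). */
		return ioctl(object_fd, CACHEFILES_IOC_READ_COMPLETE, msg->msg_id);
	}
	case CACHEFILES_OP_CLOSE:	/* no reply: the kernel completes it */
		if (object_fd >= 0)
			close(object_fd);
		object_fd = -1;
		return 0;
	}
	return -1;
}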